PROG= hammer2
-SRCS= main.c subs.c
-SRCS+= cmd_remote.c cmd_snapshot.c cmd_pfs.c cmd_helper.c
+SRCS= main.c subs.c icrc.c msg.c
+SRCS+= cmd_remote.c cmd_snapshot.c cmd_pfs.c
+SRCS+= cmd_node.c cmd_leaf.c cmd_debug.c
#MAN= hammer2.8
NOMAN= TRUE
--- /dev/null
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+static void debug_recv(hammer2_iocom_t *iocom);
+static void debug_send(hammer2_iocom_t *iocom);
+static void debug_tty(hammer2_iocom_t *iocom);
+static void hammer2_debug_parse(hammer2_iocom_t *iocom,
+ hammer2_msg_t *msg, char *cmdbuf);
+
+/*
+ * Connect to the master node daemon on HAMMER2_LISTEN_PORT and run an
+ * interactive debug-shell session: lines read from stdin are sent as
+ * HAMMER2_DBG_SHELL messages and replies are copied to stdout.
+ *
+ * Returns 0 when the session ran and ended, 1 if the socket could not
+ * be created or the connection could not be established.
+ */
+int
+cmd_debug(void)
+{
+	struct sockaddr_in lsin;
+	struct hammer2_iocom iocom;
+	hammer2_msg_t *msg;
+	int saved_errno;
+	int fd;
+
+	/*
+	 * Acquire socket
+	 */
+	if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+		fprintf(stderr, "cmd_debug: socket(): %s\n",
+			strerror(errno));
+		return 1;
+	}
+
+	/*
+	 * Connect to the target.  The address is left as 0 (INADDR_ANY).
+	 */
+	bzero(&lsin, sizeof(lsin));
+	lsin.sin_family = AF_INET;
+	lsin.sin_addr.s_addr = 0;
+	lsin.sin_port = htons(HAMMER2_LISTEN_PORT);
+	if (connect(fd, (struct sockaddr *)&lsin, sizeof(lsin)) < 0) {
+		/*
+		 * Save errno before close() gets a chance to overwrite it,
+		 * and report the failure to the caller via the exit code.
+		 */
+		saved_errno = errno;
+		close(fd);
+		fprintf(stderr, "debug: connect failed: %s\n",
+			strerror(saved_errno));
+		return 1;
+	}
+
+	/*
+	 * Run the session.  The remote end transmits our prompt.
+	 * Descriptor 0 (stdin) is registered as the alternate descriptor
+	 * serviced by debug_tty().
+	 */
+	hammer2_iocom_init(&iocom, fd, 0);
+	printf("debug: connected\n");
+
+	/*
+	 * Kick things off with an empty shell command.
+	 */
+	msg = hammer2_iocom_allocmsg(&iocom, HAMMER2_DBG_SHELL, 0);
+	hammer2_ioq_write(&iocom, msg);
+
+	hammer2_iocom_core(&iocom, debug_recv, debug_send, debug_tty);
+	fprintf(stderr, "debug: disconnected\n");
+	close(fd);
+	return 0;
+}
+
+/*
+ * Callback from hammer2_iocom_core() when messages might be present
+ * on the socket.
+ *
+ * Reads messages until hammer2_ioq_read() returns NULL or the iocom is
+ * flagged EOF.  Each message returned is disposed of here: it is either
+ * freed or handed to hammer2_ioq_reply_term().
+ */
+static
+void
+debug_recv(hammer2_iocom_t *iocom)
+{
+	hammer2_msg_t *msg;
+
+	while ((iocom->flags & HAMMER2_IOCOMF_EOF) == 0 &&
+	       (msg = hammer2_ioq_read(iocom)) != NULL) {
+		switch(msg->any.head.cmd & HAMMER2_MSGF_CMDSWMASK) {
+		case HAMMER2_LNK_ERROR:
+			fprintf(stderr, "Link Error: %d\n",
+				msg->any.head.error);
+			/*
+			 * Release the message like the other cases do so
+			 * it is not leaked.
+			 */
+			hammer2_iocom_freemsg(iocom, msg);
+			break;
+		case HAMMER2_DBG_SHELL:
+			/*
+			 * We send the commands, not accept them.
+			 */
+			hammer2_iocom_freemsg(iocom, msg);
+			break;
+		case HAMMER2_DBG_SHELL | HAMMER2_MSGF_REPLY:
+			/*
+			 * A reply from the remote is data we copy to stdout.
+			 * The last aux byte is forced to NUL so strlen()
+			 * cannot run past the buffer.
+			 */
+			if (msg->aux_size) {
+				msg->aux_data[msg->aux_size - 1] = 0;
+				write(1, msg->aux_data, strlen(msg->aux_data));
+			}
+			hammer2_iocom_freemsg(iocom, msg);
+			break;
+		default:
+			/*
+			 * Unknown command.  NOTE(review): msg is presumably
+			 * consumed by hammer2_ioq_reply_term() -- confirm.
+			 */
+			assert((msg->any.head.cmd & HAMMER2_MSGF_REPLY) == 0);
+			fprintf(stderr, "Unknown message: %08x\n",
+				msg->any.head.cmd);
+			hammer2_ioq_reply_term(iocom, msg,
+					       HAMMER2_MSG_ERR_UNKNOWN);
+			break;
+		}
+	}
+	if (iocom->ioq_rx.error) {
+		fprintf(stderr, "debug_recv: comm error %d\n",
+			iocom->ioq_rx.error);
+	}
+}
+
+/*
+ * Transmit-side callback handed to hammer2_iocom_core(); invoked when
+ * the socket might accept more output.  Calls hammer2_ioq_write() with
+ * no new message (presumably this just pushes out already-queued data).
+ */
+static void
+debug_send(hammer2_iocom_t *iocom)
+{
+	hammer2_ioq_write(iocom, NULL);
+}
+
+/*
+ * Alternate-descriptor callback handed to hammer2_iocom_core() by
+ * cmd_debug().  Reads one line from stdin, strips the trailing newline
+ * and sends it (including its terminating NUL) as the aux data of a
+ * HAMMER2_DBG_SHELL message.  On EOF or read error the iocom is flagged
+ * EOF.
+ *
+ * Input is read in chunks of at most sizeof(buf) - 1 characters, so a
+ * longer line is sent as several messages.
+ */
+static
+void
+debug_tty(hammer2_iocom_t *iocom)
+{
+	hammer2_msg_t *msg;
+	char buf[256];
+	size_t len;
+
+	if (fgets(buf, sizeof(buf), stdin) != NULL) {
+		len = strlen(buf);
+		if (len && buf[len - 1] == '\n')
+			buf[--len] = 0;
+		++len;		/* include the terminating NUL */
+		msg = hammer2_iocom_allocmsg(iocom, HAMMER2_DBG_SHELL, len);
+
+		/*
+		 * The aux buffer is rounded up to the message alignment,
+		 * so clear it first; otherwise the padding after the
+		 * string is uninitialized heap (presumably transmitted
+		 * along with the message).
+		 */
+		bzero(msg->aux_data, msg->aux_size);
+		bcopy(buf, msg->aux_data, len);
+		hammer2_ioq_write(iocom, msg);
+	} else {
+		/*
+		 * Set EOF flag without setting any error code for normal
+		 * EOF.
+		 */
+		iocom->flags |= HAMMER2_IOCOMF_EOF;
+	}
+}
+
+/*
+ * Handle a HAMMER2_DBG_SHELL message received by the master node.
+ *
+ * Any aux data is first NUL-terminated in place.  A reply is written
+ * verbatim to descriptor 2 (no newline is appended).  A command is
+ * handed to hammer2_debug_parse() and followed by a fresh "debug> "
+ * prompt.  The message is freed in both cases.
+ */
+void
+hammer2_debug_remote(hammer2_iocom_t *iocom, hammer2_msg_t *msg)
+{
+	char *text = msg->aux_data;
+	int is_reply = (msg->any.head.cmd & HAMMER2_MSGF_REPLY) != 0;
+
+	if (text != NULL)
+		text[msg->aux_size - 1] = 0;
+
+	if (!is_reply) {
+		/*
+		 * A command: process it, then emit the next prompt.
+		 */
+		hammer2_debug_parse(iocom, msg, text);
+		iocom_printf(iocom, msg, "debug> ");
+	} else if (text != NULL) {
+		/*
+		 * A reply: just print out the string.
+		 */
+		write(2, text, strlen(text));
+	}
+	hammer2_iocom_freemsg(iocom, msg);
+}
+
+/*
+ * Interpret one debug-shell command line.  The first blank- or
+ * tab-delimited word selects the command; an empty line is ignored.
+ * Output goes back to the sender via iocom_printf().
+ */
+static void
+hammer2_debug_parse(hammer2_iocom_t *iocom, hammer2_msg_t *msg, char *cmdbuf)
+{
+	char *word;
+
+	word = strsep(&cmdbuf, " \t");
+	if (word == NULL || word[0] == '\0')
+		return;
+
+	if (strcmp(word, "help") != 0 && strcmp(word, "?") != 0) {
+		iocom_printf(iocom, msg, "Unrecognized command: %s\n", word);
+		return;
+	}
+	iocom_printf(iocom, msg,
+		     "help Command help\n"
+	);
+}
+
+/*
+ * printf()-style reply helper.  Formats the arguments into a local
+ * buffer (output longer than the buffer is truncated by vsnprintf()),
+ * copies the NUL-terminated text into the aux data of a new
+ * HAMMER2_DBG_SHELL message, copies the header of the message being
+ * answered into it with aux_icrc cleared, and passes the result to
+ * hammer2_ioq_reply().
+ */
+void
+iocom_printf(hammer2_iocom_t *iocom, hammer2_msg_t *msg, const char *ctl, ...)
+{
+	hammer2_msg_t *rmsg;
+	va_list va;
+	char buf[1024];
+	size_t len;
+
+	va_start(va, ctl);
+	vsnprintf(buf, sizeof(buf), ctl, va);
+	va_end(va);
+	len = strlen(buf) + 1;
+
+	rmsg = hammer2_iocom_allocmsg(iocom, HAMMER2_DBG_SHELL, len);
+
+	/*
+	 * The aux buffer is rounded up to the message alignment, so clear
+	 * it first; otherwise the padding after the string is
+	 * uninitialized heap (presumably transmitted with the reply).
+	 */
+	bzero(rmsg->aux_data, rmsg->aux_size);
+	bcopy(buf, rmsg->aux_data, len);
+	rmsg->any.head = msg->any.head;
+	rmsg->any.head.aux_icrc = 0;
+
+	hammer2_ioq_reply(iocom, rmsg);
+}
--- /dev/null
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * Start-up the leaf daemon for a PFS on this machine.
+ *
+ * One leaf daemon is run for each mounted PFS. The daemon may multi-thread
+ * to improve performance if desired. The daemon performs the following
+ * functions:
+ *
+ * (1) Makes and maintains connections to all cluster nodes found for
+ * the PFS, retrieved from the REMOTE configuration stored in
+ * the HAMMER2 mount. A localhost connection is always implied
+ * (using the backbone), but also having more direct connections
+ * can result in higher performance.
+ *
+ * This also includes any required encryption or authentication.
+ *
+ * (2) Runs the spanning tree protocol as a leaf, meaning that
+ * the leaf daemon does not serve as a relay and the individual
+ * connections made in (1) do not cross-connect.
+ *
+ * (3) Obtains the PFS's registration and makes it available to the
+ * cluster via the spanning tree protocol.
+ *
+ * (4) Creates a communications pipe to the HAMMER2 VFS in the kernel
+ * (installed via ioctl()) which the HAMMER2 VFS uses to accept and
+ * communicate high-level requests.
+ *
+ * (5) Performs all complex high-level messaging protocol operations,
+ * such as quorum operations, maintains persistent cache state,
+ * and so on and so forth.
+ *
+ * As you may have noted, the leaf daemon serves as an intermediary between
+ * the kernel and the rest of the cluster. The kernel will issue high level
+ * protocol commands to the leaf which performs the protocol and sends a
+ * response. The kernel does NOT have to deal with the quorum or other
+ * complex maintenance.
+ *
+ * Basically the kernel is simply another client from the point of view
+ * of the high-level protocols, requesting cache state locks and such from
+ * the leaf (in a degenerate situation one master lock is all that is needed).
+ * If the kernel PFS has local media storage that storage can be used for
+ * numerous purposes, such as caching, and in the degenerate non-clustered
+ * case simply represents the one-and-only master copy of the filesystem.
+ */
+int
+cmd_leaf(const char *sel_info)
+{
+	int fd;
+
+	/*
+	 * Get an ioctl descriptor for the selected PFS; give up with a
+	 * non-zero exit code if that fails.
+	 */
+	fd = hammer2_ioctl_handle(sel_info);
+	if (fd < 0)
+		return 1;
+
+	/*
+	 * The daemon that will interconnect the in-kernel HAMMER2 PFS with
+	 * the master-node daemon is not started yet.  The intended call is
+	 * kept here, disabled:
+	 */
+/*	hammer2_demon(helper_pfs_interlink, (void *)(intptr_t)fd);*/
+
+	/*
+	 * The descriptor is only closed when NormalExit is set.
+	 */
+	if (NormalExit != 0)
+		close(fd);
+
+	return 0;
+}
+
+#if 0
+/*
+ * Disabled stub: LEAF interconnect between a PFS and the messaging core.
+ *
+ * The plan is to open a socket to the messaging core, register the PFS
+ * with it, and then hand the messaging descriptor to the kernel, which
+ * runs the interconnect until the filesystem is unmounted or the
+ * descriptor is lost or explicitly terminated via a hammer2 command.
+ *
+ * Because this is essentially a localhost connection no encryption is
+ * needed here; any encryption is the messaging core's job.
+ *
+ * For now the function only decodes the descriptor from its argument.
+ */
+static void *
+leaf_connect(void *data)
+{
+	int fd = (int)(intptr_t)data;
+
+	return (NULL);
+}
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+static void *node_master_accept(void *data);
+static void *node_master_service(void *data);
+static void node_master_recv(hammer2_iocom_t *iocom);
+static void node_master_send(hammer2_iocom_t *iocom);
+
+/*
+ * Start-up the master listener daemon for the machine.
+ *
+ * The master listener serves as a rendezvous point in the cluster, accepting
+ * connections, performing registrations and authentications, maintaining
+ * the spanning tree, and keeping track of message state so disconnects can
+ * be handled properly.
+ *
+ * Once authenticated only low-level messaging protocols (which includes
+ * tracking persistent messages) are handled by this daemon. This daemon
+ * does not run the higher level quorum or locking protocols.
+ *
+ * This daemon can also be told to maintain connections to other nodes,
+ * forming a messaging backbone, which in turn allows PFS's to simply
+ * connect to the master daemon via localhost if desired.
+ * Backbones are specified via /etc/hammer2.conf.
+ */
+/*
+ * Create the master listen socket on HAMMER2_LISTEN_PORT and hand it to
+ * the node_master_accept() thread via hammer2_demon().
+ *
+ * Returns 1 if the socket cannot be created or put into the listening
+ * state, 0 otherwise (including the case where bind() fails, which is
+ * taken to mean a master daemon is already running).
+ */
+int
+cmd_node(void)
+{
+	struct sockaddr_in lsin;
+	int saved_errno;
+	int on;
+	int lfd;
+
+	/*
+	 * Acquire socket and set options
+	 */
+	if ((lfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+		fprintf(stderr, "node_master_listen: socket(): %s\n",
+			strerror(errno));
+		return 1;
+	}
+	on = 1;
+	setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
+
+	/*
+	 * Setup listen port and try to bind.  If the bind fails we assume
+	 * that a master listener process is already running and return
+	 * success.
+	 */
+	bzero(&lsin, sizeof(lsin));
+	lsin.sin_family = AF_INET;
+	lsin.sin_addr.s_addr = INADDR_ANY;
+	lsin.sin_port = htons(HAMMER2_LISTEN_PORT);
+	if (bind(lfd, (struct sockaddr *)&lsin, sizeof(lsin)) < 0) {
+		close(lfd);
+		fprintf(stderr, "master listen: daemon already running\n");
+		return 0;
+	}
+
+	/*
+	 * Do not start the accept thread on a socket that never entered
+	 * the listening state.
+	 */
+	if (listen(lfd, 50) < 0) {
+		saved_errno = errno;
+		close(lfd);
+		fprintf(stderr, "master listen: listen(): %s\n",
+			strerror(saved_errno));
+		return 1;
+	}
+	fprintf(stderr, "master listen: startup\n");
+
+	/*
+	 * Fork and disconnect the controlling terminal and parent process,
+	 * executing the specified function as a pthread.
+	 *
+	 * Returns to the original process which can then continue running.
+	 * In debug mode this call will create the pthread without forking
+	 * and set NormalExit to 0, instead of fork.
+	 */
+	hammer2_demon(node_master_accept, (void *)(intptr_t)lfd);
+	if (NormalExit)
+		close(lfd);
+	return 0;
+}
+
+/*
+ * Master listen/accept thread.  Accept connections on the master socket,
+ * starting a detached pthread running node_master_service() for each
+ * one.  The listen descriptor is passed in via the data argument.
+ *
+ * The loop ends (and the thread returns NULL) when accept() fails with
+ * anything other than EINTR.
+ */
+static
+void *
+node_master_accept(void *data)
+{
+	struct sockaddr_in asin;
+	socklen_t alen;
+	pthread_t thread;
+	int lfd = (int)(intptr_t)data;
+	int error;
+	int fd;
+
+	/*
+	 * Nobody waits for us
+	 */
+	setproctitle("hammer2 master listen");
+	pthread_detach(pthread_self());
+
+	/*
+	 * Accept connections and create pthreads to handle them.
+	 * No validation of the peer address is done here.
+	 */
+	for (;;) {
+		alen = sizeof(asin);
+		fd = accept(lfd, (struct sockaddr *)&asin, &alen);
+		if (fd < 0) {
+			if (errno == EINTR)
+				continue;
+			break;
+		}
+		fprintf(stderr, "node_master_accept: accept fd %d\n", fd);
+		error = pthread_create(&thread, NULL,
+				       node_master_service,
+				       (void *)(intptr_t)fd);
+		if (error) {
+			/*
+			 * No thread took ownership of the connection, so
+			 * close it here rather than leaking the descriptor.
+			 */
+			fprintf(stderr,
+				"node_master_accept: pthread_create: %s\n",
+				strerror(error));
+			close(fd);
+			continue;
+		}
+
+		/*
+		 * Nobody joins the service threads either; detach them so
+		 * their resources are released when they return.
+		 */
+		pthread_detach(thread);
+	}
+	return (NULL);
+}
+
+/*
+ * Service an accepted connection (runs as a pthread).
+ *
+ * The connected descriptor is passed in via the data argument.  An
+ * iocom with no alternate descriptor is set up on it and run through
+ * hammer2_iocom_core() with the node_master_recv()/node_master_send()
+ * callbacks.  When the core returns, the rx/tx error codes are logged,
+ * the iocom's queued and cached messages are released and the
+ * descriptor is closed.
+ */
+static
+void *
+node_master_service(void *data)
+{
+	hammer2_iocom_t iocom;
+	int fd;
+
+	fd = (int)(intptr_t)data;
+	hammer2_iocom_init(&iocom, fd, -1);
+	hammer2_iocom_core(&iocom, node_master_recv, node_master_send, NULL);
+
+	fprintf(stderr,
+		"iocom on fd %d terminated error rx=%d, tx=%d\n",
+		fd, iocom.ioq_rx.error, iocom.ioq_tx.error);
+
+	/*
+	 * The iocom lives on this thread's stack; tear it down so the
+	 * messages it still holds are not leaked for every connection.
+	 */
+	hammer2_iocom_done(&iocom);
+	close(fd);
+
+	return (NULL);
+}
+
+/*
+ * Callback from hammer2_iocom_core() when messages might be present
+ * on the socket.
+ *
+ * Reads messages until hammer2_ioq_read() returns NULL or the iocom is
+ * flagged EOF, logging each one.  Debug-shell messages go to
+ * hammer2_debug_remote(), link errors are dropped, and anything else is
+ * answered with HAMMER2_MSG_ERR_UNKNOWN.
+ */
+static
+void
+node_master_recv(hammer2_iocom_t *iocom)
+{
+	hammer2_msg_t *msg;
+
+	while ((iocom->flags & HAMMER2_IOCOMF_EOF) == 0 &&
+	       (msg = hammer2_ioq_read(iocom)) != NULL) {
+		fprintf(stderr, "MSG RECEIVED: %08x error %d\n",
+			msg->any.head.cmd, msg->any.head.error);
+		switch(msg->any.head.cmd & HAMMER2_MSGF_CMDSWMASK) {
+		case HAMMER2_LNK_ERROR:
+			/*
+			 * Nothing to do beyond the log line above, but the
+			 * message must still be released or it is leaked.
+			 */
+			hammer2_iocom_freemsg(iocom, msg);
+			break;
+		case HAMMER2_DBG_SHELL:
+		case HAMMER2_DBG_SHELL | HAMMER2_MSGF_REPLY:
+			/*
+			 * hammer2_debug_remote() frees the message.
+			 */
+			hammer2_debug_remote(iocom, msg);
+			break;
+		default:
+			/*
+			 * NOTE(review): msg is presumably consumed by
+			 * hammer2_ioq_reply_term() -- confirm.
+			 */
+			hammer2_ioq_reply_term(iocom, msg,
+					       HAMMER2_MSG_ERR_UNKNOWN);
+			break;
+		}
+	}
+	if (iocom->ioq_rx.error) {
+		fprintf(stderr,
+			"node_master_recv: comm error %d\n",
+			iocom->ioq_rx.error);
+	}
+}
+
+/*
+ * Transmit-side callback handed to hammer2_iocom_core(); invoked when
+ * the socket might accept more output.  Calls hammer2_ioq_write() with
+ * no new message (presumably this just pushes out already-queued data).
+ */
+static void
+node_master_send(hammer2_iocom_t *iocom)
+{
+	hammer2_ioq_write(iocom, NULL);
+}
* Rollup headers for hammer2 utility
*/
#include <sys/types.h>
+#include <sys/uio.h>
#include <sys/mount.h>
#include <sys/file.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/tty.h>
+#include <sys/endian.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <vfs/hammer2/hammer2_disk.h>
#include <vfs/hammer2/hammer2_mount.h>
#include <vfs/hammer2/hammer2_ioctl.h>
+#include <vfs/hammer2/hammer2_network.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stddef.h>
-#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <string.h>
#include <unistd.h>
-#include <pthread.h>
+#include <ctype.h>
#include <uuid.h>
+#include <assert.h>
+#include <pthread.h>
+#include <poll.h>
+
+#include "network.h"
extern int DebugOpt;
extern int NormalExit;
int hammer2_ioctl_handle(const char *sel_path);
-void hammer2_disconnect(void *(*func)(void *), void *arg);
+void hammer2_demon(void *(*func)(void *), void *arg);
+void hammer2_bswap_head(hammer2_msg_hdr_t *head);
int cmd_remote_connect(const char *sel_path, const char *url);
int cmd_remote_disconnect(const char *sel_path, const char *url);
uint8_t pfs_type, const char *uuid_str);
int cmd_pfs_delete(const char *sel_path, const char *name);
-int cmd_helper(const char *sel_path);
+int cmd_node(void);
+int cmd_leaf(const char *sel_path);
+int cmd_debug(void);
+
+void hammer2_ioq_init(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq);
+void hammer2_ioq_done(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq);
+void hammer2_iocom_init(hammer2_iocom_t *iocom, int sock_fd, int alt_fd);
+void hammer2_iocom_done(hammer2_iocom_t *iocom);
+hammer2_msg_t *hammer2_iocom_allocmsg(hammer2_iocom_t *iocom,
+ uint32_t cmd, int aux_size);
+void hammer2_iocom_reallocmsg(hammer2_iocom_t *iocom, hammer2_msg_t *msg,
+ int aux_size);
+void hammer2_iocom_freemsg(hammer2_iocom_t *iocom, hammer2_msg_t *msg);
+
+void hammer2_iocom_core(hammer2_iocom_t *iocom,
+ void (*iocom_recvmsg)(hammer2_iocom_t *),
+ void (*iocom_sendmsg)(hammer2_iocom_t *),
+ void (*iocom_altmsg)(hammer2_iocom_t *));
+hammer2_msg_t *hammer2_ioq_read(hammer2_iocom_t *iocon);
+void hammer2_ioq_write(hammer2_iocom_t *iocon, hammer2_msg_t *msg);
+void hammer2_ioq_reply(hammer2_iocom_t *iocom, hammer2_msg_t *msg);
+void hammer2_ioq_reply_term(hammer2_iocom_t *iocom, hammer2_msg_t *msg,
+ uint16_t error);
+void hammer2_ioq_write_drain(hammer2_iocom_t *iocon);
+
+void hammer2_debug_remote(hammer2_iocom_t *iocom, hammer2_msg_t *msg);
+void iocom_printf(hammer2_iocom_t *iocom, hammer2_msg_t *msg,
+ const char *ctl, ...);
--- /dev/null
+/*-
+ * Copyright (c) 2005-2010 Daniel Braniss <danny@cs.huji.ac.il>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/*
+ | iSCSI
+ | $Id: isc_subr.c 560 2009-05-07 07:37:49Z danny $
+ */
+
+#include <sys/types.h>
+#include <sys/uuid.h>
+
+#include <vfs/hammer2/hammer2_disk.h>
+
+/*****************************************************************/
+/* */
+/* CRC LOOKUP TABLE */
+/* ================ */
+/* The following CRC lookup table was generated automagically */
+/* by the Rocksoft^tm Model CRC Algorithm Table Generation */
+/* Program V1.0 using the following model parameters: */
+/* */
+/* Width : 4 bytes. */
+/* Poly : 0x1EDC6F41L */
+/* Reverse : TRUE. */
+/* */
+/* For more information on the Rocksoft^tm Model CRC Algorithm, */
+/* see the document titled "A Painless Guide to CRC Error */
+/* Detection Algorithms" by Ross Williams */
+/* (ross@guest.adelaide.edu.au.). This document is likely to be */
+/* in the FTP archive "ftp.adelaide.edu.au/pub/rocksoft". */
+/* */
+/*****************************************************************/
+
+/*
+ * Byte-indexed lookup table used by hammer2_icrc32() and
+ * hammer2_icrc32c().  It is never modified, so it is declared const.
+ */
+static const uint32_t crc32Table[256] = {
+	0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
+	0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
+	0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
+	0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
+	0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
+	0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
+	0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
+	0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
+	0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
+	0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
+	0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
+	0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
+	0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
+	0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
+	0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
+	0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
+	0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
+	0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
+	0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
+	0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
+	0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
+	0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
+	0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
+	0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
+	0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
+	0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
+	0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
+	0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
+	0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
+	0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
+	0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
+	0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
+	0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
+	0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
+	0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
+	0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
+	0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
+	0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
+	0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
+	0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
+	0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
+	0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
+	0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
+	0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
+	0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
+	0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
+	0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
+	0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
+	0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
+	0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
+	0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
+	0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
+	0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
+	0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
+	0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
+	0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
+	0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
+	0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
+	0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
+	0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
+	0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
+	0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL,
+	0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
+	0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
+};
+
+/*
+ * Compute the table-driven CRC of size bytes at buf, starting from a
+ * zero seed.  The running value is inverted before and after the
+ * per-byte table walk.
+ */
+uint32_t
+hammer2_icrc32(const void *buf, size_t size)
+{
+	const uint8_t *bytes = buf;
+	uint32_t acc = 0xffffffff;	/* zero seed, inverted */
+	size_t i;
+
+	for (i = 0; i < size; ++i)
+		acc = crc32Table[(acc ^ bytes[i]) & 0xff] ^ (acc >> 8);
+	return acc ^ 0xffffffff;
+}
+
+/*
+ * Same computation as hammer2_icrc32() but seeded with a caller-supplied
+ * crc value, which is inverted before and after the per-byte table walk.
+ */
+uint32_t
+hammer2_icrc32c(const void *buf, size_t size, uint32_t crc)
+{
+	const uint8_t *cur = buf;
+	const uint8_t *end = cur + size;
+	uint32_t acc = crc ^ 0xffffffff;
+
+	for (; cur != end; ++cur)
+		acc = crc32Table[(acc ^ *cur) & 0xff] ^ (acc >> 8);
+	return acc ^ 0xffffffff;
+}
/*
* Core options
*/
- while ((ch = getopt(ac, av, "aqs:t:u:")) != -1) {
+ while ((ch = getopt(ac, av, "adqs:t:u:")) != -1) {
switch(ch) {
case 'a':
all_opt = 1;
* Create snapshot with optional pfs_type and optional
* label override.
*/
- } else if (strcmp(av[0], "helper") == 0) {
+ } else if (strcmp(av[0], "node") == 0) {
/*
- * Typically run as a daemon, this multi-threaded helper
- * subsystem manages socket communications for the
- * filesystem.
+ * Start the master node daemon. This daemon accepts
+ * connections from local and remote clients, implements
+ * and maintains the spanning tree protocol, and manages
+ * the core messaging protocol.
*/
- ecode = cmd_helper(sel_path);
+ ecode = cmd_node();
+ } else if (strcmp(av[0], "leaf") == 0) {
+ /*
+ * Start the management daemon for a specific PFS.
+ *
+ * This will typically connect to the local master node
+ * daemon, register the PFS, and then pass its side of
+ * the socket descriptor to the kernel HAMMER2 VFS via an
+ * ioctl(). The process and/or thread context remains in the
+ * kernel until the PFS is unmounted or the connection is
+ * lost, then returns from the ioctl.
+ *
+ * It is possible to connect directly to a remote master node
+ * instead of the local master node in situations where
+ * encryption is not desired or no local master node is
+ * desired. This is not recommended because it represents
+ * a single point of failure for the PFS's communications.
+ *
+ * Direct kernel<->kernel communication between HAMMER2 VFSs
+ * is theoretically possible for directly-connected
+ * registrations (i.e. where the spanning tree is degenerate),
+ * but not recommended. We specifically try to reduce the
+ * complexity of the HAMMER2 VFS kernel code.
+ */
+ ecode = cmd_leaf(sel_path);
+ } else if (strcmp(av[0], "debug") == 0) {
+ /*
+ * Connect to the command line monitor in the hammer2 master
+ * node for the machine using HAMMER2_DBG_SHELL messages.
+ */
+ ecode = cmd_debug();
} else {
fprintf(stderr, "Unrecognized command: %s\n", av[0]);
usage(1);
--- /dev/null
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * Reset a low-level ioq: clear the whole structure, set up its empty
+ * message queue and place it in the HEADER1 state.  The iocom argument
+ * is unused.
+ */
+void
+hammer2_ioq_init(hammer2_iocom_t *iocom __unused, hammer2_ioq_t *ioq)
+{
+	bzero(ioq, sizeof(*ioq));
+	TAILQ_INIT(&ioq->msgq);
+	ioq->state = HAMMER2_MSGQ_STATE_HEADER1;
+}
+
+/*
+ * Tear down a low-level ioq: every message still on its queue, plus the
+ * message held in ioq->msg (if any), is handed to
+ * hammer2_iocom_freemsg().
+ */
+void
+hammer2_ioq_done(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq)
+{
+	hammer2_msg_t *msg;
+
+	for (;;) {
+		msg = TAILQ_FIRST(&ioq->msgq);
+		if (msg == NULL)
+			break;
+		TAILQ_REMOVE(&ioq->msgq, msg, entry);
+		hammer2_iocom_freemsg(iocom, msg);
+	}
+
+	msg = ioq->msg;
+	if (msg != NULL) {
+		ioq->msg = NULL;
+		hammer2_iocom_freemsg(iocom, msg);
+	}
+}
+
+/*
+ * Initialize a low-level communications channel.
+ *
+ * The iocom is cleared, its free lists and rx/tx queues are set up, and
+ * the initial flags are RREQ | WIDLE.  sock_fd is the messaging socket
+ * and alt_fd an optional alternate descriptor; callers in this utility
+ * pass -1 for "none".  The socket is switched to non-blocking mode;
+ * the alternate descriptor is left as-is.
+ */
+void
+hammer2_iocom_init(hammer2_iocom_t *iocom, int sock_fd, int alt_fd)
+{
+	int fl;
+
+	bzero(iocom, sizeof(*iocom));
+
+	TAILQ_INIT(&iocom->freeq);
+	TAILQ_INIT(&iocom->freeq_aux);
+	iocom->sock_fd = sock_fd;
+	iocom->alt_fd = alt_fd;
+	iocom->flags = HAMMER2_IOCOMF_RREQ | HAMMER2_IOCOMF_WIDLE;
+	hammer2_ioq_init(iocom, &iocom->ioq_rx);
+	hammer2_ioq_init(iocom, &iocom->ioq_tx);
+
+	if (sock_fd >= 0) {
+		/*
+		 * OR O_NONBLOCK into the current status flags rather than
+		 * replacing them, so flags already set on the descriptor
+		 * are preserved.  If they cannot be read, fall back to
+		 * setting O_NONBLOCK alone.
+		 */
+		fl = fcntl(sock_fd, F_GETFL, 0);
+		if (fl < 0)
+			fl = 0;
+		fcntl(sock_fd, F_SETFL, fl | O_NONBLOCK);
+	}
+#if 0
+	/* if line buffered our single fgets() should be fine */
+	if (alt_fd >= 0)
+		fcntl(alt_fd, F_SETFL, O_NONBLOCK);
+#endif
+}
+
+/*
+ * Tear down an iocom.
+ *
+ * sock_fd is invalidated (set to -1; the descriptor itself is not closed
+ * here), all messages held by the rx and tx ioqs are moved to the free
+ * lists, and then both free lists are released back to the heap.
+ *
+ * Both free lists must be drained in a loop: they cache every message
+ * recycled over the life of the connection, so freeing only the first
+ * entry would leak all the others.
+ */
+void
+hammer2_iocom_done(hammer2_iocom_t *iocom)
+{
+	hammer2_msg_t *msg;
+
+	iocom->sock_fd = -1;
+	hammer2_ioq_done(iocom, &iocom->ioq_rx);
+	hammer2_ioq_done(iocom, &iocom->ioq_tx);
+
+	/* header-only messages */
+	while ((msg = TAILQ_FIRST(&iocom->freeq)) != NULL) {
+		TAILQ_REMOVE(&iocom->freeq, msg, entry);
+		free(msg);
+	}
+
+	/* messages which still own an aux_data buffer */
+	while ((msg = TAILQ_FIRST(&iocom->freeq_aux)) != NULL) {
+		TAILQ_REMOVE(&iocom->freeq_aux, msg, entry);
+		free(msg->aux_data);
+		msg->aux_data = NULL;
+		free(msg);
+	}
+}
+
+/*
+ * Allocate a message for command 'cmd' with room for 'aux_size' bytes of
+ * aux data (rounded up to the message alignment).
+ *
+ * A cached message is taken from freeq_aux when aux data is requested and
+ * from freeq otherwise; a fresh one is malloc()ed when the chosen list is
+ * empty.  The aux buffer is only reallocated when its size differs from
+ * the request.  The header bytes implied by the size field of 'cmd' are
+ * zeroed and head.cmd is set; msg->flags is cleared.
+ *
+ * NOTE(review): neither malloc() result is checked, and the number of
+ * header bytes zeroed comes straight from 'cmd' -- callers presumably
+ * pass a cmd whose size field fits within msg->any; confirm.
+ */
+hammer2_msg_t *
+hammer2_iocom_allocmsg(hammer2_iocom_t *iocom, uint32_t cmd, int aux_size)
+{
+	hammer2_msg_queue_t *pool;
+	hammer2_msg_t *msg;
+	int hbytes;
+
+	if (aux_size) {
+		aux_size = (aux_size + HAMMER2_MSG_ALIGNMASK) &
+			   ~HAMMER2_MSG_ALIGNMASK;
+		pool = &iocom->freeq_aux;
+	} else {
+		pool = &iocom->freeq;
+	}
+
+	msg = TAILQ_FIRST(pool);
+	if (msg != NULL) {
+		TAILQ_REMOVE(pool, msg, entry);
+	} else {
+		msg = malloc(sizeof(*msg));
+		msg->aux_data = NULL;
+		msg->aux_size = 0;
+	}
+
+	/*
+	 * Resize the aux buffer only when the cached one does not match.
+	 */
+	if (msg->aux_size != aux_size) {
+		if (msg->aux_data != NULL) {
+			free(msg->aux_data);
+			msg->aux_data = NULL;
+			msg->aux_size = 0;
+		}
+		if (aux_size != 0) {
+			msg->aux_data = malloc(aux_size);
+			msg->aux_size = aux_size;
+		}
+	}
+
+	hbytes = (cmd & HAMMER2_MSGF_SIZE) * HAMMER2_MSG_ALIGN;
+	msg->flags = 0;
+	bzero(&msg->any.head, hbytes);
+	msg->any.head.cmd = cmd;
+
+	return (msg);
+}
+
+/*
+ * Resize the aux buffer of an existing message to 'aux_size' bytes
+ * (rounded up to the message alignment) and clear msg->flags.
+ *
+ * The buffer is replaced only when the rounded size is non-zero and
+ * differs from the current msg->aux_size; the old contents are not
+ * preserved.  A request for zero bytes leaves any existing buffer alone.
+ * The iocom argument is not used.
+ */
+void
+hammer2_iocom_reallocmsg(hammer2_iocom_t *iocom __unused, hammer2_msg_t *msg,
+			 int aux_size)
+{
+	int rounded;
+
+	rounded = (aux_size + HAMMER2_MSG_ALIGNMASK) & ~HAMMER2_MSG_ALIGNMASK;
+	if (rounded != 0 && rounded != msg->aux_size) {
+		/* free(NULL) is a no-op, no need to test aux_data first */
+		free(msg->aux_data);
+		msg->aux_data = malloc(rounded);
+		msg->aux_size = rounded;
+	}
+	msg->flags = 0;
+}
+
+/*
+ * Return a message to the iocom for later reuse.  Nothing is released to
+ * the heap here: messages which own an aux_data buffer are appended to
+ * freeq_aux, all others to freeq.
+ */
+void
+hammer2_iocom_freemsg(hammer2_iocom_t *iocom, hammer2_msg_t *msg)
+{
+	hammer2_msg_queue_t *pool;
+
+	pool = msg->aux_data ? &iocom->freeq_aux : &iocom->freeq;
+	TAILQ_INSERT_TAIL(pool, msg, entry);
+}
+
+/*
+ * I/O core loop for an iocom.
+ *
+ * Records the three callbacks in the iocom and then polls sock_fd (plus
+ * alt_fd when it is >= 0) until HAMMER2_IOCOMF_EOF shows up in
+ * iocom->flags:
+ *
+ *  - recvmsg_func runs when sock_fd is readable, or unconditionally when
+ *    RREQ is clear.
+ *  - sendmsg_func runs, unless WIDLE is set, when sock_fd is writable or
+ *    unconditionally when WREQ is clear.
+ *  - altmsg_func runs when alt_fd is readable.
+ *
+ * When a callback is going to run unconditionally the poll timeout is
+ * forced to 0 so the loop does not stall; otherwise poll() waits up to
+ * five seconds.
+ */
+void
+hammer2_iocom_core(hammer2_iocom_t *iocom,
+		   void (*recvmsg_func)(hammer2_iocom_t *),
+		   void (*sendmsg_func)(hammer2_iocom_t *),
+		   void (*altmsg_func)(hammer2_iocom_t *))
+{
+	struct pollfd fds[2];
+	int timeout;
+
+	iocom->recvmsg_callback = recvmsg_func;
+	iocom->sendmsg_callback = sendmsg_func;
+	iocom->altmsg_callback = altmsg_func;
+
+	while ((iocom->flags & HAMMER2_IOCOMF_EOF) == 0) {
+		/*
+		 * The timeout has to be re-armed on every pass.  If it were
+		 * only set once, the first pass that forced it to 0 would
+		 * turn every later poll() into a non-blocking call and the
+		 * loop would spin.
+		 */
+		timeout = 5000;
+
+		fds[0].fd = iocom->sock_fd;
+		fds[0].events = 0;
+		fds[0].revents = 0;
+
+		if (iocom->flags & HAMMER2_IOCOMF_RREQ)
+			fds[0].events |= POLLIN;
+		else
+			timeout = 0;
+		if ((iocom->flags & HAMMER2_IOCOMF_WIDLE) == 0) {
+			if (iocom->flags & HAMMER2_IOCOMF_WREQ)
+				fds[0].events |= POLLOUT;
+			else
+				timeout = 0;
+		}
+
+		/*
+		 * fds[1] lives on the stack, so events must be assigned
+		 * (not or'd into) and revents is always cleared, even when
+		 * the entry is not handed to poll().
+		 */
+		fds[1].fd = iocom->alt_fd;
+		fds[1].events = POLLIN;
+		fds[1].revents = 0;
+
+		if (iocom->alt_fd >= 0)
+			poll(fds, 2, timeout);
+		else
+			poll(fds, 1, timeout);
+
+		if ((fds[0].revents & POLLIN) ||
+		    (iocom->flags & HAMMER2_IOCOMF_RREQ) == 0) {
+			iocom->recvmsg_callback(iocom);
+		}
+		if ((iocom->flags & HAMMER2_IOCOMF_WIDLE) == 0) {
+			if ((fds[0].revents & POLLOUT) ||
+			    (iocom->flags & HAMMER2_IOCOMF_WREQ) == 0) {
+				iocom->sendmsg_callback(iocom);
+			}
+		}
+		if (iocom->alt_fd >= 0 && (fds[1].revents & POLLIN))
+			iocom->altmsg_callback(iocom);
+	}
+}
+
+/*
+ * Read the next ready message from the ioq, issuing I/O if needed.
+ * Caller should retry on a read-event when NULL is returned.
+ *
+ * If an error occurs during reception a HAMMER2_LNK_ERROR msg will
+ * be returned (and the caller must not call us again after that).
+ *
+ * Incoming bytes are staged in iocom->rxbuf between ioq->fifo_beg and
+ * ioq->fifo_end.  The core header and the extended header are validated
+ * in place in that FIFO and then copied into the msg.  Aux data is first
+ * copied out of whatever is left in the FIFO and the remainder is read
+ * directly into msg->aux_data.  A partially received message is parked
+ * in ioq->msg and parsing resumes from ioq->state on the next call.
+ *
+ * On return HAMMER2_IOCOMF_RREQ is set when the caller has to wait for a
+ * read-event and cleared when more buffered data is ready to be parsed.
+ */
+hammer2_msg_t *
+hammer2_ioq_read(hammer2_iocom_t *iocom)
+{
+	hammer2_ioq_t *ioq = &iocom->ioq_rx;
+	hammer2_msg_t *msg;
+	hammer2_msg_hdr_t *head;
+	ssize_t n;
+	int bytes;
+	int flags;
+	int nmax;
+	uint16_t xcrc16;
+	uint32_t xcrc32;
+
+	/*
+	 * If a message is already pending we can just remove and
+	 * return it.
+	 */
+	if ((msg = TAILQ_FIRST(&ioq->msgq)) != NULL) {
+		TAILQ_REMOVE(&ioq->msgq, msg, entry);
+		return(msg);
+	}
+
+	/*
+	 * Message read in-progress (msg is NULL at the moment).  We don't
+	 * allocate a msg until we have its core header.
+	 *
+	 * bytes is the amount of data buffered in the FIFO, nmax is the
+	 * free space remaining after fifo_end.
+	 */
+	bytes = ioq->fifo_end - ioq->fifo_beg;
+	nmax = sizeof(iocom->rxbuf) - ioq->fifo_end;
+	msg = ioq->msg;
+
+	switch(ioq->state) {
+	case HAMMER2_MSGQ_STATE_HEADER1:
+		/*
+		 * Load the primary header, fail on any non-trivial read
+		 * error or on EOF.  Since the primary header is the same
+		 * size as the message alignment it will never straddle
+		 * the end of the buffer.
+		 */
+		if (bytes < (int)sizeof(msg->any.head)) {
+			n = read(iocom->sock_fd,
+				 iocom->rxbuf + ioq->fifo_end,
+				 nmax);
+			if (n <= 0) {
+				if (n == 0) {
+					ioq->error = HAMMER2_IOQ_ERROR_EOF;
+					break;
+				}
+				if (errno != EINTR &&
+				    errno != EINPROGRESS &&
+				    errno != EAGAIN) {
+					ioq->error = HAMMER2_IOQ_ERROR_SOCK;
+					break;
+				}
+				n = 0;
+				/* fall through */
+			}
+			ioq->fifo_end += n;
+			bytes += n;
+			nmax -= n;
+		}
+
+		/*
+		 * Insufficient data accumulated (msg is NULL, caller will
+		 * retry on event).
+		 */
+		assert(msg == NULL);
+		if (bytes < (int)sizeof(msg->any.head))
+			break;
+
+		flags = 0;
+		head = (void *)(iocom->rxbuf + ioq->fifo_beg);
+
+		/*
+		 * XXX Decrypt the core header
+		 */
+
+		/*
+		 * Check and fixup the core header.  Note that the icrc
+		 * has to be calculated before any fixups, but the crc
+		 * fields in the msg may have to be swapped like everything
+		 * else.
+		 */
+		if (head->magic != HAMMER2_MSGHDR_MAGIC &&
+		    head->magic != HAMMER2_MSGHDR_MAGIC_REV) {
+			ioq->error = HAMMER2_IOQ_ERROR_SYNC;
+			break;
+		}
+
+		xcrc32 = hammer2_icrc32((char *)head + HAMMER2_MSGHDR_CRCOFF,
+					HAMMER2_MSGHDR_CRCBYTES);
+		if (head->magic == HAMMER2_MSGHDR_MAGIC_REV) {
+			hammer2_bswap_head(head);
+			flags |= HAMMER2_MSGX_BSWAPPED;
+		}
+		xcrc16 = (uint16_t)xcrc32 ^ (uint16_t)(xcrc32 >> 16);
+		if (xcrc16 != head->icrc1) {
+			ioq->error = HAMMER2_IOQ_ERROR_HCRC;
+			break;
+		}
+
+		/*
+		 * Calculate the full header size and aux data size
+		 */
+		ioq->hbytes = (head->cmd & HAMMER2_MSGF_SIZE) *
+			      HAMMER2_MSG_ALIGN;
+		ioq->abytes = head->aux_bytes * HAMMER2_MSG_ALIGN;
+		if (ioq->hbytes < (int)sizeof(msg->any.head) ||
+		    ioq->hbytes > (int)sizeof(msg->any) ||
+		    ioq->abytes > HAMMER2_MSGAUX_MAX) {
+			ioq->error = HAMMER2_IOQ_ERROR_FIELD;
+			break;
+		}
+
+		/*
+		 * Finally allocate the message and copy the core header
+		 * to the embedded extended header.
+		 */
+		if (ioq->abytes) {
+			if ((msg = TAILQ_FIRST(&iocom->freeq_aux)) != NULL) {
+				TAILQ_REMOVE(&iocom->freeq_aux, msg, entry);
+			} else {
+				msg = malloc(sizeof(*msg));
+				msg->aux_data = NULL;
+				msg->aux_size = 0;
+			}
+			if (msg->aux_size != ioq->abytes) {
+				if (msg->aux_data) {
+					free(msg->aux_data);
+					msg->aux_data = NULL;
+				}
+				msg->aux_data = malloc(ioq->abytes);
+				/* msg->aux_size = ioq->abytes; */
+			}
+		} else {
+			if ((msg = TAILQ_FIRST(&iocom->freeq)) != NULL) {
+				TAILQ_REMOVE(&iocom->freeq, msg, entry);
+			} else {
+				msg = malloc(sizeof(*msg));
+				msg->aux_data = NULL;
+				/* msg->aux_size = 0; */
+			}
+		}
+		msg->aux_size = 0;	/* data copied so far */
+		msg->flags = flags;
+		ioq->msg = msg;
+
+		/*
+		 * We are either done or we fall-through
+		 */
+		if (ioq->hbytes == sizeof(msg->any.head) && ioq->abytes == 0) {
+			bcopy(head, &msg->any.head, sizeof(msg->any.head));
+			ioq->fifo_beg += ioq->hbytes;
+			break;
+		}
+
+		/*
+		 * Fall through to the next state.  Make sure that the
+		 * extended header does not straddle the end of the buffer.
+		 * We still want to issue larger reads into our buffer,
+		 * book-keeping is easier if we don't bcopy() yet.
+		 */
+		if (bytes + nmax < ioq->hbytes) {
+			bcopy(iocom->rxbuf + ioq->fifo_beg, iocom->rxbuf,
+			      bytes);
+			ioq->fifo_beg = 0;
+			ioq->fifo_end = bytes;
+			nmax = sizeof(iocom->rxbuf) - ioq->fifo_end;
+		}
+		ioq->state = HAMMER2_MSGQ_STATE_HEADER2;
+		/* fall through */
+	case HAMMER2_MSGQ_STATE_HEADER2:
+		/*
+		 * Fill out the extended header.
+		 *
+		 * The read must land in the rxbuf FIFO, not in the msg:
+		 * fifo_end indexes rxbuf, and the header is validated in
+		 * the FIFO below before being copied into the msg.
+		 */
+		assert(msg != NULL);
+		if (bytes < ioq->hbytes) {
+			n = read(iocom->sock_fd,
+				 iocom->rxbuf + ioq->fifo_end,
+				 nmax);
+			if (n <= 0) {
+				if (n == 0) {
+					ioq->error = HAMMER2_IOQ_ERROR_EOF;
+					break;
+				}
+				if (errno != EINTR &&
+				    errno != EINPROGRESS &&
+				    errno != EAGAIN) {
+					ioq->error = HAMMER2_IOQ_ERROR_SOCK;
+					break;
+				}
+				n = 0;
+				/* fall through */
+			}
+			ioq->fifo_end += n;
+			bytes += n;
+			nmax -= n;
+		}
+
+		/*
+		 * Insufficient data accumulated (set msg NULL so caller will
+		 * retry on event).
+		 */
+		if (bytes < ioq->hbytes) {
+			msg = NULL;
+			break;
+		}
+
+		/*
+		 * XXX Decrypt the extended header
+		 */
+		head = (void *)(iocom->rxbuf + ioq->fifo_beg);
+
+		/*
+		 * Check the crc on the extended header
+		 */
+		if (ioq->hbytes > (int)sizeof(hammer2_msg_hdr_t)) {
+			xcrc32 = hammer2_icrc32(head + 1,
+						ioq->hbytes - sizeof(*head));
+			xcrc16 = (uint16_t)xcrc32 ^ (uint16_t)(xcrc32 >> 16);
+			if (head->icrc2 != xcrc16) {
+				ioq->error = HAMMER2_IOQ_ERROR_XCRC;
+				break;
+			}
+		}
+
+		/*
+		 * Copy the extended header into the msg and adjust the
+		 * FIFO.
+		 */
+		bcopy(head, &msg->any, ioq->hbytes);
+
+		/*
+		 * We are either done or we fall-through.
+		 */
+		if (ioq->abytes == 0) {
+			ioq->fifo_beg += ioq->hbytes;
+			break;
+		}
+
+		/*
+		 * Must adjust nmax and bytes (and the state) when falling
+		 * through.
+		 */
+		ioq->fifo_beg += ioq->hbytes;
+		nmax -= ioq->hbytes;
+		bytes -= ioq->hbytes;
+		ioq->state = HAMMER2_MSGQ_STATE_AUXDATA1;
+		/* fall through */
+	case HAMMER2_MSGQ_STATE_AUXDATA1:
+		/*
+		 * Copy the partial or complete payload from remaining
+		 * bytes in the FIFO.  We have to fall-through either
+		 * way so we can check the crc.
+		 */
+		assert(msg->aux_size == 0);
+		if (bytes >= ioq->abytes) {
+			bcopy(iocom->rxbuf + ioq->fifo_beg, msg->aux_data,
+			      ioq->abytes);
+			msg->aux_size = ioq->abytes;
+			ioq->fifo_beg += ioq->abytes;
+			bytes -= ioq->abytes;
+		} else if (bytes) {
+			bcopy(iocom->rxbuf + ioq->fifo_beg, msg->aux_data,
+			      bytes);
+			msg->aux_size = bytes;
+			ioq->fifo_beg += bytes;
+			bytes = 0;
+		}
+		ioq->state = HAMMER2_MSGQ_STATE_AUXDATA2;
+		/* fall through */
+	case HAMMER2_MSGQ_STATE_AUXDATA2:
+		/*
+		 * Read the remainder of the payload directly into the
+		 * msg->aux_data buffer.
+		 */
+		assert(msg);
+		if (msg->aux_size < ioq->abytes) {
+			assert(bytes == 0);
+			n = read(iocom->sock_fd,
+				 msg->aux_data + msg->aux_size,
+				 ioq->abytes - msg->aux_size);
+			if (n <= 0) {
+				if (n == 0) {
+					ioq->error = HAMMER2_IOQ_ERROR_EOF;
+					break;
+				}
+				if (errno != EINTR &&
+				    errno != EINPROGRESS &&
+				    errno != EAGAIN) {
+					ioq->error = HAMMER2_IOQ_ERROR_SOCK;
+					break;
+				}
+				n = 0;
+				/* fall through */
+			}
+			msg->aux_size += n;
+		}
+
+		/*
+		 * Insufficient data accumulated (set msg NULL so caller will
+		 * retry on event).
+		 */
+		if (msg->aux_size < ioq->abytes) {
+			msg = NULL;
+			break;
+		}
+		assert(msg->aux_size == ioq->abytes);
+
+		/*
+		 * XXX Decrypt the data
+		 */
+
+		/*
+		 * Check aux_icrc, then we are done.
+		 */
+		xcrc32 = hammer2_icrc32(msg->aux_data, msg->aux_size);
+		if (xcrc32 != msg->any.head.aux_icrc) {
+			ioq->error = HAMMER2_IOQ_ERROR_ACRC;
+			break;
+		}
+		break;
+	case HAMMER2_MSGQ_STATE_ERROR:
+	default:
+		/*
+		 * We don't double-return errors, the caller should not
+		 * have called us again after getting an error msg.
+		 */
+		assert(0);
+		break;
+	}
+
+	/*
+	 * Handle error, RREQ, or completion
+	 *
+	 * NOTE: nmax and bytes are invalid at this point, we don't bother
+	 *	 to update them when breaking out.
+	 */
+	if (ioq->error) {
+		/*
+		 * An unrecoverable error occurred during processing,
+		 * return a special error message.  Try to leave the
+		 * ioq state alone for post-mortem debugging.
+		 *
+		 * Link error messages are returned as one-way messages,
+		 * so no flags get set.  Source and target is 0 (link-level),
+		 * msgid is 0 (link-level).  All we really need to do is
+		 * set up magic, cmd, and error.
+		 */
+		if (msg == NULL) {
+			if ((msg = TAILQ_FIRST(&iocom->freeq)) != NULL) {
+				TAILQ_REMOVE(&iocom->freeq, msg, entry);
+			} else {
+				msg = malloc(sizeof(*msg));
+				msg->aux_data = NULL;
+				msg->aux_size = 0;
+			}
+			assert(ioq->msg == NULL);
+		} else {
+			assert(ioq->msg == msg);
+			ioq->msg = NULL;
+		}
+		if (msg->aux_data) {
+			free(msg->aux_data);
+			msg->aux_data = NULL;
+			msg->aux_size = 0;
+		}
+		bzero(&msg->any.head, sizeof(msg->any.head));
+		msg->any.head.magic = HAMMER2_MSGHDR_MAGIC;
+		msg->any.head.cmd = HAMMER2_LNK_ERROR;
+		msg->any.head.error = ioq->error;
+		ioq->state = HAMMER2_MSGQ_STATE_ERROR;
+		iocom->flags |= HAMMER2_IOCOMF_EOF;
+	} else if (msg == NULL) {
+		/*
+		 * Insufficient data received to finish building the message,
+		 * set RREQ and return NULL.
+		 *
+		 * Leave ioq->msg intact.
+		 * Leave the FIFO intact: it may hold a partial header that
+		 * the next call has to continue from.  The indices are only
+		 * rewound when the FIFO is empty, which discards nothing.
+		 */
+		iocom->flags |= HAMMER2_IOCOMF_RREQ;
+		if (ioq->fifo_beg == ioq->fifo_end) {
+			ioq->fifo_beg = 0;
+			ioq->fifo_end = 0;
+		}
+	} else {
+		/*
+		 * Return msg, clear the FIFO if it is now empty.
+		 * Flag RREQ if the caller needs to wait for a read-event
+		 * or not.
+		 *
+		 * The fifo has already been advanced past the message.
+		 * Trivially reset the FIFO indices if possible.
+		 */
+		if (ioq->fifo_beg == ioq->fifo_end) {
+			iocom->flags |= HAMMER2_IOCOMF_RREQ;
+			ioq->fifo_beg = 0;
+			ioq->fifo_end = 0;
+		} else {
+			iocom->flags &= ~HAMMER2_IOCOMF_RREQ;
+		}
+		ioq->state = HAMMER2_MSGQ_STATE_HEADER1;
+		ioq->msg = NULL;
+	}
+	return (msg);
+}
+
+/*
+ * Calculate the header and data crc's and write a low-level message to
+ * the connection.  If aux_icrc is non-zero the aux_data crc is already
+ * assumed to have been set.
+ *
+ * A non-NULL msg is added to the queue but not necessarily flushed.
+ * Calling this function with msg == NULL will get a flush going.
+ *
+ * Once the tx ioq has recorded an error, any msg passed in is queued and
+ * the whole queue is immediately discarded via hammer2_ioq_write_drain().
+ *
+ * For the tx ioq, ioq->hbytes and ioq->abytes track how much of the first
+ * queued message's header and aux data a previous partial writev() has
+ * already sent.
+ */
+void
+hammer2_ioq_write(hammer2_iocom_t *iocom, hammer2_msg_t *msg)
+{
+	hammer2_ioq_t *ioq = &iocom->ioq_tx;
+	ssize_t nmax;
+	ssize_t nact;
+	int hbytes;
+	int abytes;
+	int hoff;
+	int aoff;
+	uint16_t xcrc16;
+	uint32_t xcrc32;
+	struct iovec iov[HAMMER2_IOQ_MAXIOVEC];
+	int n;
+
+	if (ioq->error) {
+		if (msg) {
+			TAILQ_INSERT_TAIL(&ioq->msgq, msg, entry);
+			++ioq->msgcount;
+		}
+		hammer2_ioq_write_drain(iocom);
+		return;
+	}
+
+	if (msg) {
+		/*
+		 * Finish populating the msg fields.  The shift is done in
+		 * unsigned arithmetic so it cannot overflow a signed long;
+		 * the low 32 bits are the same either way.
+		 */
+		msg->any.head.magic = HAMMER2_MSGHDR_MAGIC;
+		msg->any.head.salt = ((uint32_t)random() << 8) |
+				     (ioq->seq & 255);
+		++ioq->seq;
+
+		/*
+		 * Calculate aux_icrc if 0, calculate icrc2, and finally
+		 * calculate icrc1.
+		 */
+		if (msg->aux_size && msg->any.head.aux_icrc == 0) {
+			assert((msg->aux_size & HAMMER2_MSG_ALIGNMASK) == 0);
+			xcrc32 = hammer2_icrc32(msg->aux_data, msg->aux_size);
+			msg->any.head.aux_icrc = xcrc32;
+		}
+		msg->any.head.aux_bytes = msg->aux_size / HAMMER2_MSG_ALIGN;
+		assert((msg->aux_size & HAMMER2_MSG_ALIGNMASK) == 0);
+
+		if ((msg->any.head.cmd & HAMMER2_MSGF_SIZE) >
+		    sizeof(msg->any.head) / HAMMER2_MSG_ALIGN) {
+			hbytes = (msg->any.head.cmd & HAMMER2_MSGF_SIZE) *
+				HAMMER2_MSG_ALIGN;
+			hbytes -= sizeof(msg->any.head);
+			xcrc32 = hammer2_icrc32(&msg->any.head + 1, hbytes);
+			xcrc16 = (uint16_t)xcrc32 ^ (uint16_t)(xcrc32 >> 16);
+			msg->any.head.icrc2 = xcrc16;
+		} else {
+			msg->any.head.icrc2 = 0;
+		}
+		xcrc32 = hammer2_icrc32(msg->any.buf + HAMMER2_MSGHDR_CRCOFF,
+					HAMMER2_MSGHDR_CRCBYTES);
+		xcrc16 = (uint16_t)xcrc32 ^ (uint16_t)(xcrc32 >> 16);
+		msg->any.head.icrc1 = xcrc16;
+
+		/*
+		 * XXX Encrypt the message
+		 */
+
+		/*
+		 * Enqueue the message, stop now if we already know that
+		 * we can't write.
+		 */
+		TAILQ_INSERT_TAIL(&ioq->msgq, msg, entry);
+		++ioq->msgcount;
+		iocom->flags &= ~HAMMER2_IOCOMF_WIDLE;
+		if (iocom->flags & HAMMER2_IOCOMF_WREQ)
+			return;
+
+		/*
+		 * Flush if we can aggregate several msgs, otherwise
+		 * we will wait for the global flush (msg == NULL).
+		 */
+		if (ioq->msgcount < HAMMER2_IOQ_MAXIOVEC / 2)
+			return;
+	} else if (iocom->flags & HAMMER2_IOCOMF_WIDLE) {
+		/*
+		 * Nothing to do if WIDLE is set.
+		 *
+		 * This must be a pure test ('&', not '&='): assigning here
+		 * would wipe every other flag in iocom->flags, including
+		 * RREQ, WREQ and EOF.
+		 */
+		assert(TAILQ_FIRST(&ioq->msgq) == NULL);
+		return;
+	}
+
+	/*
+	 * Pump messages out the connection by building an iovec.  Only
+	 * the first message can be partially sent, so the saved offsets
+	 * apply to it alone.
+	 */
+	n = 0;
+	nmax = 0;
+
+	TAILQ_FOREACH(msg, &ioq->msgq, entry) {
+		hoff = 0;
+		hbytes = (msg->any.head.cmd & HAMMER2_MSGF_SIZE) *
+			 HAMMER2_MSG_ALIGN;
+		aoff = 0;
+		abytes = msg->aux_size;
+		if (n == 0) {
+			hoff += ioq->hbytes;
+			aoff += ioq->abytes;
+		}
+		if (hbytes - hoff > 0) {
+			iov[n].iov_base = (char *)&msg->any.head + hoff;
+			iov[n].iov_len = hbytes - hoff;
+			nmax += hbytes - hoff;
+			++n;
+			if (n == HAMMER2_IOQ_MAXIOVEC)
+				break;
+		}
+		if (abytes - aoff > 0) {
+			assert(msg->aux_data != NULL);
+			iov[n].iov_base = msg->aux_data + aoff;
+			iov[n].iov_len = abytes - aoff;
+			nmax += abytes - aoff;
+			++n;
+			if (n == HAMMER2_IOQ_MAXIOVEC)
+				break;
+		}
+	}
+	if (n == 0)
+		return;
+
+	/*
+	 * Execute the writev() then figure out what happened.
+	 */
+	nact = writev(iocom->sock_fd, iov, n);
+	if (nact < 0) {
+		if (errno != EINTR &&
+		    errno != EINPROGRESS &&
+		    errno != EAGAIN) {
+			ioq->error = HAMMER2_IOQ_ERROR_SOCK;
+			hammer2_ioq_write_drain(iocom);
+		} else {
+			iocom->flags |= HAMMER2_IOCOMF_WREQ;
+		}
+		return;
+	}
+	if (nact == nmax)
+		iocom->flags &= ~HAMMER2_IOCOMF_WREQ;
+	else
+		iocom->flags |= HAMMER2_IOCOMF_WREQ;
+
+	/*
+	 * Retire fully written messages to the free lists and record how
+	 * far into the next one the write got.
+	 */
+	while ((msg = TAILQ_FIRST(&ioq->msgq)) != NULL) {
+		hbytes = (msg->any.head.cmd & HAMMER2_MSGF_SIZE) *
+			 HAMMER2_MSG_ALIGN;
+		abytes = msg->aux_size;
+
+		if (nact < hbytes - ioq->hbytes) {
+			ioq->hbytes += nact;
+			break;
+		}
+		nact -= hbytes - ioq->hbytes;
+		ioq->hbytes = hbytes;
+		if (nact < abytes - ioq->abytes) {
+			ioq->abytes += nact;
+			break;
+		}
+		nact -= abytes - ioq->abytes;
+
+		TAILQ_REMOVE(&ioq->msgq, msg, entry);
+		--ioq->msgcount;
+		ioq->hbytes = 0;
+		ioq->abytes = 0;
+		if (msg->aux_data)
+			TAILQ_INSERT_TAIL(&iocom->freeq_aux, msg, entry);
+		else
+			TAILQ_INSERT_TAIL(&iocom->freeq, msg, entry);
+	}
+	if (msg == NULL) {
+		iocom->flags |= HAMMER2_IOCOMF_WIDLE;
+		iocom->flags &= ~HAMMER2_IOCOMF_WREQ;
+	}
+	if (ioq->error) {
+		iocom->flags |= HAMMER2_IOCOMF_EOF |
+				HAMMER2_IOCOMF_WIDLE;
+		iocom->flags &= ~HAMMER2_IOCOMF_WREQ;
+	}
+}
+
+/*
+ * Discard everything queued on ioq_tx and mark the write side idle
+ * (WIDLE set, WREQ cleared) so that no further write events are
+ * requested.  The discarded messages go back to the iocom free lists.
+ *
+ * The read side is deliberately left alone: the caller still has to
+ * pull the contrived terminal error msg off of it in order to notice
+ * the connection failure.
+ */
+void
+hammer2_ioq_write_drain(hammer2_iocom_t *iocom)
+{
+	hammer2_ioq_t *txq = &iocom->ioq_tx;
+	hammer2_msg_t *victim;
+
+	for (;;) {
+		victim = TAILQ_FIRST(&txq->msgq);
+		if (victim == NULL)
+			break;
+		TAILQ_REMOVE(&txq->msgq, victim, entry);
+		--txq->msgcount;
+		hammer2_iocom_freemsg(iocom, victim);
+	}
+
+	iocom->flags &= ~HAMMER2_IOCOMF_WREQ;
+	iocom->flags |= HAMMER2_IOCOMF_WIDLE;
+}
+
+/*
+ * Turn a message around and queue it for transmission: the (source)
+ * and (target) header fields are exchanged, the REPLY bit in cmd is
+ * toggled, and the message is handed to hammer2_ioq_write().
+ */
+void
+hammer2_ioq_reply(hammer2_iocom_t *iocom, hammer2_msg_t *msg)
+{
+	hammer2_msg_hdr_t *head = &msg->any.head;
+	uint16_t old_source = head->source;
+
+	head->source = head->target;
+	head->target = old_source;
+	head->cmd ^= HAMMER2_MSGF_REPLY;
+
+	hammer2_ioq_write(iocom, msg);
+}
+
+/*
+ * Send a terminating reply carrying 'error' for a message that has
+ * CREATE set in its cmd: DELETE is added to the cmd, the error field is
+ * filled in, and the message goes out through hammer2_ioq_reply().
+ *
+ * Messages without CREATE are left untouched and nothing is sent.
+ */
+void
+hammer2_ioq_reply_term(hammer2_iocom_t *iocom, hammer2_msg_t *msg,
+		       uint16_t error)
+{
+	if ((msg->any.head.cmd & HAMMER2_MSGF_CREATE) == 0)
+		return;
+
+	msg->any.head.error = error;
+	msg->any.head.cmd |= HAMMER2_MSGF_CREATE | HAMMER2_MSGF_DELETE;
+	hammer2_ioq_reply(iocom, msg);
+}
--- /dev/null
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/***************************************************************************
+ * LOW LEVEL MESSAGING *
+ ***************************************************************************
+ *
+ * hammer2_msg - A standalone copy of a message, typically referenced by
+ * or embedded in other structures, or used with I/O queues.
+ *
+ * These structures are strictly temporary, so they do not have to be
+ * particularly optimized for size. All possible message headers are
+ * directly embedded (any), and the message may contain a reference
+ * to allocated auxiliary data. The structure is recycled quite often
+ * by a connection.
+ *
+ * This structure is typically not used for storing persistent message
+ * state (see hammer2_pmsg for that).
+ */
+struct hammer2_msg {
+ TAILQ_ENTRY(hammer2_msg) entry; /* linkage for ioq msgq / iocom free lists */
+ char *aux_data; /* aux-data if any */
+ int aux_size; /* bytes in aux_data (on receive: bytes copied so far) */
+ int flags; /* HAMMER2_MSGX_* */
+ hammer2_any_t any; /* raw extended msg header */
+};
+
+typedef struct hammer2_msg hammer2_msg_t;
+
+TAILQ_HEAD(hammer2_msg_queue, hammer2_msg); /* tail queue of msgs linked via 'entry' */
+typedef struct hammer2_msg_queue hammer2_msg_queue_t;
+
+#define HAMMER2_MSGX_BSWAPPED 0x0001 /* msg->flags: core header arrived byte-swapped */
+
+/*
+ * hammer2_ioq - An embedded component of hammer2_iocom, holds state
+ * for the buffering and parsing of incoming and outgoing messages.
+ *
+ * The same structure serves both directions (ioq_rx and ioq_tx), so
+ * several fields mean different things depending on which one it is;
+ * see the per-field notes below.
+ */
+struct hammer2_ioq {
+ enum { HAMMER2_MSGQ_STATE_HEADER1,
+ HAMMER2_MSGQ_STATE_HEADER2,
+ HAMMER2_MSGQ_STATE_AUXDATA1,
+ HAMMER2_MSGQ_STATE_AUXDATA2,
+ HAMMER2_MSGQ_STATE_ERROR } state; /* receive parser state */
+ int fifo_beg; /* buffered data: start offset in iocom rxbuf */
+ int fifo_end; /* buffered data: end offset in iocom rxbuf */
+ int hbytes; /* rx: header size, tx: header bytes already sent */
+ int abytes; /* rx: aux_data size, tx: aux bytes already sent */
+ int error; /* HAMMER2_IOQ_ERROR_*, 0 if none */
+ int seq; /* salt sequencer */
+ int msgcount; /* tx: number of msgs on msgq */
+ hammer2_msg_t *msg; /* rx: message currently being assembled */
+ hammer2_msg_queue_t msgq; /* queued messages */
+};
+
+typedef struct hammer2_ioq hammer2_ioq_t;
+
+#define HAMMER2_IOQ_ERROR_SYNC 1 /* bad magic / out of sync */
+#define HAMMER2_IOQ_ERROR_EOF 2 /* unexpected EOF */
+#define HAMMER2_IOQ_ERROR_SOCK 3 /* read() or writev() error on socket */
+#define HAMMER2_IOQ_ERROR_FIELD 4 /* invalid field */
+#define HAMMER2_IOQ_ERROR_HCRC 5 /* core header crc bad */
+#define HAMMER2_IOQ_ERROR_XCRC 6 /* ext header crc bad */
+#define HAMMER2_IOQ_ERROR_ACRC 7 /* aux data crc bad */
+#define HAMMER2_IOQ_ERROR_STATE 8 /* bad state */
+
+#define HAMMER2_IOQ_MAXIOVEC 16 /* max iovec entries per writev() when flushing */
+
+/*
+ * hammer2_iocom - governs a messaging stream connection
+ *
+ * Holds the receive and transmit ioqs, the two free lists used to
+ * recycle messages, the callbacks invoked by the core loop, and the
+ * descriptors being polled.
+ */
+struct hammer2_iocom {
+ hammer2_ioq_t ioq_rx; /* receive side */
+ hammer2_ioq_t ioq_tx; /* transmit side */
+ hammer2_msg_queue_t freeq; /* free msgs hdr only */
+ hammer2_msg_queue_t freeq_aux; /* free msgs w/aux_data */
+ void (*recvmsg_callback)(struct hammer2_iocom *); /* sock_fd readable */
+ void (*sendmsg_callback)(struct hammer2_iocom *); /* sock_fd writable */
+ void (*altmsg_callback)(struct hammer2_iocom *); /* alt_fd readable */
+ int sock_fd; /* comm socket or pipe */
+ int alt_fd; /* thread signal, tty, etc */
+ int flags; /* HAMMER2_IOCOMF_* */
+ char rxbuf[HAMMER2_MSGBUF_SIZE]; /* for ioq_rx only */
+};
+
+typedef struct hammer2_iocom hammer2_iocom_t;
+
+#define HAMMER2_IOCOMF_EOF 0x00000001 /* EOF or ERROR on desc */
+#define HAMMER2_IOCOMF_RREQ 0x00000002 /* request read-data event */
+#define HAMMER2_IOCOMF_WREQ 0x00000004 /* request write-avail event */
+#define HAMMER2_IOCOMF_WIDLE 0x00000008 /* tx queue empty, nothing to write */
+#define HAMMER2_IOCOMF_SIGNAL 0x00000010
+
+/***************************************************************************
+ * HIGH LEVEL MESSAGING *
+ ***************************************************************************
+ *
+ */
+
+#if 0
+
+
+
+/*
+ * The global registration structure consolidates information accumulated
+ * via the spanning tree algorithm and tells us which connection (link)
+ * is the best path to get to any given registration.
+ *
+ * glob_node - Splay entry for this registration in the global index
+ * of all registrations.
+ *
+ * glob_entry - tailq entry when this registration's best_span element
+ * has changed state.
+ *
+ * span_list - Head of a simple list of spanning tree entries which
+ * we use to determine the best link.
+ *
+ * best_span - Which of the span structure on span_list is the best
+ * one.
+ *
+ * source_root - Splay tree root indexing all messages sent from this
+ * registration. The messages are indexed by
+ * {linkid,msgid} XXX
+ *
+ * target_root - Splay tree root indexing all messages being sent to
+ * this registration. The messages are indexed by
+ * {linkid,msgid}. XXX
+ *
+ *
+ * Whenever spanning tree data causes a registration's best_link field to
+ * change that registration is transmitted as spanning tree data to every
+ * active link. Note that pure clients to the cluster, of which there can
+ * be millions, typically do not transmit spanning tree data to each other.
+ *
+ * Each registration is assigned a unique linkid local to the node (another
+ * node might assign a different linkid to the same registration). This
+ * linkid must be persistent as long as messages are active and is used
+ * to identify the message source and target.
+ */
+TAILQ_HEAD(hammer2_span_list, hammer2_span);
+typedef struct hammer2_span_list hammer2_span_list_t;
+
+struct hammer2_reg {
+ SPLAY_ENTRY(hammer2_reg) glob_node; /* index of registrations */
+ TAILQ_ENTRY(hammer2_reg) glob_entry; /* when modified */
+ hammer2_span_list_t span_list; /* list of hammer2_span's */
+ hammer2_span_t *best_span; /* best span entry */
+ hammer2_pmsg_splay_head_t source_root; /* msgs sent from reg */
+ hammer2_pmsg_splay_head_t target_root; /* msgs sent to reg */
+ uuid_t pfs_id; /* key field */
+ uuid_t pfs_fsid; /* key field */
+ uint32_t linkid; /* node-local id for this registration */
+ int flags; /* presumably HAMMER2_PROTO_REGF_* - TODO confirm */
+ int refs; /* presumably a reference count - TODO confirm */
+};
+
+#define HAMMER2_PROTO_REGF_MODIFIED 0x0001
+
+/*
+ * Each link (connection) collects spanning tree data received via the
+ * link and stores it in these span structures.
+ *
+ * A span sits on two indexes at once: the span_list of the registration
+ * it describes and a splay tree belonging to the link it arrived on.
+ */
+struct hammer2_span {
+ TAILQ_ENTRY(hammer2_span) span_entry; /* from hammer2_reg */
+ SPLAY_ENTRY(hammer2_span) span_node; /* from hammer2_link */
+ hammer2_reg_t *reg; /* registration this span belongs to */
+ hammer2_link_t *link; /* link the span data was received on */
+ int weight; /* presumably used to pick best_span - TODO confirm */
+};
+
+/*
+ * Most hammer2 messages represent transactions and have persistent state
+ * which must be recorded. Some messages, such as cache states and inode
+ * representations are very long-lasting transactions.
+ *
+ * Each node in the graph must keep track of the message state in order
+ * to perform the proper action when a connection is lost. To do this
+ * the message is indexed on the source and target (global) registration,
+ * and the actual span element the message was received on and transmitted
+ * to is recorded (allowing us to retrieve the physical links involved).
+ *
+ * The {source_reg, target_reg, msgid} uniquely identifies a message. Any
+ * streaming operations using the same msgid use the same rendezvous.
+ *
+ * It is important to note that recorded state must use the same physical
+ * link (and thus the same chain of links across the graph) as was 'forged'
+ * by the initial message for that msgid. If the source span a message is
+ * received on does not match the recorded source, or the recorded target
+ * is no longer routeable, the message will be returned or generate an ABORT
+ * with LINKFAIL as appropriate.
+ */
+struct hammer2_pmsg {
+ SPLAY_ENTRY(hammer2_pmsg) source_reg; /* index on source registration */
+ SPLAY_ENTRY(hammer2_pmsg) target_reg; /* index on target registration */
+ hammer2_span_t *source; /* span the message was received on */
+ hammer2_span_t *target; /* span the message was transmitted to */
+ uint16_t msgid; /* with source/target reg, identifies the message */
+ void *aux_data; /* allocated aux data */
+ hammer2_msg_any_t any; /* dynamically allocated */
+};
+
+#endif
return (fd);
}
+/*
+ * Execute the specified function as a detached independent process/daemon,
+ * unless we are in debug mode. If we are in debug mode the function is
+ * executed as a pthread in the current process.
+ */
void
-hammer2_disconnect(void *(*func)(void *), void *arg)
+hammer2_demon(void *(*func)(void *), void *arg)
{
pthread_t thread = NULL;
pid_t pid;
pthread_exit(NULL);
_exit(2); /* NOT REACHED */
}
+
+/*
+ * Reverse the byte order of every field in a core message header, in
+ * place.  Only the core hammer2_msg_hdr is touched; any extended header
+ * which follows it is left exactly as it was.
+ */
+void
+hammer2_bswap_head(hammer2_msg_hdr_t *head)
+{
+	/* 16-bit fields */
+	head->magic	= bswap16(head->magic);
+	head->icrc1	= bswap16(head->icrc1);
+	head->source	= bswap16(head->source);
+	head->target	= bswap16(head->target);
+	head->error	= bswap16(head->error);
+	head->resv05	= bswap16(head->resv05);
+	head->icrc2	= bswap16(head->icrc2);
+	head->aux_bytes	= bswap16(head->aux_bytes);
+
+	/* 32-bit fields */
+	head->salt	= bswap32(head->salt);
+	head->msgid	= bswap32(head->msgid);
+	head->cmd	= bswap32(head->cmd);
+	head->aux_icrc	= bswap32(head->aux_icrc);
+}
--- /dev/null
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef VFS_HAMMER2_NETWORK_H_
+#define VFS_HAMMER2_NETWORK_H_
+
+#ifndef _VFS_HAMMER2_DISK_H_
+#include "hammer2_disk.h"
+#endif
+
+/*
+ * Mesh network protocol structures.
+ *
+ * The mesh is constructed from point-to-point streaming links with varying
+ * levels of interconnectedness, forming a graph. When a link is established
+ * link id #0 is reserved for link-level communications. This link is used
+ * for authentication, registration, ping, further link id negotiations,
+ * spanning tree, and so on.
+ *
+ * The spanning tree forms a weighted shortest-path-first graph amongst
+ * those nodes with sufficient administrative rights to relay between
+ * registrations. Each link maintains a full reachability set, aggregates
+ * it, and retransmits via the shortest path. However, leaf nodes (even leaf
+ * nodes with multiple connections) can opt not to be part of the spanning
+ * tree and typically (due to administrative rights) their registrations
+ * are not reported to other leafs.
+ *
+ * All message responses follow the SAME PATH that the original message
+ * followed, but in reverse. This is an absolute requirement since messages
+ * expecting replies record persistent state at each hop.
+ *
+ * Message state is handled by the CREATE, DELETE, REPLY, and ABORT
+ * flags. Message state is typically recorded at the end points and
+ * at each hop until a DELETE is received from both sides.
+ *
+ * One-way messages such as those used by spanning tree commands are not
+ * recorded. These are sent with no flags set. Aborts and replies are not
+ * possible.
+ *
+ * A normal message with no persistent state is sent with CREATE|DELETE and
+ * the response is returned with REPLY|CREATE|DELETE. A normal command can
+ * be aborted by sending an ABORT message to the msgid that is in progress.
+ * An ABORT sent by the originator must still wait for the reply from the
+ * target and, since we've already sent the DELETE with our CREATE|DELETE,
+ * may also cross the REPLY|CREATE|DELETE message in the opposite direction.
+ * In this situation the message state has been destroyed on the target and
+ * the target ignores the ABORT (because CREATE is not set, and
+ * differentiated from one-way messages because ABORT is set).
+ *
+ * A command which has persistent state must maintain a persistent message.
+ * For example, a lock or cache state request. A persistent message is
+ * initiated with just CREATE and the initial response is returned with
+ * REPLY|CREATE. Successive messages are sent with no flags and responses
+ * with just REPLY. The DELETE flag acts like a half-close (and degenerately
+ * works the same as it does for normal messages). This flag can be set
+ * in the initial command or any successive command, and in the initial reply
+ * or in any successive reply. The recorded message state is destroyed when
+ * both sides have sent a DELETE.
+ *
+ * Aborts for persistent messages work in the same fashion as they do for
+ * normal messages, except that the target can also initiate an ABORT
+ * by using ABORT|REPLY. The target has one restriction, however, it cannot
+ * send an ABORT with the CREATE flag set (i.e. as the initial reply),
+ * because if the originator reuses the msgid the originator would not
+ * then be able to determine that the ABORT is associated with the previous
+ * session and not the new session.
+ *
+ * If a link failure occurs any active or persistent messages will be
+ * auto-replied to the originator, and auto-aborted to the target.
+ *
+ * Additional features:
+ *
+ * ABORT+CREATE - This may be used to make a non-blocking request.
+ * The target receives the normal command and is free
+ * to ignore the ABORT flag, but may use it as an
+ * indication that a non-blocking request is being
+ * made. The target must still reply to the message of
+ * course. Works for normal and persistent messages
+ * but does NOT work for one-way messages (because
+ * ABORT alone without recorded msgid state has to be
+ * ignored).
+ *
+ * ABORT - ABORT messages are allowed to bypass input queues.
+ * Normal ABORTs are sent without the DELETE flag,
+ * even for normal messages which had already set the
+ * DELETE flag in the initial message. This allows
+ * the normal DELETE half-close operation to proceed
+ * so an ABORT is basically advisory and the originator
+ * must still wait for a reply. Aborts are also
+ * advisory when sent by targets.
+ *
+ * ABORT messages cannot be used with one-way messages
+ * as this would cause such messages to be ignored.
+ *
+ * ABORT+DELETE - This is a special form of ABORT that allows the
+ * recorded message state on the sender and on all
+ * hops the message is relayed through to be destroyed
+ * on the fly, as if a two-way DELETE had occurred.
+ * It will cause an auto-reply or auto-abort to be
+ * issued as if the link had been lost, but allows
+ * the link to remain live.
+ *
+ * This form is basically like a socket close(),
+ * where you aren't just sending an EOF but you are
+ * completely aborting the request in both directions.
+ *
+ * This form cannot be used with CREATE as that could
+ * generate a false reply if msgid is reused and
+ * crosses the abort over the wire.
+ *
+ * ABORT messages cannot be used with one-way messages
+ * as this would cause such messages to be ignored.
+ *
+ * SUBSTREAMS - Persistent messages coupled with the fact that
+ * all commands and responses run through a single
+ * chain of relays over reliable streams allows one
+ * to treat persistent message updates as a data
+ * stream and use the DELETE flag or an ABORT to
+ * indicate EOF.
+ *
+ *
+ * NEGOTIATION OF {source} AND {target}
+ *
+ * In this discussion 'originator' describes the original sender of a message
+ * and not the relays in between, while 'sender' describes the last relay.
+ * The two mean the same thing only when the originator IS the last relay.
+ *
+ * The {source} field is sender-localized. The sender assigns this field
+ * based on which connection the message originally came from. The initial
+ * message as sent by the originator sets source=0. This also means that a
+ * leaf connection will always send messages with source=0.
+ *
+ * The {source} field must be re-localized at each hop, since messages
+ * coming from multiple connections to a node will use conflicting
+ * {source} values. This can lead to linkid exhaustion which is discussed
+ * a few paragraphs down.
+ *
+ * The {target} field is sender-allocated. Messages sent to {target} are
+ * preceded by a FORGE message to {target} which associates a registration
+ * with {target}, or UNFORGE to delete the association.
+ *
+ * The msgid field is 32 bits (remember some messages have long-lived
+ * persistent state so this is important!). One-way messages always use
+ * msgid=0.
+ *
+ * LINKID EXHAUSTION
+ *
+ * Because {source} must be re-localized at each hop it is possible to run
+ * out of link identifiers. At the same time we want to allow millions of
+ * client/leaf connections, and 'millions' is a lot bigger than 65535.
+ *
+ * We also have a problem with the persistent message state... If a single
+ * client's vnode cache has a million vnodes that can represent a million
+ * persistent cache states. Multiply by a million clients and ... oops!
+ *
+ * To solve these problems leafs connect into protocol-aggregators rather
+ * than directly to the cluster. The linkid and core message protocols only
+ * occur within the cluster and not by the leafs. A leaf can still connect
+ * to multiple aggregators for redundancy if it desires but may have to
+ * pick and choose which inodes go where since acquiring a cache state lock
+ * over one connection will cause conflicts to be invalidated on the other.
+ * In other words, there are limitations to this approach.
+ *
+ * A protocol aggregator takes any number of connections and aggregates
+ * the operations down to a single linkid. For example, this means that
+ * the protocol aggregator is responsible for maintaining all the cache
+ * state and performing crunches to reduce the overall amount of state
+ * down to something the cluster core can handle.
+ *
+ * --
+ *
+ * All message headers are 32-byte aligned and sized (all command and
+ * response structures must be 32-byte aligned), and all transports must
+ * support message headers up to HAMMER2_MSGHDR_MAX. The msg structure
+ * can handle up to 8160 bytes but to keep things fairly clean we limit
+ * message headers to 2048 bytes.
+ *
+ * Any in-band data is padded to a 32-byte alignment and placed directly
+ * after the extended header (after the higher-level cmd/rep structure).
+ * The actual unaligned size of the in-band data is encoded in the aux_bytes
+ * field in this case. Maximum data sizes are negotiated during registration.
+ *
+ * Use of out-of-band data must be negotiated. In this case bit 31 of
+ * aux_bytes will be set and the remaining bits will contain information
+ * specific to the out-of-band transfer (such as DMA channel, slot, etc).
+ *
+ * (must be 32 bytes exactly to match the alignment requirement and to
+ * support pad records in shared-memory FIFO schemes)
+ */
+struct hammer2_msg_hdr { /* base message header; fields sum to 32 bytes */
+ uint16_t magic; /* sanity, synchronization, endian */
+ uint16_t icrc1; /* base header crc, from salt onward */
+ uint32_t salt; /* random salt helps crypto/replay */
+
+ uint16_t source; /* source linkid */
+ uint16_t target; /* target linkid */
+ uint32_t msgid; /* message id */
+
+ uint32_t cmd; /* flags | proto | cmd | hdr_size / 32 */
+ uint16_t error; /* error field */
+ uint16_t resv05; /* reserved */
+
+ uint16_t icrc2; /* extended header crc (after base) */
+ uint16_t aux_bytes; /* aux data descriptor or size / 32;
+ * NOTE(review): this field is only 16 bits
+ * wide, yet the protocol notes speak of
+ * bit 31 of aux_bytes for out-of-band data;
+ * presumably HAMMER2_MSGF_AUXOOB in cmd
+ * carries that now -- confirm */
+ uint32_t aux_icrc; /* aux data iscsi crc */
+};
+
+typedef struct hammer2_msg_hdr hammer2_msg_hdr_t; /* shorthand for the struct */
+
+#define HAMMER2_MSGHDR_MAGIC 0x4832 /* bytes 0x48,0x32 are ASCII 'H','2' */
+#define HAMMER2_MSGHDR_MAGIC_REV 0x3248 /* MAGIC with its two bytes swapped; presumably seen when the peer has the opposite byte order -- confirm */
+#define HAMMER2_MSGHDR_CRCOFF offsetof(hammer2_msg_hdr_t, salt) /* byte offset of salt; offsetof must come from an include not visible here */
+#define HAMMER2_MSGHDR_CRCBYTES (sizeof(hammer2_msg_hdr_t) - \
+ HAMMER2_MSGHDR_CRCOFF) /* byte count from salt through the end of the base header */
+
+/*
+ * Administrative protocol limits.
+ */
+#define HAMMER2_MSGHDR_MAX 2048 /* admin limit; the 8-bit size field could encode 255*32 = 8192-32 */
+#define HAMMER2_MSGAUX_MAX 65536 /* admin limit; a 16-bit count of 32-byte units could encode 2MB-32 */
+#define HAMMER2_MSGBUF_SIZE (HAMMER2_MSGHDR_MAX * 4) /* 8192 bytes, a power of two */
+#define HAMMER2_MSGBUF_MASK (HAMMER2_MSGBUF_SIZE - 1) /* 0x1FFF; valid as a mask because SIZE is a power of two */
+
+/*
+ * The message (cmd) field also encodes various flags and the total size
+ * of the message header. This allows the protocol processors to validate
+ * persistency and structural settings for every command simply by
+ * switch()ing on the (cmd) field.
+ */
+#define HAMMER2_MSGF_CREATE 0x80000000U /* msg start (flags occupy bits 31-24 of cmd) */
+#define HAMMER2_MSGF_DELETE 0x40000000U /* msg end */
+#define HAMMER2_MSGF_REPLY 0x20000000U /* reply path */
+#define HAMMER2_MSGF_ABORT 0x10000000U /* abort req */
+#define HAMMER2_MSGF_AUXOOB 0x08000000U /* aux-data is OOB */
+#define HAMMER2_MSGF_FLAG2 0x04000000U /* no meaning assigned in this header */
+#define HAMMER2_MSGF_FLAG1 0x02000000U /* no meaning assigned in this header */
+#define HAMMER2_MSGF_FLAG0 0x01000000U /* no meaning assigned in this header */
+
+#define HAMMER2_MSGF_FLAGS 0xFF000000U /* all flags */
+#define HAMMER2_MSGF_PROTOS 0x00F00000U /* all protos (bits 23-20) */
+#define HAMMER2_MSGF_CMDS 0x000FFF00U /* all cmds (bits 19-8) */
+#define HAMMER2_MSGF_SIZE 0x000000FFU /* N*32 (bits 7-0, header size in 32-byte units) */
+
+#define HAMMER2_MSGF_CMDSWMASK (HAMMER2_MSGF_CMDS | \
+ HAMMER2_MSGF_SIZE | \
+ HAMMER2_MSGF_PROTOS | \
+ HAMMER2_MSGF_REPLY) /* every cmd bit except the flags other than REPLY, i.e. the part a switch() keys on */
+
+#define HAMMER2_MSG_PROTO_LNK 0x00000000U /* link layer (protocol id lives in bits 23-20 of cmd) */
+#define HAMMER2_MSG_PROTO_DBG 0x00100000U /* debug layer */
+#define HAMMER2_MSG_PROTO_CAC 0x00200000U /* cache layer */
+#define HAMMER2_MSG_PROTO_QRM 0x00300000U /* quorum layer */
+#define HAMMER2_MSG_PROTO_BLK 0x00400000U /* presumably a block layer; no ops for it are defined in view */
+#define HAMMER2_MSG_PROTO_VOP 0x00500000U /* presumably vnode operations; no ops for it are defined in view */
+
+/*
+ * Message command constructors, sans flags
+ */
+#define HAMMER2_MSG_ALIGN 32 /* alignment unit in bytes */
+#define HAMMER2_MSG_ALIGNMASK (HAMMER2_MSG_ALIGN - 1) /* 0x1F */
+#define HAMMER2_MSG_DOALIGN(bytes) (((bytes) + HAMMER2_MSG_ALIGNMASK) & \
+ ~HAMMER2_MSG_ALIGNMASK) /* rounds bytes up to the next multiple of 32 */
+#define HAMMER2_MSG_HDR_ENCODE(elm) ((sizeof(struct elm) + \
+ HAMMER2_MSG_ALIGNMASK) / \
+ HAMMER2_MSG_ALIGN) /* elm is a struct tag (no 'struct' keyword); yields its size in 32-byte units, rounded up, with type size_t */
+
+#define HAMMER2_MSG_LNK(cmd, elm) (HAMMER2_MSG_PROTO_LNK | \
+ ((cmd) << 8) | \
+ HAMMER2_MSG_HDR_ENCODE(elm)) /* link-layer command word; cmd is not masked, so it must fit in 12 bits */
+
+#define HAMMER2_MSG_DBG(cmd, elm) (HAMMER2_MSG_PROTO_DBG | \
+ ((cmd) << 8) | \
+ HAMMER2_MSG_HDR_ENCODE(elm)) /* debug-layer command word, same layout as HAMMER2_MSG_LNK */
+
+#define HAMMER2_MSG_CAC(cmd, elm) (HAMMER2_MSG_PROTO_CAC | \
+ ((cmd) << 8) | \
+ HAMMER2_MSG_HDR_ENCODE(elm)) /* cache-layer command word, same layout as HAMMER2_MSG_LNK */
+
+#define HAMMER2_MSG_QRM(cmd, elm) (HAMMER2_MSG_PROTO_QRM | \
+ ((cmd) << 8) | \
+ HAMMER2_MSG_HDR_ENCODE(elm)) /* quorum-layer command word, same layout as HAMMER2_MSG_LNK */
+
+#define HAMMER2_MSG_BLK(cmd, elm) (HAMMER2_MSG_PROTO_BLK | \
+ ((cmd) << 8) | \
+ HAMMER2_MSG_HDR_ENCODE(elm)) /* BLK-protocol command word, same layout as HAMMER2_MSG_LNK */
+
+#define HAMMER2_MSG_VOP(cmd, elm) (HAMMER2_MSG_PROTO_VOP | \
+ ((cmd) << 8) | \
+ HAMMER2_MSG_HDR_ENCODE(elm)) /* VOP-protocol command word, same layout as HAMMER2_MSG_LNK */
+
+/*
+ * Link layer ops basically talk to just the other side of a direct
+ * connection.
+ *
+ * PAD - One-way message on link-0, ignored by target. Used to
+ * pad message buffers on shared-memory transports. Not
+ * typically used with TCP.
+ *
+ * AUTHn - Authenticate the connection, negotiate administrative
+ * rights & encryption, protocol class, etc. Only PAD and
+ * AUTH messages (not even PING) are accepted until
+ * authentication is complete. This message also identifies
+ * the host.
+ *
+ * PING - One-way message on link-0, keep-alive, run by both sides
+ * typically 1/sec on idle link, link is lost after 10 seconds
+ * of inactivity.
+ *
+ * HSPAN - One-way message on link-0, host-spanning tree message.
+ * Connection and authentication status is propagated using
+ * these messages on a per-connection basis. Works like SPAN
+ * but is only used for general status. See the hammer2
+ * 'rinfo' command.
+ *
+ * SPAN - One-way message on link-0, spanning tree message adds,
+ * drops, or updates a remote registration. Sent by both
+ * sides, delta changes only. Visibility into remote
+ * registrations may be limited and received registrations
+ * may be filtered depending on administrative controls.
+ *
+ * A multiply-connected node maintains SPAN information on
+ * each link independently and then retransmits an aggregation
+ * of the shortest-weighted path for each registration to
+ * all links when a received change adjusts the path.
+ *
+ * The leaf protocol also uses this to make a PFS available
+ * to the cluster (e.g. on-mount).
+ */
+#define HAMMER2_LNK_PAD HAMMER2_MSG_LNK(0x000, hammer2_msg_hdr) /* bare base header, size field = 1 */
+#define HAMMER2_LNK_PING HAMMER2_MSG_LNK(0x001, hammer2_msg_hdr) /* bare base header, size field = 1 */
+#define HAMMER2_LNK_AUTH HAMMER2_MSG_LNK(0x010, hammer2_lnk_auth) /* NOTE(review): struct hammer2_lnk_auth is not defined in view; it must exist wherever this expands */
+#define HAMMER2_LNK_HSPAN HAMMER2_MSG_LNK(0x011, hammer2_lnk_hspan) /* NOTE(review): struct hammer2_lnk_hspan is not defined in view */
+#define HAMMER2_LNK_SPAN HAMMER2_MSG_LNK(0x012, hammer2_lnk_span) /* NOTE(review): struct hammer2_lnk_span is not defined in view */
+#define HAMMER2_LNK_ERROR HAMMER2_MSG_LNK(0xFFF, hammer2_msg_hdr) /* bare base header; 0xFFF is the top of the 12-bit cmd range */
+
+/*
+ * Debug layer ops operate on any link
+ *
+ * SHELL - Persist stream, access the debug shell on the target
+ * registration. Multiple shells can be operational.
+ */
+#define HAMMER2_DBG_SHELL HAMMER2_MSG_DBG(0x001, hammer2_dbg_shell) /* debug protocol, command 0x001 */
+
+struct hammer2_dbg_shell { /* SHELL message: consists of the base header only */
+ hammer2_msg_hdr_t head; /* no extended header fields follow */
+};
+typedef struct hammer2_dbg_shell hammer2_dbg_shell_t; /* shorthand for the struct */
+
+/*
+ * Cache layer ops operate on any link, link-0 may be used when the
+ * directly connected target is the desired registration.
+ *
+ * LOCK - Persist state, blockable, abortable.
+ *
+ * Obtain cache state (MODIFIED, EXCLUSIVE, SHARED, or INVAL)
+ * in any of four domains (TREE, INUM, ATTR, DIRENT) for a
+ * particular key relative to cache state already owned.
+ *
+ * TREE - Affects the entire sub-tree at the specified element
+ * and will cause existing cache state owned by
+ * other nodes to be adjusted such that the request
+ * can be granted.
+ *
+ * INUM - Only affects inode creation/deletion of an existing
+ * element or a new element, by inumber and/or name.
+ * Typically can be held for very long periods of time
+ * (think the vnode cache), directly relates to
+ * hammer2_chain structures representing inodes.
+ *
+ * ATTR - Only affects an inode's attributes, such as
+ * ownership, modes, etc. Used for lookups, chdir,
+ * open, etc. mtime has no effect.
+ *
+ * DIRENT - Only affects an inode's attributes plus the
+ * attributes or names related to any directory entry
+ * directly under this inode (non-recursively). Can
+ * be retained for medium periods of time when doing
+ * directory scans.
+ *
+ * This function may block and can be aborted. You may be
+ * granted cache state that is more broad than the state you
+ * requested (e.g. a different set of domains and/or an element
+ * at a higher layer in the tree). When quorum operations
+ * are used you may have to reconcile these grants to the
+ * lowest common denominator.
+ *
+ * In order to grant your request either you or the target
+ * (or both) may have to obtain a quorum agreement. Deadlock
+ * resolution may be required. When doing it yourself you
+ * will typically maintain an active message to each master
+ * node in the system. You can only grant the cache state
+ * when a quorum of nodes agree.
+ *
+ * The cache state includes transaction id information which
+ * can be used to resolve data requests.
+ */
+#define HAMMER2_CAC_LOCK HAMMER2_MSG_CAC(0x001, hammer2_cac_lock) /* cache protocol, command 0x001; NOTE(review): struct hammer2_cac_lock is not defined in view */
+
+/*
+ * Quorum layer ops operate on any link, link-0 may be used when the
+ * directly connected target is the desired registration.
+ *
+ * COMMIT - Persist state, blockable, abortable
+ *
+ * Issue a COMMIT in two phases. A quorum must acknowledge
+ * the operation to proceed to phase-2. Message-update to
+ * proceed to phase-2.
+ */
+#define HAMMER2_QRM_COMMIT HAMMER2_MSG_QRM(0x001, hammer2_qrm_commit) /* quorum protocol, command 0x001; NOTE(review): struct hammer2_qrm_commit is not defined in view */
+
+/*
+ * General message errors
+ *
+ * 0x00 - 0x1F Local iocomm errors
+ * 0x20 - 0x2F Global errors
+ */
+#define HAMMER2_MSG_ERR_UNKNOWN 0x20 /* first code of the 0x20-0x2F global range */
+
+union hammer2_any { /* storage large enough for any message header */
+ char buf[HAMMER2_MSGHDR_MAX]; /* sized to the 2048-byte administrative header limit */
+ hammer2_msg_hdr_t head; /* base header view of the start of buf */
+};
+
+typedef union hammer2_any hammer2_any_t; /* shorthand for the union */
+
+#endif