hammer2 - dmsg blockdev work
authorMatthew Dillon <dillon@apollo.backplane.com>
Fri, 26 Oct 2012 02:57:12 +0000 (19:57 -0700)
committerMatthew Dillon <dillon@apollo.backplane.com>
Fri, 26 Oct 2012 02:57:12 +0000 (19:57 -0700)
* Adjust the LNK_CONN and LNK_SPAN messages to provide a cluster label
  and a filesystem label instead of just a filesystem label.

* Change the cluster controller to index clusters based on the
  peer_type, pfs_clid, and cl_label instead of just the pfs_clid.

  HAMMER2 PFS's are primarily identified by pfs_clid and unique nodes
  are identified by pfs_fsid.  fs_label holds the HAMMER2 super-root
  name.

  BLOCK devices are primarily identified by cl_label and unique nodes
  are identified by pfs_fsid.  pfs_clid and fs_label are empty.

* Adjust the cluster controller's matching filters such that, e.g.
  BLOCK device clients only have visibility to the BLOCK device server
  for the cl_label in question, and do not have visibility to other
  BLOCK device clients.

* Flesh out the hammer2 service daemons disk monitoring, have it attach
  a pipe to the in-kernel disk via an ioctl().

* Continued work on the skeleton network disk exporter (kern/subr_disk*) and
  network disk importer (dev/disk/xdisk/*).  xdisk has not been added to the
  build yet.

* Move a few more definitions from HAMMER2 to DMSG.

lib/libdmsg/msg_lnk.c
sbin/hammer2/cmd_debug.c
sys/dev/disk/xdisk/Makefile [new file with mode: 0644]
sys/dev/disk/xdisk/xdisk.c [new file with mode: 0644]
sys/kern/kern_dmsg.c
sys/kern/subr_diskiocom.c
sys/sys/dmsg.h
sys/sys/xdiskioctl.h [new file with mode: 0644]
sys/vfs/hammer2/hammer2_disk.h
sys/vfs/hammer2/hammer2_vfsops.c

index 67cc7a9..5e7eb27 100644 (file)
@@ -239,6 +239,8 @@ struct h2span_cluster {
        RB_ENTRY(h2span_cluster) rbnode;
        struct h2span_node_tree tree;
        uuid_t  pfs_clid;               /* shared fsid */
+       uint8_t peer_type;
+       char    cl_label[128];          /* cluster label (typ PEER_BLOCK) */
        int     refs;                   /* prevents destruction */
 };
 
@@ -246,9 +248,9 @@ struct h2span_node {
        RB_ENTRY(h2span_node) rbnode;
        struct h2span_link_tree tree;
        struct h2span_cluster *cls;
-       uint8_t peer_type;
+       uint8_t pfs_type;
        uuid_t  pfs_fsid;               /* unique fsid */
-       char    label[64];
+       char    fs_label[128];          /* fs label (typ PEER_HAMMER2) */
 };
 
 struct h2span_link {
@@ -296,26 +298,45 @@ typedef struct h2span_node h2span_node_t;
 typedef struct h2span_link h2span_link_t;
 typedef struct h2span_relay h2span_relay_t;
 
+#define dmsg_termstr(array)    _dmsg_termstr((array), sizeof(array))
+
+static __inline
+void
+_dmsg_termstr(char *base, size_t size)
+{
+       base[size-1] = 0;
+}
+
+/*
+ * Cluster peer_type, uuid, AND label must match for a match
+ */
 static
 int
 h2span_cluster_cmp(h2span_cluster_t *cls1, h2span_cluster_t *cls2)
 {
-       return(uuid_compare(&cls1->pfs_clid, &cls2->pfs_clid, NULL));
+       int r;
+
+       if (cls1->peer_type < cls2->peer_type)
+               return(-1);
+       if (cls1->peer_type > cls2->peer_type)
+               return(1);
+       r = uuid_compare(&cls1->pfs_clid, &cls2->pfs_clid, NULL);
+       if (r == 0)
+               r = strcmp(cls1->cl_label, cls2->cl_label);
+
+       return r;
 }
 
+/*
+ * Match against the uuid.  Currently we never match against the label.
+ */
 static
 int
 h2span_node_cmp(h2span_node_t *node1, h2span_node_t *node2)
 {
        int r;
 
-       if (node1->peer_type < node2->peer_type)
-               return(-1);
-       if (node1->peer_type > node2->peer_type)
-               return(1);
        r = uuid_compare(&node1->pfs_fsid, &node2->pfs_fsid, NULL);
-       if (r == 0 && node1->peer_type == DMSG_PEER_BLOCK)
-               r = strcmp(node1->label, node2->label);
        return (r);
 }
 
@@ -473,11 +494,12 @@ dmsg_lnk_conn(dmsg_msg_t *msg)
                 * acknowledge the request, leaving the transaction open.
                 * We then relay priority-selected SPANs.
                 */
-               fprintf(stderr, "LNK_CONN(%08x): %s/%s\n",
+               fprintf(stderr, "LNK_CONN(%08x): %s/%s/%s\n",
                        (uint32_t)msg->any.head.msgid,
                        dmsg_uuid_to_str(&msg->any.lnk_conn.pfs_clid,
                                            &alloc),
-                       msg->any.lnk_conn.label);
+                       msg->any.lnk_conn.cl_label,
+                       msg->any.lnk_conn.fs_label);
                free(alloc);
 
                conn = dmsg_alloc(sizeof(*conn));
@@ -643,16 +665,25 @@ dmsg_lnk_span(dmsg_msg_t *msg)
                assert(state->func == NULL);
                state->func = dmsg_lnk_span;
 
-               msg->any.lnk_span.label[sizeof(msg->any.lnk_span.label)-1] = 0;
+               dmsg_termstr(msg->any.lnk_span.cl_label);
+               dmsg_termstr(msg->any.lnk_span.fs_label);
 
                /*
                 * Find the cluster
                 */
                dummy_cls.pfs_clid = msg->any.lnk_span.pfs_clid;
+               dummy_cls.peer_type = msg->any.lnk_span.peer_type;
+               bcopy(msg->any.lnk_span.cl_label,
+                     dummy_cls.cl_label,
+                     sizeof(dummy_cls.cl_label));
                cls = RB_FIND(h2span_cluster_tree, &cluster_tree, &dummy_cls);
                if (cls == NULL) {
                        cls = dmsg_alloc(sizeof(*cls));
                        cls->pfs_clid = msg->any.lnk_span.pfs_clid;
+                       cls->peer_type = msg->any.lnk_span.peer_type;
+                       bcopy(msg->any.lnk_span.cl_label,
+                             cls->cl_label,
+                             sizeof(cls->cl_label));
                        RB_INIT(&cls->tree);
                        RB_INSERT(h2span_cluster_tree, &cluster_tree, cls);
                }
@@ -661,16 +692,13 @@ dmsg_lnk_span(dmsg_msg_t *msg)
                 * Find the node
                 */
                dummy_node.pfs_fsid = msg->any.lnk_span.pfs_fsid;
-               dummy_node.peer_type = msg->any.lnk_span.peer_type;
-               snprintf(dummy_node.label, sizeof(dummy_node.label),
-                        "%s", msg->any.lnk_span.label);
                node = RB_FIND(h2span_node_tree, &cls->tree, &dummy_node);
                if (node == NULL) {
                        node = dmsg_alloc(sizeof(*node));
                        node->pfs_fsid = msg->any.lnk_span.pfs_fsid;
-                       node->peer_type = msg->any.lnk_span.peer_type;
-                       snprintf(node->label, sizeof(node->label),
-                                "%s", msg->any.lnk_span.label);
+                       bcopy(msg->any.lnk_span.fs_label,
+                             node->fs_label,
+                             sizeof(node->fs_label));
                        node->cls = cls;
                        RB_INIT(&node->tree);
                        RB_INSERT(h2span_node_tree, &cls->tree, node);
@@ -702,12 +730,13 @@ dmsg_lnk_span(dmsg_msg_t *msg)
 
                RB_INSERT(h2span_link_tree, &node->tree, slink);
 
-               fprintf(stderr, "LNK_SPAN(thr %p): %p %s/%s dist=%d\n",
+               fprintf(stderr,
+                       "LNK_SPAN(thr %p): %p %s cl=%s fs=%s dist=%d\n",
                        msg->router->iocom,
                        slink,
-                       dmsg_uuid_to_str(&msg->any.lnk_span.pfs_clid,
-                                           &alloc),
-                       msg->any.lnk_span.label,
+                       dmsg_uuid_to_str(&msg->any.lnk_span.pfs_clid, &alloc),
+                       msg->any.lnk_span.cl_label,
+                       msg->any.lnk_span.fs_label,
                        msg->any.lnk_span.dist);
                free(alloc);
 #if 0
@@ -725,11 +754,12 @@ dmsg_lnk_span(dmsg_msg_t *msg)
                node = slink->node;
                cls = node->cls;
 
-               fprintf(stderr, "LNK_DELE(thr %p): %p %s/%s dist=%d\n",
+               fprintf(stderr, "LNK_DELE(thr %p): %p %s cl=%s fs=%s dist=%d\n",
                        msg->router->iocom,
                        slink,
                        dmsg_uuid_to_str(&cls->pfs_clid, &alloc),
-                       state->msg->any.lnk_span.label,
+                       state->msg->any.lnk_span.cl_label,
+                       state->msg->any.lnk_span.fs_label,
                        state->msg->any.lnk_span.dist);
                free(alloc);
 
@@ -905,9 +935,9 @@ dmsg_relay_scan_specific(h2span_node_t *node, h2span_conn_t *conn)
        h2span_relay_t *next_relay;
        h2span_link_t *slink;
        dmsg_lnk_conn_t *lconn;
+       dmsg_lnk_span_t *lspan;
        dmsg_msg_t *msg;
        int count = 2;
-       uint8_t peer_type;
 
        info.node = node;
        info.relay = NULL;
@@ -975,27 +1005,48 @@ dmsg_relay_scan_specific(h2span_node_t *node, h2span_conn_t *conn)
                 * Don't bother transmitting if the remote connection
                 * is not accepting this SPAN's peer_type.
                 */
-               peer_type = slink->state->msg->any.lnk_span.peer_type;
+               lspan = &slink->state->msg->any.lnk_span;
                lconn = &conn->state->msg->any.lnk_conn;
-               if (((1LLU << peer_type) & lconn->peer_mask) == 0)
+               if (((1LLU << lspan->peer_type) & lconn->peer_mask) == 0)
                        break;
 
                /*
-                * Filter based on pfs_clid or label (XXX).  This typically
-                * reduces the amount of SPAN traffic that a mount end-point
-                * sees by only passing along SPANs related to the cluster id
-                * (that is, it will see all PFS's associated with the
-                * particular cluster it represents).
+                * Do not give pure clients visibility to other pure clients
                 */
-               if (peer_type == lconn->peer_type &&
-                   peer_type == DMSG_PEER_HAMMER2) {
-                       if (!uuid_is_nil(&slink->node->cls->pfs_clid, NULL) &&
-                           uuid_compare(&slink->node->cls->pfs_clid,
-                                        &lconn->pfs_clid, NULL) != 0) {
-                               break;
-                       }
+               if (lconn->pfs_type == DMSG_PFSTYPE_CLIENT &&
+                   lspan->pfs_type == DMSG_PFSTYPE_CLIENT) {
+                       break;
+               }
+
+               /*
+                * Connection filter, if cluster uuid is not NULL it must
+                * match the span cluster uuid.  Only applies when the
+                * peer_type matches.
+                */
+               if (lspan->peer_type == lconn->peer_type &&
+                   !uuid_is_nil(&lconn->pfs_clid, NULL) &&
+                   uuid_compare(&slink->node->cls->pfs_clid,
+                                &lconn->pfs_clid, NULL)) {
+                       break;
                }
 
+               /*
+                * Connection filter, if cluster label is not empty it must
+                * match the span cluster label.  Only applies when the
+                * peer_type matches.
+                */
+               if (lspan->peer_type == lconn->peer_type &&
+                   lconn->cl_label[0] &&
+                   strcmp(lconn->cl_label, slink->node->cls->cl_label)) {
+                       break;
+               }
+
+               /*
+                * NOTE! fs_uuid differentiates nodes within the same cluster
+                *       so we obviously don't want to match those.  Similarly
+                *       for fs_label.
+                */
+
                /*
                 * Ok, we've accepted this SPAN for relaying.
                 */
@@ -1267,12 +1318,13 @@ dmsg_shell_tree(dmsg_router_t *router, char *cmdbuf __unused)
 
        pthread_mutex_lock(&cluster_mtx);
        RB_FOREACH(cls, h2span_cluster_tree, &cluster_tree) {
-               dmsg_router_printf(router, "Cluster %s\n",
-                                  dmsg_uuid_to_str(&cls->pfs_clid, &uustr));
+               dmsg_router_printf(router, "Cluster %s (%s)\n",
+                                  dmsg_uuid_to_str(&cls->pfs_clid, &uustr),
+                                  cls->cl_label);
                RB_FOREACH(node, h2span_node_tree, &cls->tree) {
                        dmsg_router_printf(router, "    Node %s (%s)\n",
                                dmsg_uuid_to_str(&node->pfs_fsid, &uustr),
-                               node->label);
+                               node->fs_label);
                        RB_FOREACH(slink, h2span_link_tree, &node->tree) {
                                dmsg_router_printf(router,
                                            "\tLink dist=%d via %d\n",
index aa92627..1d9f3ef 100644 (file)
@@ -195,6 +195,7 @@ hammer2_shell_parse(dmsg_msg_t *msg)
 static void
 shell_span(dmsg_router_t *router, char *cmdbuf)
 {
+       dmsg_master_service_info_t *info;
        const char *hostname = strsep(&cmdbuf, " \t");
        pthread_t thread;
        int fd;
@@ -215,8 +216,14 @@ shell_span(dmsg_router_t *router, char *cmdbuf)
                dmsg_router_printf(router, "Connection to %s failed\n", hostname);
        } else {
                dmsg_router_printf(router, "Connected to %s\n", hostname);
-               pthread_create(&thread, NULL,
-                              dmsg_master_service, (void *)(intptr_t)fd);
+
+               info = malloc(sizeof(*info));
+               bzero(info, sizeof(*info));
+               info->fd = fd;
+               info->detachme = 1;
+               info->dbgmsg_callback = hammer2_shell_parse;
+
+               pthread_create(&thread, NULL, dmsg_master_service, info);
                /*pthread_join(thread, &res);*/
        }
 }
diff --git a/sys/dev/disk/xdisk/Makefile b/sys/dev/disk/xdisk/Makefile
new file mode 100644 (file)
index 0000000..55cfb16
--- /dev/null
@@ -0,0 +1,6 @@
+# xdisk - Create network-controlled disk
+#
+KMOD=  xdisk
+SRCS=  xdisk.c
+
+.include <bsd.kmod.mk>
diff --git a/sys/dev/disk/xdisk/xdisk.c b/sys/dev/disk/xdisk/xdisk.c
new file mode 100644 (file)
index 0000000..291609d
--- /dev/null
@@ -0,0 +1,472 @@
+/*
+ * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * This module allows disk devices to be created and associated with a
+ * communications pipe or socket.  You open the device and issue an
+ * ioctl() to install a new disk along with its communications descriptor.
+ *
+ * All further communication occurs via the descriptor using the DMSG
+ * LNK_CONN, LNK_SPAN, and BLOCK protocols.  The descriptor can be a
+ * direct connection to a remote machine's disk (in-kernenl), to a remote
+ * cluster controller, to the local cluster controller, etc.
+ *
+ * /dev/xdisk is the control device, issue ioctl()s to create the /dev/xa%d
+ * devices.  These devices look like raw disks to the system.
+ */
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/device.h>
+#include <sys/devicestat.h>
+#include <sys/disk.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/udev.h>
+#include <sys/uuid.h>
+#include <sys/kern_syscall.h>
+
+#include <sys/dmsg.h>
+#include <sys/xdiskioctl.h>
+
+#include <sys/buf2.h>
+#include <sys/thread2.h>
+
+static int xdisk_attach(struct xdisk_attach_ioctl *xaioc);
+static void xa_exit(kdmsg_iocom_t *iocom);
+static int xa_msg_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
+static int xa_msg_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
+static int xa_lnk_rcvmsg(kdmsg_msg_t *msg);
+static int xa_lnk_dbgmsg(kdmsg_msg_t *msg);
+static int xa_adhoc_input(kdmsg_msg_t *msg);
+
+MALLOC_DEFINE(M_XDISK, "Networked disk client", "Network Disks");
+
+/*
+ * Control device, issue ioctls to create xa devices.
+ */
+static d_open_t xdisk_open;
+static d_close_t xdisk_close;
+static d_ioctl_t xdisk_ioctl;
+
+static struct dev_ops xdisk_ops = {
+       { "xdisk", 0, D_MPSAFE },
+        .d_open =      xdisk_open,
+        .d_close =     xdisk_close,
+        .d_ioctl =     xdisk_ioctl
+};
+
+/*
+ * XA disk devices
+ */
+static d_open_t xa_open;
+static d_close_t xa_close;
+static d_ioctl_t xa_ioctl;
+static d_strategy_t xa_strategy;
+static d_psize_t xa_size;
+
+static struct dev_ops xa_ops = {
+       { "xa", 0, D_DISK | D_CANFREE | D_MPSAFE },
+        .d_open =      xa_open,
+        .d_close =     xa_close,
+        .d_ioctl =     xa_ioctl,
+        .d_read =      physread,
+        .d_write =     physwrite,
+        .d_strategy =  xa_strategy,
+       .d_psize =      xa_size
+};
+
+struct xa_softc {
+       TAILQ_ENTRY(xa_softc) entry;
+       cdev_t          dev;
+       kdmsg_iocom_t   iocom;
+       struct xdisk_attach_ioctl xaioc;
+       struct disk_info info;
+       struct disk     disk;
+       uuid_t          pfs_fsid;
+       int             unit;
+       int             inprog;
+       int             connected;
+};
+
+static struct lwkt_token xdisk_token = LWKT_TOKEN_INITIALIZER(xdisk_token);
+static int xdisk_opencount;
+static cdev_t xdisk_dev;
+static TAILQ_HEAD(, xa_softc) xa_queue;
+
+/*
+ * Module initialization
+ */
+static int
+xdisk_modevent(module_t mod, int type, void *data)
+{
+       switch (type) {
+       case MOD_LOAD:
+               TAILQ_INIT(&xa_queue);
+               xdisk_dev = make_dev(&xdisk_ops, 0,
+                                    UID_ROOT, GID_WHEEL, 0600, "xdisk");
+               break;
+       case MOD_UNLOAD:
+       case MOD_SHUTDOWN:
+               if (xdisk_opencount || TAILQ_FIRST(&xa_queue))
+                       return (EBUSY);
+               if (xdisk_dev) {
+                       destroy_dev(xdisk_dev);
+                       xdisk_dev = NULL;
+               }
+               dev_ops_remove_all(&xdisk_ops);
+               dev_ops_remove_all(&xa_ops);
+               break;
+       default:
+               break;
+       }
+       return 0;
+}
+
+DEV_MODULE(xdisk, xdisk_modevent, 0);
+
+/*
+ * Control device
+ */
+static int
+xdisk_open(struct dev_open_args *ap)
+{
+       lwkt_gettoken(&xdisk_token);
+       ++xdisk_opencount;
+       lwkt_reltoken(&xdisk_token);
+       return(0);
+}
+
+static int
+xdisk_close(struct dev_close_args *ap)
+{
+       lwkt_gettoken(&xdisk_token);
+       --xdisk_opencount;
+       lwkt_reltoken(&xdisk_token);
+       return(0);
+}
+
+static int
+xdisk_ioctl(struct dev_ioctl_args *ap)
+{
+       int error;
+
+       switch(ap->a_cmd) {
+       case XDISKIOCATTACH:
+               error = xdisk_attach((void *)ap->a_data);
+               break;
+       default:
+               error = ENOTTY;
+               break;
+       }
+       return error;
+}
+
+/************************************************************************
+ *                             DMSG INTERFACE                          *
+ ************************************************************************/
+
+static int
+xdisk_attach(struct xdisk_attach_ioctl *xaioc)
+{
+       struct xa_softc *scan;
+       struct xa_softc *xa;
+       struct file *fp;
+       kdmsg_msg_t *msg;
+       int unit;
+       char devname[64];
+       cdev_t dev;
+
+       fp = holdfp(curproc->p_fd, xaioc->fd, -1);
+       if (fp == NULL)
+               return EINVAL;
+
+       xa = kmalloc(sizeof(*xa), M_XDISK, M_WAITOK|M_ZERO);
+
+       /*
+        * Find unit
+        */
+       lwkt_gettoken(&xdisk_token);
+       unit = 0;
+       do {
+               TAILQ_FOREACH(scan, &xa_queue, entry) {
+                       if (scan->unit == unit)
+                               break;
+               }
+       } while (scan != NULL);
+       xa->unit = unit;
+       xa->xaioc = *xaioc;
+       TAILQ_INSERT_TAIL(&xa_queue, xa, entry);
+       lwkt_reltoken(&xdisk_token);
+
+       /*
+        * Create device
+        */
+       dev = disk_create(unit, &xa->disk, &xa_ops);
+       dev->si_drv1 = xa;
+       xa->dev = dev;
+
+       xa->info.d_media_blksize = 512;
+       xa->info.d_media_blocks = xaioc->size / 512;
+       xa->info.d_dsflags = DSO_MBRQUIET | DSO_RAWPSIZE;
+       xa->info.d_secpertrack = 32;
+       xa->info.d_nheads = 64;
+       xa->info.d_secpercyl = xa->info.d_secpertrack * xa->info.d_nheads;
+       xa->info.d_ncylinders = 0;
+       disk_setdiskinfo_sync(&xa->disk, &xa->info);
+
+       /*
+        * Set up messaging connection
+        */
+       ksnprintf(devname, sizeof(devname), "xa%d", unit);
+       kdmsg_iocom_init(&xa->iocom, xa, M_XDISK,
+                        xa_lnk_rcvmsg,
+                        xa_lnk_dbgmsg,
+                        xa_adhoc_input);
+       xa->iocom.exit_func = xa_exit;
+       xa->inprog = 1;
+       kern_uuidgen(&xa->pfs_fsid, 1);
+       kdmsg_iocom_reconnect(&xa->iocom, fp, devname);
+
+       /*
+        * Issue DMSG_LNK_CONN for device.  This sets up filters so hopefully
+        * the only SPANs we receive are from servers providing the label
+        * being configured.  Hopefully that's just a single server(!)(!).
+        * (HAMMER peers might have multiple servers but block device peers
+        * currently only allow one).  There could still be multiple spans
+        * due to there being multiple paths available, however.
+        */
+
+       msg = kdmsg_msg_alloc(&xa->iocom.router, DMSG_LNK_CONN | DMSGF_CREATE,
+                             xa_msg_conn_reply, xa);
+       msg->any.lnk_conn.pfs_type = 0;
+       msg->any.lnk_conn.proto_version = DMSG_SPAN_PROTO_1;
+       msg->any.lnk_conn.peer_type = DMSG_PEER_BLOCK;
+       msg->any.lnk_conn.peer_mask = 1LLU << DMSG_PEER_BLOCK;
+       ksnprintf(msg->any.lnk_conn.cl_label,
+                 sizeof(msg->any.lnk_conn.cl_label),
+                 "%s", xaioc->cl_label);
+       msg->any.lnk_conn.pfs_fsid = xa->pfs_fsid;
+       xa->iocom.conn_state = msg->state;
+       kdmsg_msg_write(msg);
+
+       xa->inprog = 0;         /* unstall msg thread exit (if racing) */
+
+       return(0);
+}
+
+/*
+ * Handle reply to our LNK_CONN transaction (transaction remains open)
+ */
+static
+int
+xa_msg_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
+{
+       struct xa_softc *xa = state->any.any;
+        kdmsg_msg_t *rmsg;
+
+       if (msg->any.head.cmd & DMSGF_CREATE) {
+               kprintf("XA LNK_CONN received reply\n");
+               rmsg = kdmsg_msg_alloc(&xa->iocom.router,
+                                      DMSG_LNK_SPAN | DMSGF_CREATE,
+                                      xa_msg_span_reply, xa);
+               rmsg->any.lnk_span.pfs_type = 0;
+               rmsg->any.lnk_span.proto_version = DMSG_SPAN_PROTO_1;
+               rmsg->any.lnk_span.peer_type = DMSG_PEER_BLOCK;
+
+               ksnprintf(rmsg->any.lnk_span.cl_label,
+                         sizeof(rmsg->any.lnk_span.cl_label),
+                         "%s", xa->xaioc.cl_label);
+               kdmsg_msg_write(rmsg);
+       }
+       if ((state->txcmd & DMSGF_DELETE) == 0 &&
+           (msg->any.head.cmd & DMSGF_DELETE)) {
+               kprintf("DISK LNK_CONN terminated by remote\n");
+               xa->iocom.conn_state = NULL;
+               kdmsg_msg_reply(msg, 0);
+       }
+       return(0);
+}
+
+static int
+xa_msg_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
+{
+       kprintf("SPAN REPLY - Our sent span was terminated by the "
+               "remote %08x state %p\n", msg->any.head.cmd, state);
+       if ((state->txcmd & DMSGF_DELETE) == 0 &&
+           (msg->any.head.cmd & DMSGF_DELETE)) {
+               kdmsg_msg_reply(msg, 0);
+       }
+       return (0);
+}
+
+/*
+ * Called from iocom core transmit thread upon disconnect.
+ */
+static
+void
+xa_exit(kdmsg_iocom_t *iocom)
+{
+       struct xa_softc *xa = iocom->handle;
+
+       kprintf("XA_EXIT UNIT %d\n", xa->unit);
+
+       kdmsg_iocom_uninit(iocom);
+
+       while (xa->inprog) {
+               tsleep(xa, 0, "xarace", hz);
+       }
+
+       /*
+        * XXX allow reconnection, wait for users to terminate?
+        */
+
+       disk_destroy(&xa->disk);
+
+       lwkt_gettoken(&xdisk_token);
+       TAILQ_REMOVE(&xa_queue, xa, entry);
+       lwkt_reltoken(&xdisk_token);
+
+       kfree(xa, M_XDISK);
+}
+
+static int
+xa_lnk_rcvmsg(kdmsg_msg_t *msg)
+{
+       switch(msg->any.head.cmd & DMSGF_TRANSMASK) {
+       case DMSG_LNK_CONN | DMSGF_CREATE:
+               /*
+                * reply & leave trans open
+                */
+               kprintf("XA CONN RECEIVE - (just ignore it)\n");
+               kdmsg_msg_result(msg, 0);
+               break;
+       case DMSG_LNK_SPAN | DMSGF_CREATE:
+               kprintf("XA SPAN RECEIVE - ADDED FROM CLUSTER\n");
+               break;
+       case DMSG_LNK_SPAN | DMSGF_DELETE:
+               kprintf("SPAN RECEIVE - DELETED FROM CLUSTER\n");
+               break;
+       default:
+               break;
+       }
+       return(0);
+}
+
+static int
+xa_lnk_dbgmsg(kdmsg_msg_t *msg)
+{
+       switch(msg->any.head.cmd & DMSGF_CMDSWMASK) {
+       case DMSG_DBG_SHELL:
+               /*
+                * Execute shell command (not supported atm)
+                */
+               kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
+               break;
+       case DMSG_DBG_SHELL | DMSGF_REPLY:
+               if (msg->aux_data) {
+                       msg->aux_data[msg->aux_size - 1] = 0;
+                       kprintf("DEBUGMSG: %s\n", msg->aux_data);
+               }
+               break;
+       default:
+               kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
+               break;
+       }
+       return(0);
+}
+
+static int
+xa_adhoc_input(kdmsg_msg_t *msg)
+{
+        kprintf("XA ADHOC INPUT MSG %08x\n", msg->any.head.cmd);
+        return(0);
+}
+
+/************************************************************************
+ *                        XA DEVICE INTERFACE                          *
+ ************************************************************************/
+
+static int
+xa_open(struct dev_open_args *ap)
+{
+       cdev_t dev = ap->a_head.a_dev;
+       struct xa_softc *xa;
+
+       xa = dev->si_drv1;
+
+       dev->si_bsize_phys = 512;
+       dev->si_bsize_best = 32768;
+
+       /*
+        * Issue streaming open and wait for reply.
+        */
+
+       /* XXX check ap->a_oflags & FWRITE, EACCES if read-only */
+
+       return(0);
+}
+
+static int
+xa_close(struct dev_close_args *ap)
+{
+       cdev_t dev = ap->a_head.a_dev;
+}
+
+static int
+xa_strategy(struct dev_strategy_args *ap)
+{
+}
+
+static int
+xa_ioctl(struct dev_ioctl_args *ap)
+{
+       return (ENOTTY);
+}
+
+static int
+xa_size(struct dev_psize_args *ap)
+{
+       struct xa_softc *xa;
+
+       if ((xa = ap->a_head.a_dev->si_drv1) == NULL)
+               return (ENXIO);
+       if (xa->inprog)
+               return (ENXIO);
+       ap->a_result = xa->info.d_media_blocks;
+       return (0);
+}
index a6572f8..76fec4b 100644 (file)
@@ -507,12 +507,20 @@ cleanupwr:
        KKASSERT(RB_EMPTY(&iocom->statewr_tree));
        KKASSERT(iocom->conn_state == NULL);
 
-       /*
-        * iocom can be ripped out from under us once msgwr_td is set to NULL.
-        * The wakeup is safe.
-        */
-       iocom->msgwr_td = NULL;
-       wakeup(iocom);
+       if (iocom->exit_func) {
+               /*
+                * iocom is invalid after we call the exit function.
+                */
+               iocom->msgwr_td = NULL;
+               iocom->exit_func(iocom);
+       } else {
+               /*
+                * iocom can be ripped out from under us once msgwr_td is
+                * set to NULL.  The wakeup is safe.
+                */
+               iocom->msgwr_td = NULL;
+               wakeup(iocom);
+       }
        lwkt_exit();
 }
 
index 2c6bfff..0300aeb 100644 (file)
@@ -128,7 +128,8 @@ disk_iocom_reconnect(struct disk *dp, struct file *fp)
        msg->any.lnk_conn.peer_type = DMSG_PEER_BLOCK;
        msg->any.lnk_conn.peer_mask = 1LLU << DMSG_PEER_BLOCK;
 
-       ksnprintf(msg->any.lnk_conn.label, sizeof(msg->any.lnk_conn.label),
+       ksnprintf(msg->any.lnk_conn.cl_label,
+                 sizeof(msg->any.lnk_conn.cl_label),
                  "%s/%s", hostname, devname);
        dp->d_iocom.conn_state = msg->state;
        kdmsg_msg_write(msg);
@@ -156,8 +157,8 @@ disk_msg_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
                rmsg->any.lnk_span.proto_version = DMSG_SPAN_PROTO_1;
                rmsg->any.lnk_span.peer_type = DMSG_PEER_BLOCK;
 
-               ksnprintf(rmsg->any.lnk_span.label,
-                         sizeof(rmsg->any.lnk_span.label),
+               ksnprintf(rmsg->any.lnk_span.cl_label,
+                         sizeof(rmsg->any.lnk_span.cl_label),
                          "%s/%s%d",
                          hostname,
                          dev_dname(dp->d_rawdev),
index d0deba1..cdba875 100644 (file)
@@ -415,11 +415,23 @@ struct dmsg_lnk_conn {
        uint8_t         reserved02[8];
        int32_t         dist;           /* span distance */
        uint32_t        reserved03[14];
-       char            label[256];     /* PFS label (can be wildcard) */
+       char            cl_label[128];  /* cluster label (for PEER_BLOCK) */
+       char            fs_label[128];  /* PFS label (for PEER_HAMMER2) */
 };
 
 typedef struct dmsg_lnk_conn dmsg_lnk_conn_t;
 
+#define DMSG_PFSTYPE_NONE      0
+#define DMSG_PFSTYPE_ADMIN     1
+#define DMSG_PFSTYPE_CLIENT    2
+#define DMSG_PFSTYPE_CACHE     3
+#define DMSG_PFSTYPE_COPY      4
+#define DMSG_PFSTYPE_SLAVE     5
+#define DMSG_PFSTYPE_SOFT_SLAVE        6
+#define DMSG_PFSTYPE_SOFT_MASTER 7
+#define DMSG_PFSTYPE_MASTER    8
+#define DMSG_PFSTYPE_MAX       9       /* 0-8 */
+
 #define DMSG_PEER_NONE         0
 #define DMSG_PEER_CLUSTER      1       /* a cluster controller */
 #define DMSG_PEER_BLOCK                2       /* block devices */
@@ -478,7 +490,8 @@ struct dmsg_lnk_span {
        uint8_t         reserved02[8];
        int32_t         dist;           /* span distance */
        uint32_t        reserved03[15];
-       char            label[256];     /* PFS label (can be wildcard) */
+       char            cl_label[128];  /* cluster label (for PEER_BLOCK) */
+       char            fs_label[128];  /* PFS label (for PEER_HAMMER2) */
 };
 
 typedef struct dmsg_lnk_span dmsg_lnk_span_t;
@@ -749,6 +762,7 @@ struct kdmsg_iocom {
        int                     (*lnk_rcvmsg)(kdmsg_msg_t *msg);
        int                     (*dbg_rcvmsg)(kdmsg_msg_t *msg);
        int                     (*misc_rcvmsg)(kdmsg_msg_t *msg);
+       void                    (*exit_func)(struct kdmsg_iocom *);
        struct kdmsg_state      *conn_state;    /* active LNK_CONN state */
        struct kdmsg_state      *freerd_state;  /* allocation cache */
        struct kdmsg_state      *freewr_state;  /* allocation cache */
diff --git a/sys/sys/xdiskioctl.h b/sys/sys/xdiskioctl.h
new file mode 100644 (file)
index 0000000..d495175
--- /dev/null
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _SYS_XDISKIOCTL_H_
+#define _SYS_XDISKIOCTL_H_
+
+#ifndef _SYS_IOCCOM_H_
+#include <sys/ioccom.h>
+#endif
+#ifndef _SYS_PARAM_H_
+#include <sys/param.h>         /* PATH_MAX */
+#endif
+
+/*
+ * ioctl definitions
+ */
+struct xdisk_attach_ioctl {
+       int     fd;
+       int     unit;
+       int64_t size;           /* size of disk in bytes */
+       char    cl_label[64];   /* LNK_SPAN cl_label match */
+       char    serno[64];      /* serial number */
+};
+
+#define XDISKIOCATTACH _IOWR('X', 0, struct xdisk_attach_ioctl)
+
+#endif
index 6e8e049..64200e0 100644 (file)
@@ -524,16 +524,16 @@ typedef struct hammer2_inode_data hammer2_inode_data_t;
 /*
  * PFS types identify a PFS on media and in LNK_SPAN messages.
  */
-#define HAMMER2_PFSTYPE_NONE           0
-#define HAMMER2_PFSTYPE_ADMIN          1
-#define HAMMER2_PFSTYPE_CLIENT         2
-#define HAMMER2_PFSTYPE_CACHE          3
-#define HAMMER2_PFSTYPE_COPY           4
-#define HAMMER2_PFSTYPE_SLAVE          5
-#define HAMMER2_PFSTYPE_SOFT_SLAVE     6
-#define HAMMER2_PFSTYPE_SOFT_MASTER    7
-#define HAMMER2_PFSTYPE_MASTER         8
-#define HAMMER2_PFSTYPE_MAX            9       /* 0-8 */
+#define HAMMER2_PFSTYPE_NONE           DMSG_PFSTYPE_NONE
+#define HAMMER2_PFSTYPE_ADMIN          DMSG_PFSTYPE_ADMIN
+#define HAMMER2_PFSTYPE_CLIENT         DMSG_PFSTYPE_CLIENT
+#define HAMMER2_PFSTYPE_CACHE          DMSG_PFSTYPE_CACHE
+#define HAMMER2_PFSTYPE_COPY           DMSG_PFSTYPE_COPY
+#define HAMMER2_PFSTYPE_SLAVE          DMSG_PFSTYPE_SLAVE
+#define HAMMER2_PFSTYPE_SOFT_SLAVE     DMSG_PFSTYPE_SOFT_SLAVE
+#define HAMMER2_PFSTYPE_SOFT_MASTER    DMSG_PFSTYPE_SOFT_MASTER
+#define HAMMER2_PFSTYPE_MASTER         DMSG_PFSTYPE_MASTER
+#define HAMMER2_PFSTYPE_MAX            DMSG_PFSTYPE_MAX
 
 /*
  * The allocref structure represents the allocation table.  One 64K block
index b8c6bdd..eb786a4 100644 (file)
@@ -1026,11 +1026,13 @@ hammer2_cluster_reconnect(hammer2_pfsmount_t *pmp, struct file *fp)
        msg->any.lnk_conn.peer_type = pmp->hmp->voldata.peer_type;
        msg->any.lnk_conn.peer_mask = 1LLU << HAMMER2_PEER_HAMMER2;
        name_len = pmp->iroot->ip_data.name_len;
-       if (name_len >= sizeof(msg->any.lnk_conn.label))
-               name_len = sizeof(msg->any.lnk_conn.label) - 1;
-       bcopy(pmp->iroot->ip_data.filename, msg->any.lnk_conn.label, name_len);
+       if (name_len >= sizeof(msg->any.lnk_conn.fs_label))
+               name_len = sizeof(msg->any.lnk_conn.fs_label) - 1;
+       bcopy(pmp->iroot->ip_data.filename,
+             msg->any.lnk_conn.fs_label,
+             name_len);
        pmp->iocom.conn_state = msg->state;
-       msg->any.lnk_conn.label[name_len] = 0;
+       msg->any.lnk_conn.fs_label[name_len] = 0;
        kdmsg_msg_write(msg);
 }
 
@@ -1086,12 +1088,12 @@ hammer2_msg_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
                rmsg->any.lnk_span.peer_type = pmp->hmp->voldata.peer_type;
                rmsg->any.lnk_span.proto_version = DMSG_SPAN_PROTO_1;
                name_len = pmp->iroot->ip_data.name_len;
-               if (name_len >= sizeof(rmsg->any.lnk_span.label))
-                       name_len = sizeof(rmsg->any.lnk_span.label) - 1;
+               if (name_len >= sizeof(rmsg->any.lnk_span.fs_label))
+                       name_len = sizeof(rmsg->any.lnk_span.fs_label) - 1;
                bcopy(pmp->iroot->ip_data.filename,
-                     rmsg->any.lnk_span.label,
+                     rmsg->any.lnk_span.fs_label,
                      name_len);
-               rmsg->any.lnk_span.label[name_len] = 0;
+               rmsg->any.lnk_span.fs_label[name_len] = 0;
                kdmsg_msg_write(rmsg);
 
                /*