hammer2 - Wire-up the kernel<->userland messaging pipe
author Matthew Dillon <dillon@apollo.backplane.com>
Sat, 9 Jun 2012 05:03:48 +0000 (22:03 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Sat, 9 Jun 2012 05:03:48 +0000 (22:03 -0700)
This commit starts coding up the cluster controller messaging
infrastructure.  The cluster controller is a userland program that
typically runs on the same machine (but doesn't have to).

The controller will be able to act in several capacities ranging
from simple remote mounts with no local storage to mirroring setups,
master/slave setups, and ultimately quorum setups.  Since communication
is over a socket it will eventually be possible to implement a
diskless hammer2 root mount without the need for a local controller.

The VFS only talks over one socket, so in quorum or multi-connection
setups the local cluster controller will deal with the complexity of
managing multiple connections and the hammer2 VFS messaging interface
remains simple.

The hammer2 VFS will also use this interface to request cache state
grants and, being a two-way protocol, the other end can request cache
state invalidations or downgrades.

* mount_hammer2 now starts 'hammer2 service', connects to it via a socket,
  and passes the socket descriptor to mount().

* The hammer2 VFS now refs the passed-in file pointer and starts a reader
  and writer thread to manage it.  The code does not yet process actual
  messages.

sbin/mount_hammer2/mount_hammer2.c
sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_mount.h
sys/vfs/hammer2/hammer2_vfsops.c

index 7a2688a..bf2ece2 100644 (file)
  */
 #include <sys/types.h>
 #include <sys/mount.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
 #include <vfs/hammer2/hammer2_mount.h>
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
+#include <unistd.h>
+
+static int cluster_connect(const char *volume);
 
 /*
  * Usage: mount_hammer2 [volume] [mtpt]
@@ -64,11 +69,81 @@ main(int argc, char *argv[])
                exit(1);
        }
 
+       /*
+        * Connect to the cluster controller.  This handles both remote
+        * mounts and device cache/master/slave mounts.
+        *
+        * When doing remote mounts that are allowed to run in the background
+        * the mount program will fork, detach, print a message, and exit(0)
+        * the originator while retrying in the background.
+        */
+       info.cluster_fd = cluster_connect(argv[1]);
+       if (info.cluster_fd < 0) {
+               fprintf(stderr,
+                       "hammer2_mount: cluster_connect(%s) failed\n",
+                       argv[1]);
+               exit(1);
+       }
+
+       /*
+        * Try to mount it
+        */
        info.volume = argv[1];
        info.hflags = 0;
        mountpt = argv[2];
 
        error = mount(vfc.vfc_name, mountpt, mount_flags, &info);
-       if (error)
+       if (error) {
                perror("mount: ");
+               exit(1);
+       }
+
+       /*
+        * XXX fork a backgrounded reconnector process to handle connection
+        *     failures. XXX
+        */
+
+       return (0);
+}
+
+/*
+ * Connect to the cluster controller.  We can connect to a local or remote
+ * cluster controller, depending.  For a multi-node cluster we always want
+ * to connect to the local controller and let it maintain the connections
+ * to the multiple remote nodes.
+ */
+static
+int
+cluster_connect(const char *volume __unused)
+{
+       struct sockaddr_in lsin;
+       int fd;
+
+       /*
+        * This starts the hammer2 service if it isn't already running,
+        * so we can connect to it.
+        */
+       system("/sbin/hammer2 -q service");
+
+       /*
+        * Connect us to the service but leave the rest to the kernel.
+        * XXX a connection lost during the mount is not yet re-established.
+        */
+       if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+               perror("socket");
+               return(-1);
+       }
+       bzero(&lsin, sizeof(lsin));
+       lsin.sin_family = AF_INET;
+       lsin.sin_addr.s_addr = 0;
+       lsin.sin_port = htons(HAMMER2_LISTEN_PORT);
+
+       if (connect(fd, (struct sockaddr *)&lsin, sizeof(lsin)) < 0) {
+               close(fd);
+               fprintf(stderr, "mount_hammer2: unable to connect to "
+                               "cluster controller\n");
+               return(-1);
+       }
+
+       return(fd);
 }
index 37449fe..3a1cc44 100644 (file)
@@ -293,7 +293,8 @@ struct hammer2_mount {
        struct lock     voldatalk;      /* lockmgr lock */
 
        hammer2_volume_data_t voldata;
-       hammer2_freecache_t freecache[HAMMER2_FREECACHE_TYPES][HAMMER2_MAX_RADIX+1];
+       hammer2_freecache_t freecache[HAMMER2_FREECACHE_TYPES]
+                                    [HAMMER2_MAX_RADIX+1];
 };
 
 typedef struct hammer2_mount hammer2_mount_t;
@@ -309,10 +310,16 @@ struct hammer2_pfsmount {
        ccms_domain_t           ccms_dom;
        struct netexport        export;         /* nfs export */
        int                     ronly;          /* read-only mount */
+       struct file             *msg_fp;        /* cluster pipe->userland */
+       thread_t                msgrd_td;       /* cluster thread */
+       thread_t                msgwr_td;       /* cluster thread */
+       int                     msg_ctl;        /* wakeup flags */
 };
 
 typedef struct hammer2_pfsmount hammer2_pfsmount_t;
 
+#define HAMMER2_CLUSTERCTL_KILL        0x0001
+
 #if defined(_KERNEL)
 
 MALLOC_DECLARE(M_HAMMER2);
index ecedae3..d10ae8b 100644 (file)
@@ -45,7 +45,7 @@
 struct hammer2_mount_info {
        const char      *volume;
        int             hflags;         /* extended hammer mount flags */
-       int             unused01;
+       int             cluster_fd;     /* cluster management pipe/socket */
        char            reserved1[112];
 };
 
index 8278642..cad6538 100644 (file)
 #include <sys/uuid.h>
 #include <sys/vfsops.h>
 #include <sys/sysctl.h>
+#include <sys/socket.h>
 
 #include "hammer2.h"
 #include "hammer2_disk.h"
 #include "hammer2_mount.h"
+#include "hammer2_network.h"
 
 struct hammer2_sync_info {
        int error;
@@ -135,6 +137,9 @@ static int hammer2_install_volume_header(hammer2_mount_t *hmp);
 static int hammer2_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
 static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
 
+static void hammer2_cluster_thread_rd(void *arg);
+static void hammer2_cluster_thread_wr(void *arg);
+
 /*
  * HAMMER2 vfs operations.
  */
@@ -206,7 +211,7 @@ hammer2_vfs_init(struct vfsconf *conf)
 static
 int
 hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
-             struct ucred *cred)
+                 struct ucred *cred)
 {
        struct hammer2_mount_info info;
        hammer2_pfsmount_t *pmp;
@@ -238,6 +243,8 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                /*
                 * Root mount
                 */
+               bzero(&info, sizeof(info));
+               info.cluster_fd = -1;
                return (EOPNOTSUPP);
        } else {
                /*
@@ -274,9 +281,10 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
        }
 
        /*
-        * New non-root mount
+        * PFS mount
+        *
+        * Lookup name and verify it refers to a block device.
         */
-       /* Lookup name and verify it refers to a block device */
        error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW);
        if (error == 0)
                error = nlookup(&nd);
@@ -462,6 +470,24 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
 
        kprintf("iroot %p\n", pmp->iroot);
 
+       /*
+        * Ref the cluster management messaging descriptor.  The mount
+        * program deals with the other end of the communications pipe.
+        */
+       pmp->msg_fp = holdfp(curproc->p_fd, info.cluster_fd, -1);
+       if (pmp->msg_fp == NULL) {
+               kprintf("hammer2_mount: bad cluster_fd!\n");
+               hammer2_vfs_unmount(mp, MNT_FORCE);
+               return EBADF;
+       }
+       lwkt_create(hammer2_cluster_thread_rd, pmp, &pmp->msgrd_td,
+                   NULL, 0, -1, "hammer2-msgrd");
+       lwkt_create(hammer2_cluster_thread_wr, pmp, &pmp->msgwr_td,
+                   NULL, 0, -1, "hammer2-msgwr");
+
+       /*
+        * Finish setup
+        */
        vfs_getnewfsid(mp);
        vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);
        vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);
@@ -474,6 +500,9 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                  sizeof(mp->mnt_stat.f_mntonname) - 1,
                  &size);
 
+       /*
+        * Initial statfs to prime mnt_stat.
+        */
        hammer2_vfs_statfs(mp, &mp->mnt_stat, cred);
 
        return 0;
@@ -557,6 +586,27 @@ hammer2_vfs_unmount(struct mount *mp, int mntflags)
                pmp->rchain = NULL;
        }
        ccms_domain_uninit(&pmp->ccms_dom);
+
+       /*
+        * Ask the cluster controller to go away
+        */
+       atomic_set_int(&pmp->msg_ctl, HAMMER2_CLUSTERCTL_KILL);
+       while (pmp->msgrd_td || pmp->msgwr_td) {
+               wakeup(&pmp->msg_ctl);
+               tsleep(pmp, 0, "clstrkl", hz);
+       }
+
+       /*
+        * Drop communications descriptor
+        */
+       if (pmp->msg_fp) {
+               fdrop(pmp->msg_fp);
+               pmp->msg_fp = NULL;
+       }
+
+       /*
+        * If no PFSs are left, drop the master hammer2_mount for the device.
+        */
        if (hmp->pmp_count == 0) {
                if (hmp->schain) {
                        KKASSERT(hmp->schain->refs == 1);
@@ -945,3 +995,45 @@ hammer2_install_volume_header(hammer2_mount_t *hmp)
        return (error);
 }
 
+/*
+ * Cluster controller thread.  Perform messaging functions.  We have one
+ * thread for the reader and one for the writer.  The writer handles
+ * shutdown requests (which should break the reader thread).
+ */
+static
+void
+hammer2_cluster_thread_rd(void *arg)
+{
+       hammer2_pfsmount_t *pmp = arg;
+       hammer2_any_t any;
+       int error;
+
+       while ((pmp->msg_ctl & HAMMER2_CLUSTERCTL_KILL) == 0) {
+               error = fp_read(pmp->msg_fp,
+                               any.buf, sizeof(hammer2_msg_hdr_t),
+                               NULL, 1, UIO_SYSSPACE);
+               kprintf("fp_read %d\n", error);
+               if (error)
+                       break;
+       }
+       pmp->msgrd_td = NULL;
+       /* pmp can be ripped out from under us at this point */
+       wakeup(pmp);
+       lwkt_exit();
+}
+
+static
+void
+hammer2_cluster_thread_wr(void *arg)
+{
+       hammer2_pfsmount_t *pmp = arg;
+
+       while ((pmp->msg_ctl & HAMMER2_CLUSTERCTL_KILL) == 0) {
+               tsleep(&pmp->msg_ctl, 0, "msgwr", hz);
+       }
+       fp_shutdown(pmp->msg_fp, SHUT_RDWR);
+       pmp->msgwr_td = NULL;
+       /* pmp can be ripped out from under us at this point */
+       wakeup(pmp);
+       lwkt_exit();
+}