hammer2 - Crypto handshake work for message stream
author Matthew Dillon <dillon@apollo.backplane.com>
Sat, 12 May 2012 07:43:26 +0000 (00:43 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Sat, 12 May 2012 08:00:38 +0000 (01:00 -0700)
* 'hammer2 debug' can now take a destination host argument (and will
  default to localhost).  This is the debug shell connection.

* Add 'hammer2 rsainit' to initialize hammer2's keys /etc/hammer2/rsa.*

* Change the 'hammer2 node' directive to 'hammer2 service'.

* Flesh out the initial public key exchange handshake.  Currently the
  handshake consists of a symmetric 512 byte write and 512 byte read.
  The data is encrypted with our private key and the remote end's public
  key.

  Currently a very simple verifier has been constructed, but we will
  ultimately want to use sha or md5 or something like that for the
  verifier.

  Since I am doing a double-encryption here the first stage encrypt
  has to check that the result does not exceed the modulus (typically
  bit 7 of the first byte must be zero).  If it does we increment
  a 32 bit quantity in our pad*[] area and retry until we get a good
  result.

* The exchange is used to calculate the AES session key.  Session
  encryption is not yet implemented.  A random session key is sent
  by both ends.  The actual session key will be the XOR of the one
  we send and the one we receive.

* When a connection is accepted the remote end's public key is looked
  up in /etc/hammer2/remotes/<IP>.pub.  If this file does not exist
  then the connection is not allowed.

13 files changed:
sbin/hammer2/Makefile
sbin/hammer2/cmd_debug.c
sbin/hammer2/cmd_rsa.c [new file with mode: 0644]
sbin/hammer2/cmd_service.c [moved from sbin/hammer2/cmd_node.c with 79% similarity]
sbin/hammer2/crypto.c [new file with mode: 0644]
sbin/hammer2/hammer2.h
sbin/hammer2/main.c
sbin/hammer2/msg.c
sbin/hammer2/network.h
sbin/newfs_hammer2/newfs_hammer2.c
sys/vfs/hammer2/DESIGN
sys/vfs/hammer2/hammer2_disk.h
sys/vfs/hammer2/hammer2_vfsops.c

index 49b4689..35981b1 100644 (file)
@@ -1,14 +1,16 @@
 PROG=  hammer2
-SRCS=  main.c subs.c icrc.c msg.c
+SRCS=  main.c subs.c icrc.c msg.c crypto.c
 SRCS+= cmd_remote.c cmd_snapshot.c cmd_pfs.c
-SRCS+= cmd_node.c cmd_leaf.c cmd_debug.c
+SRCS+= cmd_service.c cmd_leaf.c cmd_debug.c
+SRCS+= cmd_rsa.c
 #MAN=  hammer2.8
 NOMAN= TRUE
+DEBUG_FLAGS=-g
 
 CFLAGS+= -I${.CURDIR}/../../sys
 CFLAGS+= -pthread
-LDADD= -lm -lutil -lmd
-DPADD= ${LIBM} ${LIBUTIL} ${LIBMD}
+LDADD= -lm -lutil -lmd -lcrypto
+DPADD= ${LIBM} ${LIBUTIL} ${LIBMD} ${LIBCRYPTO}
 
 #.PATH: ${.CURDIR}/../../sys/libkern
 #SRCS+= crc32.c
index 240ed29..87add09 100644 (file)
@@ -41,11 +41,12 @@ static void debug_tty(hammer2_iocom_t *iocom);
 static void hammer2_debug_parse(hammer2_msg_t *msg, char *cmdbuf);
 
 int
-cmd_debug(void)
+cmd_debug(const char *hostname)
 {
        struct sockaddr_in lsin;
        struct hammer2_iocom iocom;
        hammer2_msg_t *msg;
+       struct hostent *hen;
        int fd;
 
        /*
@@ -64,6 +65,19 @@ cmd_debug(void)
        lsin.sin_family = AF_INET;
        lsin.sin_addr.s_addr = 0;
        lsin.sin_port = htons(HAMMER2_LISTEN_PORT);
+
+       if (hostname) {
+               hen = gethostbyname2(hostname, AF_INET);
+               if (hen == NULL) {
+                       if (inet_pton(AF_INET, hostname, &lsin.sin_addr) != 1) {
+                               fprintf(stderr,
+                                       "Cannot resolve %s\n", hostname);
+                               return 1;
+                       }
+               } else {
+                       bcopy(hen->h_addr, &lsin.sin_addr, hen->h_length);
+               }
+       }
        if (connect(fd, (struct sockaddr *)&lsin, sizeof(lsin)) < 0) {
                close(fd);
                fprintf(stderr, "debug: connect failed: %s\n",
diff --git a/sbin/hammer2/cmd_rsa.c b/sbin/hammer2/cmd_rsa.c
new file mode 100644 (file)
index 0000000..b403b7f
--- /dev/null
@@ -0,0 +1,382 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+#include <openssl/rsa.h>
+#include <openssl/pem.h>
+
+/*
+ * Should be run as root.  Creates rsa.prv and rsa.pub in dir_path
+ * (e.g. /etc/hammer2/rsa.{pub,prv}) by running openssl commands.
+ *
+ * dir_path and any missing parent directories are created first.  An
+ * existing private key is left alone; the public key is derived from the
+ * private key whenever it is missing, and is always regenerated after a
+ * new private key has been created.
+ *
+ * Returns 0 on success, 1 on failure.
+ *
+ * NOTE: dir_path is inserted into a shell command line unquoted, so it
+ *      must not contain whitespace or shell metacharacters.
+ */
+int
+cmd_rsainit(const char *dir_path)
+{
+       struct stat st;
+       int ecode;
+       int result = 1;
+       char *dir_copy;
+       char *slash;
+       char *str1 = NULL;
+       char *str2 = NULL;
+       char *cmd;
+       mode_t old_umask;
+
+       /*
+        * Create the directory if necessary.  Intermediate components are
+        * created 0755, the key directory itself 0700.  mkdir() results
+        * are deliberately not checked (components may already exist).
+        */
+       if (stat(dir_path, &st) < 0) {
+               dir_copy = strdup(dir_path);
+               if (dir_copy == NULL)
+                       goto nomem;
+               for (slash = strchr(dir_copy, '/');
+                    slash != NULL;
+                    slash = strchr(slash + 1, '/')) {
+                       *slash = 0;
+                       mkdir(dir_copy, 0755);
+                       *slash = '/';
+               }
+               mkdir(dir_copy, 0700);
+               free(dir_copy);
+       }
+       if (asprintf(&str1, "%s/rsa.prv", dir_path) < 0) {
+               str1 = NULL;
+               goto nomem;
+       }
+       if (asprintf(&str2, "%s/rsa.pub", dir_path) < 0) {
+               str2 = NULL;
+               goto nomem;
+       }
+
+       if (stat(str1, &st) < 0) {
+               if (asprintf(&cmd, "openssl genrsa -out %s 2048", str1) < 0)
+                       goto nomem;
+
+               /*
+                * The restrictive umask must be in effect while openssl
+                * runs, otherwise the private key is briefly readable by
+                * group/other until the chmod() below.
+                */
+               old_umask = umask(077);
+               ecode = system(cmd);
+               umask(old_umask);
+               free(cmd);
+               chmod(str1, 0400);
+               if (ecode) {
+                       fprintf(stderr,
+                               "hammer2 rsainit: private key gen failed\n");
+                       goto done;
+               }
+               printf("hammer2 rsainit: created %s\n", str1);
+
+               /*
+                * Any existing public key no longer matches, force it to
+                * be regenerated below.
+                */
+               remove(str2);
+       } else {
+               printf("hammer2 rsainit: Using existing private key in %s\n",
+                      str1);
+       }
+       if (stat(str2, &st) < 0) {
+               if (asprintf(&cmd, "openssl rsa -in %s -out %s -pubout",
+                            str1, str2) < 0)
+                       goto nomem;
+               ecode = system(cmd);
+               free(cmd);
+               if (ecode) {
+                       fprintf(stderr,
+                               "hammer2 rsainit: public key gen failed\n");
+                       goto done;
+               }
+               printf("hammer2 rsainit: created %s\n", str2);
+       } else {
+               printf("hammer2 rsainit: both keys already exist\n");
+       }
+       result = 0;
+       goto done;
+
+nomem:
+       fprintf(stderr, "hammer2 rsainit: out of memory\n");
+done:
+       free(str2);
+       free(str1);
+
+       return result;
+}
+
+/*
+ * Run one blksize-byte block through every key in order.  Public keys use
+ * RSA_public_encrypt(), private keys RSA_private_encrypt(), both with
+ * RSA_NO_PADDING.  The output of each stage is copied back into data_in
+ * as the input of the next stage; the final result is left in data_out.
+ *
+ * Returns 0 on success, -1 if any stage fails.
+ */
+static int
+rsaenc_block(RSA **keys, const int *ispub, int nkeys, int blksize,
+            unsigned char *data_in, unsigned char *data_out)
+{
+       int i;
+       int r;
+
+       for (i = 0; i < nkeys; ++i) {
+               if (ispub[i])
+                       r = RSA_public_encrypt(blksize, data_in, data_out,
+                                              keys[i], RSA_NO_PADDING);
+               else
+                       r = RSA_private_encrypt(blksize, data_in, data_out,
+                                               keys[i], RSA_NO_PADDING);
+               if (r < 0) {
+                       fprintf(stderr, "hammer2 rsaenc: encryption failed "
+                                       "at key %d\n", i);
+                       return -1;
+               }
+               if (i + 1 != nkeys)
+                       bcopy(data_out, data_in, blksize);
+       }
+       return 0;
+}
+
+/*
+ * Encrypt stdin to stdout in key-sized blocks using the chain of keys
+ * named by keyfiles[0..nkeys-1].  A file ending in .pub is loaded as a
+ * public key, one ending in .prv as a private key.  All keys must have
+ * the same RSA block size.  A short final block is zero-padded.
+ *
+ * Returns 0 on success, 1 on any failure.
+ */
+int
+cmd_rsaenc(const char **keyfiles, int nkeys)
+{
+       RSA **keys = NULL;
+       int *ispub = NULL;
+       int ecode = 0;
+       int blksize = 0;
+       int i;
+       int off;
+       int n;
+       unsigned char *data_in = NULL;
+       unsigned char *data_out = NULL;
+
+       if (nkeys <= 0) {
+               fprintf(stderr, "hammer2 rsaenc: at least one key file "
+                               "is required\n");
+               return (1);
+       }
+       keys = calloc(nkeys, sizeof(*keys));
+       ispub = calloc(nkeys, sizeof(*ispub));
+       if (keys == NULL || ispub == NULL) {
+               fprintf(stderr, "hammer2 rsaenc: out of memory\n");
+               ecode = 1;
+               goto done;
+       }
+
+       /*
+        * Load the keys, the file suffix selects public vs private.
+        */
+       for (i = 0; i < nkeys; ++i) {
+               FILE *fp;
+               const char *sfx;
+
+               sfx = strrchr(keyfiles[i], '.');
+               if (sfx && strcmp(sfx, ".pub") == 0) {
+                       ispub[i] = 1;
+               } else if (sfx && strcmp(sfx, ".prv") == 0) {
+                       ispub[i] = 0;
+               } else {
+                       fprintf(stderr, "hammer2: rsaenc: key files must end "
+                                       "in .pub or .prv\n");
+                       ecode = 1;
+                       goto done;
+               }
+               fp = fopen(keyfiles[i], "r");
+               if (fp == NULL) {
+                       fprintf(stderr, "hammer2 rsaenc: unable to "
+                                       "open %s\n", keyfiles[i]);
+                       ecode = 1;
+                       goto done;
+               }
+               if (ispub[i])
+                       keys[i] = PEM_read_RSA_PUBKEY(fp, NULL, NULL, NULL);
+               else
+                       keys[i] = PEM_read_RSAPrivateKey(fp, NULL, NULL, NULL);
+               fclose(fp);
+               if (keys[i] == NULL) {
+                       fprintf(stderr, "hammer2 rsaenc: unable to "
+                                       "parse %s key from %s\n",
+                                       ispub[i] ? "public" : "private",
+                                       keyfiles[i]);
+                       ecode = 1;
+                       goto done;
+               }
+
+               /*
+                * Every stage operates on the same buffers so all keys
+                * must share one block size.  This has to be a runtime
+                * check, an assert() would vanish under NDEBUG.
+                */
+               if (i == 0) {
+                       blksize = RSA_size(keys[i]);
+               } else if (blksize != RSA_size(keys[i])) {
+                       fprintf(stderr, "hammer2 rsaenc: key size of %s "
+                                       "does not match first key\n",
+                                       keyfiles[i]);
+                       ecode = 1;
+                       goto done;
+               }
+       }
+       fprintf(stderr, "blksize %d\n", blksize);
+
+       data_in = malloc(blksize);
+       data_out = malloc(blksize);
+       if (data_in == NULL || data_out == NULL) {
+               fprintf(stderr, "hammer2 rsaenc: out of memory\n");
+               ecode = 1;
+               goto done;
+       }
+
+       /*
+        * Accumulate stdin into full blocks, encrypting and writing each
+        * block as soon as it is complete.
+        */
+       off = 0;
+       while ((n = read(0, data_in + off, blksize - off)) > 0) {
+               off += n;
+               if (off < blksize)
+                       continue;
+               if (rsaenc_block(keys, ispub, nkeys, blksize,
+                                data_in, data_out) < 0) {
+                       ecode = 1;
+                       break;
+               }
+               if (write(1, data_out, blksize) != blksize) {
+                       perror("write");
+                       ecode = 1;
+                       break;
+               }
+               off = 0;
+       }
+       if (n < 0) {
+               perror("read");
+               ecode = 1;
+       }
+
+       /*
+        * Zero-pad and flush a trailing partial block, but only if
+        * nothing has gone wrong so far.
+        */
+       if (off && ecode == 0) {
+               bzero(data_in + off, blksize - off);
+               if (rsaenc_block(keys, ispub, nkeys, blksize,
+                                data_in, data_out) < 0) {
+                       ecode = 1;
+               } else if (write(1, data_out, blksize) != blksize) {
+                       perror("write");
+                       ecode = 1;
+               }
+       }
+done:
+       free(data_out);
+       free(data_in);
+       if (keys) {
+               for (i = 0; i < nkeys; ++i) {
+                       if (keys[i])
+                               RSA_free(keys[i]);
+               }
+       }
+       free(keys);
+       free(ispub);
+       return (ecode);
+}
+
+/*
+ * Run one blksize-byte block through every key in order.  Public keys use
+ * RSA_public_decrypt(), private keys RSA_private_decrypt(), both with
+ * RSA_NO_PADDING.  The output of each stage is copied back into data_in
+ * as the input of the next stage; the final result is left in data_out.
+ *
+ * Returns 0 on success, -1 if any stage fails.
+ */
+static int
+rsadec_block(RSA **keys, const int *ispub, int nkeys, int blksize,
+            unsigned char *data_in, unsigned char *data_out)
+{
+       int i;
+       int r;
+
+       for (i = 0; i < nkeys; ++i) {
+               if (ispub[i])
+                       r = RSA_public_decrypt(blksize, data_in, data_out,
+                                              keys[i], RSA_NO_PADDING);
+               else
+                       r = RSA_private_decrypt(blksize, data_in, data_out,
+                                               keys[i], RSA_NO_PADDING);
+               if (r < 0) {
+                       fprintf(stderr, "hammer2 rsadec: decryption failed "
+                                       "at key %d\n", i);
+                       return -1;
+               }
+               if (i + 1 != nkeys)
+                       bcopy(data_out, data_in, blksize);
+       }
+       return 0;
+}
+
+/*
+ * Decrypt stdin to stdout in key-sized blocks using the chain of keys
+ * named by keyfiles[0..nkeys-1].  A file ending in .pub is loaded as a
+ * public key, one ending in .prv as a private key.  All keys must have
+ * the same RSA block size.  A short final block is zero-padded.
+ *
+ * Returns 0 on success, 1 on any failure.
+ */
+int
+cmd_rsadec(const char **keyfiles, int nkeys)
+{
+       RSA **keys = NULL;
+       int *ispub = NULL;
+       int ecode = 0;
+       int blksize = 0;
+       int i;
+       int off;
+       int n;
+       unsigned char *data_in = NULL;
+       unsigned char *data_out = NULL;
+
+       if (nkeys <= 0) {
+               fprintf(stderr, "hammer2 rsadec: at least one key file "
+                               "is required\n");
+               return (1);
+       }
+       keys = calloc(nkeys, sizeof(*keys));
+       ispub = calloc(nkeys, sizeof(*ispub));
+       if (keys == NULL || ispub == NULL) {
+               fprintf(stderr, "hammer2 rsadec: out of memory\n");
+               ecode = 1;
+               goto done;
+       }
+
+       /*
+        * Load the keys, the file suffix selects public vs private.
+        */
+       for (i = 0; i < nkeys; ++i) {
+               FILE *fp;
+               const char *sfx;
+
+               sfx = strrchr(keyfiles[i], '.');
+               if (sfx && strcmp(sfx, ".pub") == 0) {
+                       ispub[i] = 1;
+               } else if (sfx && strcmp(sfx, ".prv") == 0) {
+                       ispub[i] = 0;
+               } else {
+                       fprintf(stderr, "hammer2: rsadec: key files must end "
+                                       "in .pub or .prv\n");
+                       ecode = 1;
+                       goto done;
+               }
+               fp = fopen(keyfiles[i], "r");
+               if (fp == NULL) {
+                       fprintf(stderr, "hammer2 rsadec: unable to "
+                                       "open %s\n", keyfiles[i]);
+                       ecode = 1;
+                       goto done;
+               }
+               if (ispub[i])
+                       keys[i] = PEM_read_RSA_PUBKEY(fp, NULL, NULL, NULL);
+               else
+                       keys[i] = PEM_read_RSAPrivateKey(fp, NULL, NULL, NULL);
+               fclose(fp);
+               if (keys[i] == NULL) {
+                       fprintf(stderr, "hammer2 rsadec: unable to "
+                                       "parse %s key from %s\n",
+                                       ispub[i] ? "public" : "private",
+                                       keyfiles[i]);
+                       ecode = 1;
+                       goto done;
+               }
+
+               /*
+                * Every stage operates on the same buffers so all keys
+                * must share one block size.  This has to be a runtime
+                * check, an assert() would vanish under NDEBUG.
+                */
+               if (i == 0) {
+                       blksize = RSA_size(keys[i]);
+               } else if (blksize != RSA_size(keys[i])) {
+                       fprintf(stderr, "hammer2 rsadec: key size of %s "
+                                       "does not match first key\n",
+                                       keyfiles[i]);
+                       ecode = 1;
+                       goto done;
+               }
+       }
+
+       data_in = malloc(blksize);
+       data_out = malloc(blksize);
+       if (data_in == NULL || data_out == NULL) {
+               fprintf(stderr, "hammer2 rsadec: out of memory\n");
+               ecode = 1;
+               goto done;
+       }
+
+       /*
+        * Accumulate stdin into full blocks, decrypting and writing each
+        * block as soon as it is complete.
+        */
+       off = 0;
+       while ((n = read(0, data_in + off, blksize - off)) > 0) {
+               off += n;
+               if (off < blksize)
+                       continue;
+               if (rsadec_block(keys, ispub, nkeys, blksize,
+                                data_in, data_out) < 0) {
+                       ecode = 1;
+                       break;
+               }
+               if (write(1, data_out, blksize) != blksize) {
+                       perror("write");
+                       ecode = 1;
+                       break;
+               }
+               off = 0;
+       }
+       if (n < 0) {
+               perror("read");
+               ecode = 1;
+       }
+
+       /*
+        * Zero-pad and flush a trailing partial block, but only if
+        * nothing has gone wrong so far.  Without the ecode test a failed
+        * write above would be followed by a second decrypt and write of
+        * the same (already clobbered) buffer.
+        */
+       if (off && ecode == 0) {
+               bzero(data_in + off, blksize - off);
+               if (rsadec_block(keys, ispub, nkeys, blksize,
+                                data_in, data_out) < 0) {
+                       ecode = 1;
+               } else if (write(1, data_out, blksize) != blksize) {
+                       perror("write");
+                       ecode = 1;
+               }
+       }
+done:
+       free(data_out);
+       free(data_in);
+       if (keys) {
+               for (i = 0; i < nkeys; ++i) {
+                       if (keys[i])
+                               RSA_free(keys[i]);
+               }
+       }
+       free(keys);
+       free(ispub);
+       return (ecode);
+}
similarity index 79%
rename from sbin/hammer2/cmd_node.c
rename to sbin/hammer2/cmd_service.c
index 0fba7b3..fb4344d 100644 (file)
 
 #include "hammer2.h"
 
-static void *node_master_accept(void *data);
-static void *node_master_service(void *data);
-static void node_master_recv(hammer2_iocom_t *iocom);
-static void node_master_send(hammer2_iocom_t *iocom);
+static void *master_accept(void *data);
+static void *master_service(void *data);
+static void master_auth_rx(hammer2_iocom_t *iocom);
+static void master_auth_tx(hammer2_iocom_t *iocom);
+static void master_link_rx(hammer2_iocom_t *iocom);
+static void master_link_tx(hammer2_iocom_t *iocom);
 
 /*
  * Start-up the master listener daemon for the machine.
@@ -58,7 +60,7 @@ static void node_master_send(hammer2_iocom_t *iocom);
  * Backbones are specified via /etc/hammer2.conf.
  */
 int
-cmd_node(void)
+cmd_service(void)
 {
        struct sockaddr_in lsin;
        int on;
@@ -68,7 +70,7 @@ cmd_node(void)
         * Acquire socket and set options
         */
        if ((lfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
-               fprintf(stderr, "node_master_listen: socket(): %s\n",
+               fprintf(stderr, "master_listen: socket(): %s\n",
                        strerror(errno));
                return 1;
        }
@@ -100,7 +102,7 @@ cmd_node(void)
         * In debug mode this call will create the pthread without forking
         * and set NormalExit to 0, instead of fork.
         */
-       hammer2_demon(node_master_accept, (void *)(intptr_t)lfd);
+       hammer2_demon(master_accept, (void *)(intptr_t)lfd);
        if (NormalExit)
                close(lfd);
        return 0;
@@ -112,7 +114,7 @@ cmd_node(void)
  */
 static
 void *
-node_master_accept(void *data)
+master_accept(void *data)
 {
        struct sockaddr_in asin;
        socklen_t alen;
@@ -139,9 +141,9 @@ node_master_accept(void *data)
                        break;
                }
                thread = NULL;
-               fprintf(stderr, "node_master_accept: accept fd %d\n", fd);
+               fprintf(stderr, "master_accept: accept fd %d\n", fd);
                pthread_create(&thread, NULL,
-                              node_master_service, (void *)(intptr_t)fd);
+                              master_service, (void *)(intptr_t)fd);
        }
        return (NULL);
 }
@@ -151,14 +153,14 @@ node_master_accept(void *data)
  */
 static
 void *
-node_master_service(void *data)
+master_service(void *data)
 {
        hammer2_iocom_t iocom;
        int fd;
 
        fd = (int)(intptr_t)data;
        hammer2_iocom_init(&iocom, fd, -1);
-       hammer2_iocom_core(&iocom, node_master_recv, node_master_send, NULL);
+       hammer2_iocom_core(&iocom, master_auth_rx, master_auth_tx, NULL);
 
        fprintf(stderr,
                "iocom on fd %d terminated error rx=%d, tx=%d\n",
@@ -168,13 +170,39 @@ node_master_service(void *data)
        return (NULL);
 }
 
+/************************************************************************
+ *                         AUTHENTICATION                              *
+ ************************************************************************
+ *
+ * Additional messaging-based authentication must occur before normal
+ * message operation.  The connection has already been encrypted at
+ * this point.
+ */
+/*
+ * Receive-side authentication callback, installed by master_service()
+ * via hammer2_iocom_core().  No authentication exchange is performed
+ * here yet: it logs "AUTHRX" and switches both iocom callbacks over to
+ * the normal link handlers.
+ *
+ * The iocom argument is dereferenced, so it is not marked __unused.
+ */
+static
+void
+master_auth_rx(hammer2_iocom_t *iocom)
+{
+       printf("AUTHRX\n");
+       iocom->recvmsg_callback = master_link_rx;
+       iocom->sendmsg_callback = master_link_tx;
+}
+
+/*
+ * Transmit-side authentication callback, installed by master_service()
+ * via hammer2_iocom_core().  No authentication exchange is performed
+ * here yet: it logs "AUTHTX" and switches both iocom callbacks over to
+ * the normal link handlers.
+ *
+ * The iocom argument is dereferenced, so it is not marked __unused.
+ */
+static
+void
+master_auth_tx(hammer2_iocom_t *iocom)
+{
+       printf("AUTHTX\n");
+       iocom->recvmsg_callback = master_link_rx;
+       iocom->sendmsg_callback = master_link_tx;
+}
+
 /*
  * Callback from hammer2_iocom_core() when messages might be present
  * on the socket.
  */
 static
 void
-node_master_recv(hammer2_iocom_t *iocom)
+master_link_rx(hammer2_iocom_t *iocom)
 {
        hammer2_msg_t *msg;
 
@@ -196,7 +224,7 @@ node_master_recv(hammer2_iocom_t *iocom)
        }
        if (iocom->ioq_rx.error) {
                fprintf(stderr,
-                       "node_master_recv: comm error %d\n",
+                       "master_recv: comm error %d\n",
                        iocom->ioq_rx.error);
        }
 }
@@ -207,7 +235,7 @@ node_master_recv(hammer2_iocom_t *iocom)
  */
 static
 void
-node_master_send(hammer2_iocom_t *iocom)
+master_link_tx(hammer2_iocom_t *iocom)
 {
        hammer2_iocom_flush(iocom);
 }
diff --git a/sbin/hammer2/crypto.c b/sbin/hammer2/crypto.c
new file mode 100644 (file)
index 0000000..947a0cf
--- /dev/null
@@ -0,0 +1,387 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+#include <openssl/rsa.h>
+#include <openssl/pem.h>
+#include <openssl/err.h>
+
+/*
+ * Synchronously negotiate crypto for a new session.  This must occur
+ * within 10 seconds or the connection is error'd out.
+ *
+ * We work off the IP address and/or reverse DNS.  The IP address is
+ * checked first, followed by the IP address at various levels of granularity,
+ * followed by the full domain name and domain names at various levels of
+ * granularity.
+ *
+ *     /etc/hammer2/remote/<name>.pub  - Contains a public key
+ *     /etc/hammer2/remote/<name>.none - Indicates no encryption (empty file)
+ *                                       (e.g. localhost.none).
+ *
+ * We first attempt to locate a public key file based on the peer address or
+ * peer FQDN.
+ *
+ *     <name>.none     - No further negotiation is needed.  We simply return.
+ *                       All communication proceeds without encryption.
+ *                       No public key handshake occurs in this situation.
+ *                       (both ends must match).
+ *
+ *     <name>.pub      - We have located the public key for the peer.  Both
+ *                       sides transmit a block encrypted with their private
+ *                       keys and the peer's public key.
+ *
+ *                       Both sides receive a block and decrypt it.
+ *
+ *                       Both sides formulate a reply using the decrypted
+ *                       block and transmit it.
+ *
+ *                       communication proceeds with the negotiated session
+ *                       key (typically AES-256-CBC).
+ *
+ * If we fail to locate the appropriate file and no floating.db exists the
+ * connection is terminated without further action.
+ *
+ * If floating.db exists the connection proceeds with a floating negotiation.
+ */
+/*
+ * Socket address storage that can be viewed as a generic sockaddr, an
+ * IPv4 sockaddr_in, or an IPv6 sockaddr_in6.  hammer2_crypto_negotiate()
+ * passes the generic member to getpeername() and getnameinfo().
+ */
+typedef union {
+       struct sockaddr sa;
+       struct sockaddr_in sa_in;
+       struct sockaddr_in6 sa_in6;
+} sockaddr_any_t;
+
+void
+hammer2_crypto_negotiate(hammer2_iocom_t *iocom)
+{
+       sockaddr_any_t sa;
+       socklen_t salen = sizeof(sa);
+       char peername[128];
+       char realname[128];
+       hammer2_handshake_t handtx;
+       hammer2_handshake_t handrx;
+       char buf[sizeof(handtx)];
+       char *ptr;
+       char *path;
+       struct stat st;
+       FILE *fp;
+       RSA *keys[3] = { NULL, NULL, NULL };
+       size_t i;
+       size_t blksize;
+       size_t blkmask;
+       ssize_t n;
+       int fd;
+
+       /*
+        * Get the peer IP address for the connection as a string.
+        */
+       if (getpeername(iocom->sock_fd, &sa.sa, &salen) < 0) {
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_NOPEER;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "accept: getpeername() failed\n");
+               goto done;
+       }
+       if (getnameinfo(&sa.sa, salen, peername, sizeof(peername),
+                       NULL, 0, NI_NUMERICHOST) < 0) {
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_NOPEER;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "accept: cannot decode sockaddr\n");
+               goto done;
+       }
+       if (DebugOpt) {
+               if (realhostname_sa(realname, sizeof(realname),
+                                   &sa.sa, salen) == HOSTNAME_FOUND) {
+                       fprintf(stderr, "accept from %s (%s)\n",
+                               peername, realname);
+               } else {
+                       fprintf(stderr, "accept from %s\n", peername);
+               }
+       }
+
+       /*
+        * Find the remote host's public key
+        */
+       asprintf(&path, "%s/%s.pub", HAMMER2_PATH_REMOTE, peername);
+       if ((fp = fopen(path, "r")) == NULL) {
+               free(path);
+               asprintf(&path, "%s/%s.none",
+                        HAMMER2_PATH_REMOTE, peername);
+               if (stat(path, &st) < 0) {
+                       iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_NORKEY;
+                       iocom->flags |= HAMMER2_IOCOMF_EOF;
+                       if (DebugOpt)
+                               fprintf(stderr, "auth failure: unknown host\n");
+                       goto done;
+               }
+               if (DebugOpt)
+                       fprintf(stderr, "auth succeeded, unencrypted link\n");
+       }
+       if (fp) {
+               keys[0] = PEM_read_RSA_PUBKEY(fp, NULL, NULL, NULL);
+               fclose(fp);
+               if (keys[0] == NULL) {
+                       iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_KEYFMT;
+                       iocom->flags |= HAMMER2_IOCOMF_EOF;
+                       if (DebugOpt)
+                               fprintf(stderr,
+                                       "auth failure: bad key format\n");
+                       goto done;
+               }
+       }
+
+       /*
+        * Get our public and private keys
+        */
+       free(path);
+       asprintf(&path, HAMMER2_DEFAULT_DIR "/rsa.pub");
+       if ((fp = fopen(path, "r")) == NULL) {
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_NOLKEY;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               goto done;
+       }
+       keys[1] = PEM_read_RSA_PUBKEY(fp, NULL, NULL, NULL);
+       fclose(fp);
+       if (keys[1] == NULL) {
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_KEYFMT;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: bad host key format\n");
+               goto done;
+       }
+
+       free(path);
+       asprintf(&path, HAMMER2_DEFAULT_DIR "/rsa.prv");
+       if ((fp = fopen(path, "r")) == NULL) {
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_NOLKEY;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: bad host key format\n");
+               goto done;
+       }
+       keys[2] = PEM_read_RSAPrivateKey(fp, NULL, NULL, NULL);
+       fclose(fp);
+       if (keys[2] == NULL) {
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_KEYFMT;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: bad host key format\n");
+               goto done;
+       }
+       free(path);
+       path = NULL;
+
+       /*
+        * public key encrypt/decrypt block size.
+        */
+       if (keys[0]) {
+               blksize = (size_t)RSA_size(keys[0]);
+               if (blksize != (size_t)RSA_size(keys[1]) ||
+                   blksize != (size_t)RSA_size(keys[2]) ||
+                   sizeof(handtx) % blksize != 0) {
+                       iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_KEYFMT;
+                       iocom->flags |= HAMMER2_IOCOMF_EOF;
+                       if (DebugOpt)
+                               fprintf(stderr, "auth failure: "
+                                               "key size mismatch\n");
+                       goto done;
+               }
+       } else {
+               blksize = sizeof(handtx);
+       }
+       blkmask = blksize - 1;
+
+       bzero(&handrx, sizeof(handrx));
+       bzero(&handtx, sizeof(handtx));
+
+       /*
+        * Fill all unused fields (particularly all junk fields) with random
+        * data, and also set the session key.
+        */
+       fd = open("/dev/urandom", O_RDONLY);
+       if (fd < 0 ||
+           fstat(fd, &st) < 0 ||       /* something wrong */
+           S_ISREG(st.st_mode) ||      /* supposed to be a RNG dev! */
+           read(fd, &handtx, sizeof(handtx)) != sizeof(handtx)) {
+urandfail:
+               if (fd >= 0)
+                       close(fd);
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_BADURANDOM;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: bad rng\n");
+               goto done;
+       }
+       if (bcmp(&handrx, &handtx, sizeof(handtx)) == 0)
+               goto urandfail;                 /* read all zeros */
+       close(fd);
+       ERR_load_crypto_strings();
+
+       /*
+        * Handshake with the remote.
+        *
+        *      Encrypt with my private and remote's public
+        *      Decrypt with my private and remote's public
+        *
+        * When encrypting we have to make sure our buffer fits within the
+        * modulus, which typically requires bit 7 of the first byte to be
+        * zero.  To be safe make sure that bit 7 and bit 6 are zero.
+        */
+       snprintf(handtx.quickmsg, sizeof(handtx.quickmsg), "Testing 1 2 3");
+       handtx.magic = HAMMER2_MSGHDR_MAGIC;
+       handtx.version = 1;
+       handtx.flags = 0;
+       assert(sizeof(handtx.verf) * 4 == sizeof(handtx.sess));
+       bzero(handtx.verf, sizeof(handtx.verf));
+
+       handtx.pad1[0] &= 0x3f; /* message must fit within modulus */
+       handtx.pad2[0] &= 0x3f; /* message must fit within modulus */
+
+       for (i = 0; i < sizeof(handtx.sess); ++i)
+               handtx.verf[i / 4] ^= handtx.sess[i];
+
+       /*
+        * Write handshake buffer to remote
+        */
+       for (i = 0; i < sizeof(handtx); i += blksize) {
+               ptr = (char *)&handtx + i;
+               if (keys[0]) {
+                       /*
+                        * Since we are double-encrypting we have to make
+                        * sure that the result of the first stage does
+                        * not blow out the modulus for the second stage.
+                        *
+                        * The pointer is pointing to the pad*[] area so
+                        * we can mess with that until the first stage
+                        * is legal.
+                        */
+                       do {
+                               ++*(int *)(ptr + 4);
+                               if (RSA_private_encrypt(blksize, ptr, buf,
+                                           keys[2], RSA_NO_PADDING) < 0) {
+                                       iocom->ioq_rx.error =
+                                               HAMMER2_IOQ_ERROR_KEYXCHGFAIL;
+                               }
+                       } while (buf[0] & 0xC0);
+
+                       if (RSA_public_encrypt(blksize, buf, ptr,
+                                           keys[0], RSA_NO_PADDING) < 0) {
+                               iocom->ioq_rx.error =
+                                       HAMMER2_IOQ_ERROR_KEYXCHGFAIL;
+                       }
+               }
+               if (write(iocom->sock_fd, ptr, blksize) != (ssize_t)blksize) {
+                       fprintf(stderr, "WRITE ERROR\n");
+               }
+       }
+       if (iocom->ioq_rx.error) {
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: key exchange failure "
+                                       "during encryption\n");
+               goto done;
+       }
+
+       /*
+        * Read handshake buffer from remote
+        */
+       i = 0;
+       while (i < sizeof(handrx)) {
+               ptr = (char *)&handrx + i;
+               n = read(iocom->sock_fd, ptr, blksize - (i & blkmask));
+               if (n <= 0)
+                       break;
+               ptr -= (i & blkmask);
+               i += n;
+               if (keys[0] && (i & blkmask) == 0) {
+                       if (RSA_private_decrypt(blksize, ptr, buf,
+                                          keys[2], RSA_NO_PADDING) < 0)
+                               iocom->ioq_rx.error =
+                                               HAMMER2_IOQ_ERROR_KEYXCHGFAIL;
+                       if (RSA_public_decrypt(blksize, buf, ptr,
+                                          keys[0], RSA_NO_PADDING) < 0)
+                               iocom->ioq_rx.error =
+                                               HAMMER2_IOQ_ERROR_KEYXCHGFAIL;
+               }
+       }
+       if (iocom->ioq_rx.error) {
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: key exchange failure "
+                                       "during decryption\n");
+               goto done;
+       }
+
+       /*
+        * Validate the received data.  Try to make this a constant-time
+        * algorithm.
+        */
+       if (i != sizeof(handrx)) {
+keyxchgfail:
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_KEYXCHGFAIL;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: key exchange failure\n");
+               goto done;
+       }
+
+       if (handrx.magic == HAMMER2_MSGHDR_MAGIC_REV) {
+               handrx.version = bswap16(handrx.version);
+               handrx.flags = bswap32(handrx.flags);
+       }
+       for (i = 0; i < sizeof(handrx.sess); ++i)
+               handrx.verf[i / 4] ^= handrx.sess[i];
+       n = 0;
+       for (i = 0; i < sizeof(handrx.verf); ++i)
+               n += handrx.verf[i];
+       if (handrx.version != 1)
+               ++n;
+       if (n != 0)
+               goto keyxchgfail;
+
+       if (DebugOpt) {
+               fprintf(stderr, "Remote data: %s\n", handrx.quickmsg);
+       }
+done:
+       if (path)
+               free(path);
+       if (keys[0])
+               RSA_free(keys[0]);
+       if (keys[1])
+               RSA_free(keys[1]);
+       if (keys[1])
+               RSA_free(keys[2]);
+}
index d3a21f8..fc3fc05 100644 (file)
 #include <pthread.h>
 #include <poll.h>
 
+#include <libutil.h>
+
 #include "network.h"
 
+#define HAMMER2_DEFAULT_DIR    "/etc/hammer2"
+#define HAMMER2_PATH_REMOTE    HAMMER2_DEFAULT_DIR "/remote"
+
 extern int DebugOpt;
 extern int NormalExit;
 
@@ -90,9 +95,12 @@ int cmd_pfs_create(const char *sel_path, const char *name,
                        uint8_t pfs_type, const char *uuid_str);
 int cmd_pfs_delete(const char *sel_path, const char *name);
 
-int cmd_node(void);
+int cmd_service(void);
 int cmd_leaf(const char *sel_path);
-int cmd_debug(void);
+int cmd_debug(const char *hostname);
+int cmd_rsainit(const char *dir_path);
+int cmd_rsaenc(const char **keys, int nkeys);
+int cmd_rsadec(const char **keys, int nkeys);
 
 void hammer2_ioq_init(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq);
 void hammer2_ioq_done(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq);
@@ -116,5 +124,7 @@ void hammer2_ioq_stream(hammer2_msg_t *msg, int reply);
 void hammer2_iocom_drain(hammer2_iocom_t *iocom);
 void hammer2_iocom_flush(hammer2_iocom_t *iocom);
 
+void hammer2_crypto_negotiate(hammer2_iocom_t *iocom);
+
 void hammer2_debug_remote(hammer2_msg_t *msg);
 void msg_printf(hammer2_msg_t *msg, const char *ctl, ...);
index 570948b..2be4662 100644 (file)
@@ -45,12 +45,15 @@ main(int ac, char **av)
 {
        const char *sel_path = NULL;
        const char *uuid_str = NULL;
+       const char *arg;
        int pfs_type = HAMMER2_PFSTYPE_NONE;
        int quick_opt = 0;
        int all_opt = 0;
        int ecode = 0;
        int ch;
 
+       srandomdev();
+
        /*
         * Core options
         */
@@ -172,14 +175,14 @@ main(int ac, char **av)
                 * Create snapshot with optional pfs_type and optional
                 * label override.
                 */
-       } else if (strcmp(av[0], "node") == 0) {
+       } else if (strcmp(av[0], "service") == 0) {
                /*
-                * Start the master node daemon.  This daemon accepts
-                * connections from local and remote clients, implements
-                * and maintains the spanning tree protocol, and manages
-                * the core messaging protocol.
+                * Start the service daemon.  This daemon accepts
+                * connections from local and remote clients, handles
+                * the security handshake, and manages the core messaging
+                * protocol.
                 */
-               ecode = cmd_node();
+               ecode = cmd_service();
        } else if (strcmp(av[0], "leaf") == 0) {
                /*
                 * Start the management daemon for a specific PFS.
@@ -209,7 +212,52 @@ main(int ac, char **av)
                 * Connect to the command line monitor in the hammer2 master
                 * node for the machine using HAMMER2_DBG_SHELL messages.
                 */
-               ecode = cmd_debug();
+               ecode = cmd_debug((ac < 2) ? NULL : av[1]);
+       } else if (strcmp(av[0], "rsainit") == 0) {
+               /*
+                * Initialize a RSA keypair.  If no target directory is
+                * specified we default to "/etc/hammer2".
+                */
+               arg = (ac < 2) ? HAMMER2_DEFAULT_DIR : av[1];
+               ecode = cmd_rsainit(arg);
+       } else if (strcmp(av[0], "rsaenc") == 0) {
+               /*
+                * Encrypt the input symmetrically by running it through
+                * the specified public and/or private key files.
+                *
+                * If no key files are specified data is encoded using
+                * "/etc/hammer2/rsa.pub".
+                *
+                * WARNING: no padding is added, data stream must contain
+                *          random padding for this to be secure.
+                *
+                * Used for debugging only
+                */
+               if (ac == 1) {
+                       const char *rsapath = HAMMER2_DEFAULT_DIR "/rsa.pub";
+                       ecode = cmd_rsaenc(&rsapath, 1);
+               } else {
+                       ecode = cmd_rsaenc((const char **)&av[1], ac - 1);
+               }
+       } else if (strcmp(av[0], "rsadec") == 0) {
+               /*
+                * Decrypt the input symmetrically by running it through
+                * the specified public and/or private key files.
+                *
+                * If no key files are specified data is decoded using
+                * "/etc/hammer2/rsa.prv".
+                *
+                * WARNING: no padding is added, data stream must contain
+                *          random padding for this to be secure.
+                *
+                * Used for debugging only
+                */
+               if (ac == 1) {
+                       const char *rsapath = HAMMER2_DEFAULT_DIR "/rsa.prv";
+                       ecode = cmd_rsadec(&rsapath, 1);
+               } else {
+                       ecode = cmd_rsadec((const char **)&av[1], ac - 1);
+               }
        } else {
                fprintf(stderr, "Unrecognized command: %s\n", av[0]);
                usage(1);
index e0ac85c..322ef77 100644 (file)
@@ -77,6 +77,15 @@ hammer2_iocom_init(hammer2_iocom_t *iocom, int sock_fd, int alt_fd)
        hammer2_ioq_init(iocom, &iocom->ioq_rx);
        hammer2_ioq_init(iocom, &iocom->ioq_tx);
 
+       /*
+        * Negotiate session crypto synchronously.  This will mark the
+        * connection as error'd if it fails.
+        */
+       hammer2_crypto_negotiate(iocom);
+
+       /*
+        * Make sure our fds are set to non-blocking for the iocom core.
+        */
        if (sock_fd >= 0)
                fcntl(sock_fd, F_SETFL, O_NONBLOCK);
 #if 0
index a970783..6167998 100644 (file)
  * SUCH DAMAGE.
  */
 
+/***************************************************************************
+ *                             CRYPTO HANDSHAKE                           *
+ ***************************************************************************
+ *
+ * The initial public-key exchange is implemented by transmitting a
+ * 512-byte buffer to the other side in a symmetrical fashion.  This
+ * buffer contains the following:
+ *
+ * (1) A random session key.
+ *
+ * (2) A verifier to determine that the decode was successful.  It encodes
+ *     an XOR of each group of 4 bytes from the session key.
+ *
+ * (3) Additional configuration and additional random data.
+ *
+ *     - The hammer2 message header magic for endian detect
+ *
+ *     - The hammer2 protocol version.  The two sides agree on the
+ *      smaller of the two.
+ *
+ *     - All unused fields (junk*) are filled with random data.
+ *
+ * This structure must be exactly 512 bytes and expects to use 256-byte
+ * RSA keys.
+ */
+struct hammer2_handshake {
+       char pad1[8];           /* 000 pad, tweaked so RSA block 0 fits modulus */
+       uint16_t magic;         /* 008 HAMMER2_MSGHDR_MAGIC for endian detect */
+       uint16_t version;       /* 00A hammer2 protocol version */
+       uint32_t flags;         /* 00C protocol extension flags */
+       uint8_t sess[64];       /* 010 512-bit session key */
+       uint8_t verf[16];       /* 050 verifier = XOR of each 4 sess bytes */
+       char quickmsg[32];      /* 060 reason for connecting */
+       char junk080[128];      /* 080-0FF random filler */
+       char pad2[8];           /* 100-107 as pad1, for 2nd 256-byte block */
+       char junk100[256-8];    /* 108-1FF random filler */
+};
+
+typedef struct hammer2_handshake hammer2_handshake_t;
+
 /***************************************************************************
  *                             LOW LEVEL MESSAGING                        *
  ***************************************************************************
@@ -92,14 +132,20 @@ struct hammer2_ioq {
 
 typedef struct hammer2_ioq hammer2_ioq_t;
 
-#define HAMMER2_IOQ_ERROR_SYNC 1               /* bad magic / out of sync */
-#define HAMMER2_IOQ_ERROR_EOF  2               /* unexpected EOF */
-#define HAMMER2_IOQ_ERROR_SOCK 3               /* read() error on socket */
-#define HAMMER2_IOQ_ERROR_FIELD        4               /* invalid field */
-#define HAMMER2_IOQ_ERROR_HCRC 5               /* core header crc bad */
-#define HAMMER2_IOQ_ERROR_XCRC 6               /* ext header crc bad */
-#define HAMMER2_IOQ_ERROR_ACRC 7               /* aux data crc bad */
-#define HAMMER2_IOQ_ERROR_STATE        8               /* bad state */
+#define HAMMER2_IOQ_ERROR_SYNC         1       /* bad magic / out of sync */
+#define HAMMER2_IOQ_ERROR_EOF          2       /* unexpected EOF */
+#define HAMMER2_IOQ_ERROR_SOCK         3       /* read() error on socket */
+#define HAMMER2_IOQ_ERROR_FIELD                4       /* invalid field */
+#define HAMMER2_IOQ_ERROR_HCRC         5       /* core header crc bad */
+#define HAMMER2_IOQ_ERROR_XCRC         6       /* ext header crc bad */
+#define HAMMER2_IOQ_ERROR_ACRC         7       /* aux data crc bad */
+#define HAMMER2_IOQ_ERROR_STATE                8       /* bad state */
+#define HAMMER2_IOQ_ERROR_NOPEER       9       /* bad socket peer */
+#define HAMMER2_IOQ_ERROR_NORKEY       10      /* no remote keyfile found */
+#define HAMMER2_IOQ_ERROR_NOLKEY       11      /* no local keyfile found */
+#define HAMMER2_IOQ_ERROR_KEYXCHGFAIL  12      /* key exchange failed */
+#define HAMMER2_IOQ_ERROR_KEYFMT       13      /* key file format problem */
+#define HAMMER2_IOQ_ERROR_BADURANDOM   14      /* /dev/urandom is bad */
 
 #define HAMMER2_IOQ_MAXIOVEC    16
 
@@ -117,6 +163,8 @@ struct hammer2_iocom {
        int     sock_fd;                        /* comm socket or pipe */
        int     alt_fd;                         /* thread signal, tty, etc */
        int     flags;
+       int     rxmisc;
+       int     txmisc;
        char    rxbuf[HAMMER2_MSGBUF_SIZE];     /* for ioq_rx only */
 };
 
index 1be34e6..5d39337 100644 (file)
@@ -233,7 +233,7 @@ main(int ac, char **av)
        uuid_to_string(&Hammer2_RPFSId, &rpfsidstr, &status);
 
        /*
-        * Calculate the amount of reserved space.  HAMMER2_RESERVE_SEG (4MB)
+        * Calculate the amount of reserved space.  HAMMER2_ZONE_SEG (4MB)
         * is reserved at the beginning of every 2GB of storage, rounded up.
         * Thus a 200MB filesystem will still have a 4MB reserve area.
         *
@@ -241,8 +241,8 @@ main(int ac, char **av)
         * reserve is used to help 'df' calculate the amount of available
         * space.
         */
-       reserved_space = ((total_space + HAMMER2_RESERVE_MASK64) /
-                         HAMMER2_RESERVE_BYTES64) * HAMMER2_RESERVE_SEG64;
+       reserved_space = ((total_space + HAMMER2_ZONE_MASK64) /
+                         HAMMER2_ZONE_BYTES64) * HAMMER2_ZONE_SEG64;
 
        free_space = total_space - reserved_space -
                     BootAreaSize - AuxAreaSize;
@@ -449,7 +449,7 @@ format_hammer2(int fd, hammer2_off_t total_space, hammer2_off_t free_space)
        hammer2_blockref_t root_blockref;
        uint64_t now;
        hammer2_off_t volu_base = 0;
-       hammer2_off_t boot_base = HAMMER2_RESERVE_SEG;
+       hammer2_off_t boot_base = HAMMER2_ZONE_SEG;
        hammer2_off_t aux_base = boot_base + BootAreaSize;
        hammer2_off_t alloc_base = aux_base + AuxAreaSize;
        hammer2_off_t tmp_base;
@@ -462,7 +462,7 @@ format_hammer2(int fd, hammer2_off_t total_space, hammer2_off_t free_space)
         */
        bzero(buf, HAMMER2_PBUFSIZE);
        tmp_base = volu_base;
-       for (i = 0; i < HAMMER2_RESERVE_BLOCKS; ++i) {
+       for (i = 0; i < HAMMER2_ZONE_BLOCKS_SEG; ++i) {
                n = pwrite(fd, buf, HAMMER2_PBUFSIZE, tmp_base);
                if (n != HAMMER2_PBUFSIZE) {
                        perror("write");
@@ -649,10 +649,10 @@ format_hammer2(int fd, hammer2_off_t total_space, hammer2_off_t free_space)
         * Write the volume header and all alternates.
         */
        for (i = 0; i < HAMMER2_NUM_VOLHDRS; ++i) {
-               if (i * HAMMER2_RESERVE_BYTES64 >= total_space)
+               if (i * HAMMER2_ZONE_BYTES64 >= total_space)
                        break;
                n = pwrite(fd, buf, HAMMER2_PBUFSIZE,
-                          volu_base + i * HAMMER2_RESERVE_BYTES64);
+                          volu_base + i * HAMMER2_ZONE_BYTES64);
                if (n != HAMMER2_PBUFSIZE) {
                        perror("write");
                        exit(1);
index 5e87f52..fe10627 100644 (file)
   This allows the writing of 0's to create holes and will be the default
   compression algorithm for HAMMER2.
 
-* Copies support for redundancy.  The media blockref structure would
-  have become too bloated but I found a clean way to do copies using the
-  blockset structure (which is a set of 8 fully associative blockref's).
+* Copies support for redundancy.  Each copy has its own blockref.  The
+  blockrefs representing the copies must exist within the same blockset
+  (set of 8 blockrefs), though I may relax this requirement in the
+  implementation.
 
   The design is such that the filesystem should be able to function at
   full speed even if disks are pulled or inserted, as long as at least one
@@ -87,6 +88,9 @@
   missing copies (or remove excessive copies in the case where the copies
   value is reduced on a live filesystem).
 
+  Copies are specified using the same copyinfo[] array that is used to
+  specify cluster interconnections for PFS's.
+
 * Clusterable with MESI cache coherency and dynamic granularity.
   The media format for HAMMER1 was less conducive to logical clustering
   than I had hoped so I was never able to get that aspect of my personal goals
@@ -229,23 +233,71 @@ structures generally follow the kernel's filesystem hiearchy.  Second,
 HAMMER2's writable snapshots make it possible to implement several forms
 of multi-master clustering.
 
-This is important: The mount device path you specify serves to bootstrap
-your entry into the cluster, but your mount will make active connections
-to ALL copy elements in the hammer2_copy_data[] array (stored in the volume
-header) which match the PFSID of the directory in the super-root that you
-specified.  The local media path does not have to be mentioned in this
-array but becomes part of the cluster based on its type and access
-rights.  ALL ELEMENTS ARE TREATED ACCORDING TO TYPE NO MATTER WHICH ONE
-YOU MOUNT FROM.
-
-The actual cluster may be far larger than the elements you list in the
-hammer2_copy_data[] array.  You list only the elements you wish to
-directly connect to and you are able to access the rest of the cluster
-indirectly through those connections.
-
-All nodes in the cluster may act as administrative proxies.  All nodes
-in the cluster, including your mount point, are classified as one of the
-following as specified in the inode's structure:
+The mount device path you specify serves to bootstrap your entry into
+the cluster.  This can be local media or directly specify a network
+cluster connection (or several).  When a local media mount is used the
+volume header is scanned for local copies and the best volume header is
+selected from all available copies.  Multiple devices may be specified for
+redundancy.
+
+The volume header on local media also contains cluster connection
+specifications keyed by super-root pfsid.  Network connections are
+maintained to all targets.  ALL ELEMENTS ARE TREATED ACCORDING TO TYPE
+NO MATTER WHICH ONE YOU MOUNT FROM.
+
+The actual networked cluster may be far larger than the elements you list
+in the hammer2_copy_data[] array, but your machine will only make direct
+connections as specified by the array.
+
+In the simplest case you simply network a few machines together as ring 0
+masters and each client connects directly to all the masters (and/or are
+the masters themselves).  Thus any quorum operation is straight-forward.
+These master nodes are labeled 'ring 0'.
+
+If you have too many clients to reasonably connect directly you set up
+sub-clusters as satellites.  This is called 'ring 1'.  Ring 1 may contain
+several sub-clusters.  A client then connects to all the nodes in a
+particular sub-cluster (typically 3).  The quorum protocol runs as per
+normal except that once the operation is resolved against the sub-cluster
+an aggregation must be resolved against the master nodes (ring 0).  The
+sub-cluster does this for the client... all the client sees is the normal
+quorum operation against the sub-cluster.
+
+Since each node in the sub-cluster connects to all master nodes we get
+a multiplication.  If we set a reasonable upper limit of, say, 256
+connections at each master node then ring 1 may contain 85 sub-clusters x 3
+nodes in each sub-cluster.
+
+In the most complex case when one wishes to support potentially millions
+of clients then further fan-out is required into ring 2, ring 3, and
+so forth.  However, each sub-cluster in ring 2 must only connect to
+1 sub-cluster in ring 1 (otherwise the cache state will become mightily
+confused).  Using reasonable metrics this will allow ring 2 to contain
+85 * 85 = 7225 sub-clusters.  At this point you could have 1000 clients
+connect to each sub-cluster and support 7.2 million clients, but if that
+isn't enough going to another ring will support 61M clients, and so forth.
+
+Each ring imposes additional latencies for cache operations but the key
+to making this work efficiently is that the satellite clusters can negotiate
+coarse-grained cache coherency locks with the next lower ring and then
+fan-out finer-grained locks to the next higher ring.  Since caching can
+occur anywhere (including on the connecting client), it is the cache
+coherency lock that ultimately dictates efficiency and allows a client
+(or satellite) to access large amounts of data from local storage.
+
+Modifying operations, particularly commits, also have higher latencies
+when multiple rings are in use.  In this situation it is possible to
+short-cut localized operations by having competing clients connect to
+sub-clusters which are near each other topologically... having the
+competing clients connect to the same sub-cluster would be the most optimal.
+
+In addition, sub-clusters (typically in ring 1) can act in SOFT_MASTER mode
+which allows the sub-cluster to acknowledge a full commit within its own
+quorum only, and then resolve asynchronously to the masters in ring 0.
+
+The nodes in these intermediate rings can be pure proxies with only memory
+caches, use local media for persistent cache, or use local media to
+completely slave the filesystem.
 
     ADMIN      - Media does not participate, administrative proxy only
     CACHE      - Media only acts as a persistent cache
index c3f5f98..6b4329a 100644 (file)
 #define HAMMER2_NEWFS_ALIGNMASK                (HAMMER2_VOLUME_ALIGN - 1)
 #define HAMMER2_NEWFS_ALIGNMASK64      ((hammer2_off_t)HAMMER2_NEWFS_ALIGNMASK)
 
-#define HAMMER2_RESERVE_BYTES64                (2LLU * 1024 * 1024 * 1024)
-#define HAMMER2_RESERVE_MASK64         (HAMMER2_RESERVE_BYTES64 - 1)
-#define HAMMER2_RESERVE_SEG            (4 * 1024 * 1024)
-#define HAMMER2_RESERVE_SEG64          ((hammer2_off_t)HAMMER2_RESERVE_SEG)
-#define HAMMER2_RESERVE_BLOCKS         (HAMMER2_RESERVE_SEG / HAMMER2_PBUFSIZE)
+#define HAMMER2_ZONE_BYTES64           (2LLU * 1024 * 1024 * 1024)
+#define HAMMER2_ZONE_MASK64            (HAMMER2_ZONE_BYTES64 - 1)
+#define HAMMER2_ZONE_SEG               (4 * 1024 * 1024)
+#define HAMMER2_ZONE_SEG64             ((hammer2_off_t)HAMMER2_ZONE_SEG)
+#define HAMMER2_ZONE_BLOCKS_SEG                (HAMMER2_ZONE_SEG / HAMMER2_PBUFSIZE)
 
 /*
  * Two linear areas can be reserved after the initial 2MB segment in the base
@@ -620,7 +620,7 @@ struct hammer2_copy_data {
        uint16_t flags;         /* 04-05 flags field */
        uint8_t error;          /* 06    last operational error */
        uint8_t priority;       /* 07    priority and round-robin flag */
-       uint8_t remote_pfstype; /* 08    probed direct remote PFS type */
+       uint8_t remote_pfs_type;/* 08    probed direct remote PFS type */
        uint8_t reserved08[23]; /* 09-1F */
        uuid_t  pfs_id;         /* 20-2F copy target must match this uuid */
        uint8_t label[16];      /* 30-3F import/export label */
@@ -657,10 +657,13 @@ typedef struct hammer2_copy_data hammer2_copy_data_t;
  * the tree which are stored in the volume header and must be tracked on
  * the fly.
  *
- * COPIES: Multiple copies may be specified on the mount line AND/OR you
- *        just specify one and the mount code tries to pick up the others
- *        from copyinfo[].  The copyid field in the volume header along
- *        with the fsid validates the copies.
+ * NOTE: The copyinfo[] array contains the configuration for both the
+ *      cluster connections and any local media copies.  The volume
+ *      header will be replicated for each local media copy.
+ *
+ *      The mount command may specify multiple medias or just one and
+ *      allow HAMMER2 to pick up the others when it checks the copyinfo[]
+ *      array on mount.
  *
  * NOTE: root_blockref points to the super-root directory, not the root
  *      directory.  The root directory will be a subdirectory under the
@@ -698,7 +701,7 @@ struct hammer2_volume_data {
        uint32_t        flags;                  /* 0034 */
        uint8_t         copyid;                 /* 0038 copyid of phys vol */
        uint8_t         freemap_version;        /* 0039 freemap algorithm */
-       uint8_t         reserved003A;           /* 003A */
+       uint8_t         pfstype;                /* 003A local media pfstype */
        uint8_t         reserved003B;           /* 003B */
        uint32_t        reserved003C;           /* 003C */
 
index 9d4958a..fb2180f 100644 (file)
@@ -857,7 +857,7 @@ hammer2_install_volume_header(hammer2_mount_t *hmp)
         * block device's EOF.
         */
        for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) {
-               error = bread(hmp->devvp, i * HAMMER2_RESERVE_BYTES64, 
+               error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
                              HAMMER2_VOLUME_BYTES, &bp);
                if (error) {
                        brelse(bp);