From 62efe6ec4de59dc37e5790107c400be430d3bc49 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Sat, 12 May 2012 00:43:26 -0700 Subject: [PATCH] hammer2 - Crypto handshake work for message stream * 'hammer2 debug' can now take a destination host argument (and will default to localhost). This is the debug shell connection. * Add 'hammer2 rsainit' to initialize hammer2's keys in /etc/hammer2/rsa.{pub,prv}. * Change the 'hammer2 node' directive to 'hammer2 service'. * Flesh out the initial public key exchange handshake. Currently the handshake consists of a symmetric 512 byte write and 512 byte read. The data is encrypted with our private key and the remote end's public key. Currently a very simple verifier has been constructed, but we will ultimately want to use sha or md5 or something like that for the verifier. Since I am doing a double-encryption here, the first stage encrypt has to check that the result does not exceed the modulus (typically bit 7 of the first byte must be zero). If it does, we increment a 32 bit quantity in our pad*[] area and retry until we get a good result. * The exchange is used to calculate the AES session key. Session encryption is not yet implemented. A random session key is sent by both ends. The actual session key will be the XOR of the one we send and the one we receive. * When a connection is accepted the remote end's public key is looked up in /etc/hammer2/remote/<peer-ip>.pub. If this file does not exist then the connection is not allowed (unless a matching <peer-ip>.none file marks the peer as an unencrypted link). 
--- sbin/hammer2/Makefile | 10 +- sbin/hammer2/cmd_debug.c | 16 +- sbin/hammer2/cmd_rsa.c | 382 ++++++++++++++++++++ sbin/hammer2/{cmd_node.c => cmd_service.c} | 58 ++- sbin/hammer2/crypto.c | 387 +++++++++++++++++++++ sbin/hammer2/hammer2.h | 14 +- sbin/hammer2/main.c | 62 +++- sbin/hammer2/msg.c | 9 + sbin/hammer2/network.h | 64 +++- sbin/newfs_hammer2/newfs_hammer2.c | 14 +- sys/vfs/hammer2/DESIGN | 92 +++-- sys/vfs/hammer2/hammer2_disk.h | 25 +- sys/vfs/hammer2/hammer2_vfsops.c | 2 +- 13 files changed, 1059 insertions(+), 76 deletions(-) create mode 100644 sbin/hammer2/cmd_rsa.c rename sbin/hammer2/{cmd_node.c => cmd_service.c} (79%) create mode 100644 sbin/hammer2/crypto.c diff --git a/sbin/hammer2/Makefile b/sbin/hammer2/Makefile index 49b4689a2e..35981b160e 100644 --- a/sbin/hammer2/Makefile +++ b/sbin/hammer2/Makefile @@ -1,14 +1,16 @@ PROG= hammer2 -SRCS= main.c subs.c icrc.c msg.c +SRCS= main.c subs.c icrc.c msg.c crypto.c SRCS+= cmd_remote.c cmd_snapshot.c cmd_pfs.c -SRCS+= cmd_node.c cmd_leaf.c cmd_debug.c +SRCS+= cmd_service.c cmd_leaf.c cmd_debug.c +SRCS+= cmd_rsa.c #MAN= hammer2.8 NOMAN= TRUE +DEBUG_FLAGS=-g CFLAGS+= -I${.CURDIR}/../../sys CFLAGS+= -pthread -LDADD= -lm -lutil -lmd -DPADD= ${LIBM} ${LIBUTIL} ${LIBMD} +LDADD= -lm -lutil -lmd -lcrypto +DPADD= ${LIBM} ${LIBUTIL} ${LIBMD} ${LIBCRYPTO} #.PATH: ${.CURDIR}/../../sys/libkern #SRCS+= crc32.c diff --git a/sbin/hammer2/cmd_debug.c b/sbin/hammer2/cmd_debug.c index 240ed29264..87add09ece 100644 --- a/sbin/hammer2/cmd_debug.c +++ b/sbin/hammer2/cmd_debug.c @@ -41,11 +41,12 @@ static void debug_tty(hammer2_iocom_t *iocom); static void hammer2_debug_parse(hammer2_msg_t *msg, char *cmdbuf); int -cmd_debug(void) +cmd_debug(const char *hostname) { struct sockaddr_in lsin; struct hammer2_iocom iocom; hammer2_msg_t *msg; + struct hostent *hen; int fd; /* @@ -64,6 +65,19 @@ cmd_debug(void) lsin.sin_family = AF_INET; lsin.sin_addr.s_addr = 0; lsin.sin_port = htons(HAMMER2_LISTEN_PORT); + + if (hostname) { + 
hen = gethostbyname2(hostname, AF_INET); + if (hen == NULL) { + if (inet_pton(AF_INET, hostname, &lsin.sin_addr) != 1) { + fprintf(stderr, + "Cannot resolve %s\n", hostname); + return 1; + } + } else { + bcopy(hen->h_addr, &lsin.sin_addr, hen->h_length); + } + } if (connect(fd, (struct sockaddr *)&lsin, sizeof(lsin)) < 0) { close(fd); fprintf(stderr, "debug: connect failed: %s\n", diff --git a/sbin/hammer2/cmd_rsa.c b/sbin/hammer2/cmd_rsa.c new file mode 100644 index 0000000000..b403b7f043 --- /dev/null +++ b/sbin/hammer2/cmd_rsa.c @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * by Venkatesh Srinivas + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "hammer2.h" + +#include +#include +
+/*
+ * Should be run as root.  Creates <dir_path>/rsa.prv and <dir_path>/rsa.pub
+ * by running openssl commands through system().
+ *
+ * An existing private key is never overwritten.  The public key is
+ * regenerated whenever a new private key had to be created or the public
+ * key file is missing.
+ *
+ * dir_path is pasted unquoted into a shell command line, so it must not
+ * contain whitespace or shell metacharacters.
+ *
+ * Returns 0 on success and 1 on failure (after printing a diagnostic).
+ */
+int
+cmd_rsainit(const char *dir_path)
+{
+	struct stat st;
+	mode_t old_umask;
+	char *str1 = NULL;		/* private key path */
+	char *str2 = NULL;		/* public key path */
+	char *cmd;
+	char *scan;
+	int ecode;
+	int rc = 1;
+
+	/*
+	 * Create the directory if necessary.  Intermediate components are
+	 * created 0755 and the key directory itself 0700.  mkdir() errors
+	 * are deliberately ignored (components usually exist already); a
+	 * real problem surfaces when openssl cannot write the key below.
+	 */
+	if (stat(dir_path, &st) < 0) {
+		str1 = strdup(dir_path);
+		if (str1 == NULL)
+			goto nomem;
+		scan = str1 + (str1[0] == '/');	/* skip the root slash */
+		while ((scan = strchr(scan, '/')) != NULL) {
+			*scan = 0;
+			mkdir(str1, 0755);
+			*scan++ = '/';
+		}
+		mkdir(str1, 0700);
+		free(str1);
+		str1 = NULL;
+	}
+	if (asprintf(&str1, "%s/rsa.prv", dir_path) < 0) {
+		str1 = NULL;
+		goto nomem;
+	}
+	if (asprintf(&str2, "%s/rsa.pub", dir_path) < 0) {
+		str2 = NULL;
+		goto nomem;
+	}
+
+	if (stat(str1, &st) < 0) {
+		if (asprintf(&cmd, "openssl genrsa -out %s 2048", str1) < 0)
+			goto nomem;
+		/*
+		 * The restrictive umask must be in effect while openssl
+		 * runs, otherwise the private key is briefly created with
+		 * the caller's default permissions.
+		 */
+		old_umask = umask(077);
+		ecode = system(cmd);
+		umask(old_umask);
+		free(cmd);
+		chmod(str1, 0400);
+		if (ecode) {
+			fprintf(stderr,
+				"hammer2 rsainit: private key gen failed\n");
+			goto done;
+		}
+		printf("hammer2 rsainit: created %s\n", str1);
+		remove(str2);		/* stale, belongs to the old key */
+	} else {
+		printf("hammer2 rsainit: Using existing private key in %s\n",
+		       str1);
+	}
+	if (stat(str2, &st) < 0) {
+		if (asprintf(&cmd, "openssl rsa -in %s -out %s -pubout",
+			     str1, str2) < 0)
+			goto nomem;
+		ecode = system(cmd);
+		free(cmd);
+		if (ecode) {
+			fprintf(stderr,
+				"hammer2 rsainit: public key gen failed\n");
+			goto done;
+		}
+		printf("hammer2 rsainit: created %s\n", str2);
+	} else {
+		printf("hammer2 rsainit: both keys already exist\n");
+	}
+	rc = 0;
+	goto done;
+nomem:
+	fprintf(stderr, "hammer2 rsainit: out of memory\n");
+done:
+	free(str2);
+	free(str1);
+	return rc;
+}
+
+/*
+ * Load the nkeys PEM files named by keyfiles[] into keys[].  A file name
+ * ending in ".pub" is parsed as a public key (ispub[i] = 1), one ending in
+ * ".prv" as a private key (ispub[i] = 0).  All keys must have the same
+ * RSA block size, which is stored in *blksizep.
+ *
+ * Returns 0 on success, 1 after printing a diagnostic tagged with cmdname.
+ * Keys loaded before a failure stay in keys[] for the caller to free.
+ */
+static int
+rsa_load_keys(const char *cmdname, const char **keyfiles, int nkeys,
+	      RSA **keys, int *ispub, int *blksizep)
+{
+	const char *sfx;
+	FILE *fp;
+	int i;
+
+	for (i = 0; i < nkeys; ++i) {
+		sfx = strrchr(keyfiles[i], '.');
+		if (sfx && strcmp(sfx, ".pub") == 0) {
+			ispub[i] = 1;
+		} else if (sfx && strcmp(sfx, ".prv") == 0) {
+			ispub[i] = 0;
+		} else {
+			fprintf(stderr, "hammer2: %s: key files must end "
+					"in .pub or .prv\n", cmdname);
+			return 1;
+		}
+		fp = fopen(keyfiles[i], "r");
+		if (fp == NULL) {
+			fprintf(stderr, "hammer2 %s: unable to "
+					"open %s\n", cmdname, keyfiles[i]);
+			return 1;
+		}
+		if (ispub[i])
+			keys[i] = PEM_read_RSA_PUBKEY(fp, NULL, NULL, NULL);
+		else
+			keys[i] = PEM_read_RSAPrivateKey(fp, NULL, NULL, NULL);
+		fclose(fp);
+		if (keys[i] == NULL) {
+			fprintf(stderr, "hammer2 %s: unable to "
+					"parse %s key from %s\n",
+				cmdname, ispub[i] ? "public" : "private",
+				keyfiles[i]);
+			return 1;
+		}
+		/* user-supplied files: report a mismatch, do not assert */
+		if (i == 0) {
+			*blksizep = RSA_size(keys[i]);
+		} else if (*blksizep != RSA_size(keys[i])) {
+			fprintf(stderr, "hammer2 %s: key size of %s does "
+					"not match %s\n",
+				cmdname, keyfiles[i], keyfiles[0]);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Run one blksize-byte block through every key in order.  The block is
+ * read from data_in and the final result is left in data_out; data_in is
+ * used as scratch space between stages.  No padding is applied.
+ *
+ * Returns 0 on success, -1 if any RSA operation fails.
+ */
+static int
+rsa_xform_block(RSA **keys, const int *ispub, int nkeys, int decrypt,
+		int blksize, unsigned char *data_in, unsigned char *data_out)
+{
+	int i;
+	int r;
+
+	for (i = 0; i < nkeys; ++i) {
+		if (decrypt && ispub[i])
+			r = RSA_public_decrypt(blksize, data_in, data_out,
+					       keys[i], RSA_NO_PADDING);
+		else if (decrypt)
+			r = RSA_private_decrypt(blksize, data_in, data_out,
+						keys[i], RSA_NO_PADDING);
+		else if (ispub[i])
+			r = RSA_public_encrypt(blksize, data_in, data_out,
+					       keys[i], RSA_NO_PADDING);
+		else
+			r = RSA_private_encrypt(blksize, data_in, data_out,
+						keys[i], RSA_NO_PADDING);
+		if (r < 0)
+			return -1;
+		if (i + 1 != nkeys)
+			bcopy(data_out, data_in, blksize);
+	}
+	return 0;
+}
+
+/*
+ * Common body of rsaenc/rsadec: copy stdin to stdout one RSA block at a
+ * time, running each block through the given key files in order.  A short
+ * final block is zero-padded to the block size.
+ *
+ * Returns 0 on success, 1 on any failure (after printing a diagnostic).
+ */
+static int
+rsa_stream(const char *cmdname, const char **keyfiles, int nkeys, int decrypt)
+{
+	RSA **keys;
+	int *ispub;
+	unsigned char *data_in = NULL;
+	unsigned char *data_out = NULL;
+	ssize_t n;
+	int ecode = 1;
+	int blksize = 0;
+	int off = 0;
+	int i;
+
+	keys = calloc(nkeys, sizeof(*keys));
+	ispub = calloc(nkeys, sizeof(*ispub));
+	if (keys == NULL || ispub == NULL) {
+		fprintf(stderr, "hammer2 %s: out of memory\n", cmdname);
+		goto done;
+	}
+	if (rsa_load_keys(cmdname, keyfiles, nkeys, keys, ispub, &blksize))
+		goto done;
+	if (!decrypt)
+		fprintf(stderr, "blksize %d\n", blksize);
+
+	data_in = malloc(blksize);
+	data_out = malloc(blksize);
+	if (data_in == NULL || data_out == NULL) {
+		fprintf(stderr, "hammer2 %s: out of memory\n", cmdname);
+		goto done;
+	}
+
+	ecode = 0;
+	for (;;) {
+		n = read(0, data_in + off, blksize - off);
+		if (n < 0) {
+			perror("read");
+			ecode = 1;
+		}
+		if (n > 0) {
+			off += (int)n;
+			if (off < blksize)
+				continue;
+		} else if (off > 0) {
+			/* EOF or read error mid-block: zero-pad the tail */
+			bzero(data_in + off, blksize - off);
+		} else {
+			break;
+		}
+		if (rsa_xform_block(keys, ispub, nkeys, decrypt,
+				    blksize, data_in, data_out) < 0) {
+			fprintf(stderr, "hammer2 %s: RSA operation failed\n",
+				cmdname);
+			ecode = 1;
+			break;
+		}
+		if (write(1, data_out, blksize) != blksize) {
+			perror("write");
+			ecode = 1;
+			break;
+		}
+		off = 0;
+		if (n <= 0)
+			break;
+	}
+done:
+	free(data_out);
+	free(data_in);
+	if (keys) {
+		for (i = 0; i < nkeys; ++i) {
+			if (keys[i])
+				RSA_free(keys[i]);
+		}
+	}
+	free(keys);
+	free(ispub);
+	return (ecode);
+}
+
+/*
+ * Encrypt stdin to stdout by running it through the given public (.pub)
+ * and/or private (.prv) key files, in order.  Debugging aid only: no
+ * padding is added.  Returns 0 on success, 1 on failure.
+ */
+int
+cmd_rsaenc(const char **keyfiles, int nkeys)
+{
+	return (rsa_stream("rsaenc", keyfiles, nkeys, 0));
+}
+
+/*
+ * Decrypt stdin to stdout by running it through the given public (.pub)
+ * and/or private (.prv) key files, in order.  Debugging aid only: no
+ * padding is removed.  Returns 0 on success, 1 on failure.
+ */
+int
+cmd_rsadec(const char **keyfiles, int nkeys)
+{
+	return (rsa_stream("rsadec", keyfiles, nkeys, 1));
+}
diff --git a/sbin/hammer2/cmd_node.c b/sbin/hammer2/cmd_service.c similarity index 79% rename from sbin/hammer2/cmd_node.c rename to sbin/hammer2/cmd_service.c index 0fba7b3b0e..fb4344d384 100644 --- a/sbin/hammer2/cmd_node.c +++ b/sbin/hammer2/cmd_service.c @@ -35,10 +35,12 @@ #include "hammer2.h" -static void *node_master_accept(void *data); -static void *node_master_service(void *data); -static void node_master_recv(hammer2_iocom_t *iocom); -static void node_master_send(hammer2_iocom_t 
*iocom); +static void *master_accept(void *data); +static void *master_service(void *data); +static void master_auth_rx(hammer2_iocom_t *iocom); +static void master_auth_tx(hammer2_iocom_t *iocom); +static void master_link_rx(hammer2_iocom_t *iocom); +static void master_link_tx(hammer2_iocom_t *iocom); /* * Start-up the master listener daemon for the machine. @@ -58,7 +60,7 @@ static void node_master_send(hammer2_iocom_t *iocom); * Backbones are specified via /etc/hammer2.conf. */ int -cmd_node(void) +cmd_service(void) { struct sockaddr_in lsin; int on; @@ -68,7 +70,7 @@ cmd_node(void) * Acquire socket and set options */ if ((lfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - fprintf(stderr, "node_master_listen: socket(): %s\n", + fprintf(stderr, "master_listen: socket(): %s\n", strerror(errno)); return 1; } @@ -100,7 +102,7 @@ cmd_node(void) * In debug mode this call will create the pthread without forking * and set NormalExit to 0, instead of fork. */ - hammer2_demon(node_master_accept, (void *)(intptr_t)lfd); + hammer2_demon(master_accept, (void *)(intptr_t)lfd); if (NormalExit) close(lfd); return 0; @@ -112,7 +114,7 @@ cmd_node(void) */ static void * -node_master_accept(void *data) +master_accept(void *data) { struct sockaddr_in asin; socklen_t alen; @@ -139,9 +141,9 @@ node_master_accept(void *data) break; } thread = NULL; - fprintf(stderr, "node_master_accept: accept fd %d\n", fd); + fprintf(stderr, "master_accept: accept fd %d\n", fd); pthread_create(&thread, NULL, - node_master_service, (void *)(intptr_t)fd); + master_service, (void *)(intptr_t)fd); } return (NULL); } @@ -151,14 +153,14 @@ node_master_accept(void *data) */ static void * -node_master_service(void *data) +master_service(void *data) { hammer2_iocom_t iocom; int fd; fd = (int)(intptr_t)data; hammer2_iocom_init(&iocom, fd, -1); - hammer2_iocom_core(&iocom, node_master_recv, node_master_send, NULL); + hammer2_iocom_core(&iocom, master_auth_rx, master_auth_tx, NULL); fprintf(stderr, "iocom on fd %d 
terminated error rx=%d, tx=%d\n", @@ -168,13 +170,39 @@ node_master_service(void *data) return (NULL); } +/************************************************************************ + * AUTHENTICATION * + ************************************************************************ + * + * Additional messaging-based authentication must occur before normal + * message operation. The connection has already been encrypted at + * this point. + */ +static +void +master_auth_rx(hammer2_iocom_t *iocom __unused) +{ + printf("AUTHRX\n"); + iocom->recvmsg_callback = master_link_rx; + iocom->sendmsg_callback = master_link_tx; +} + +static +void +master_auth_tx(hammer2_iocom_t *iocom __unused) +{ + printf("AUTHTX\n"); + iocom->recvmsg_callback = master_link_rx; + iocom->sendmsg_callback = master_link_tx; +} + /* * Callback from hammer2_iocom_core() when messages might be present * on the socket. */ static void -node_master_recv(hammer2_iocom_t *iocom) +master_link_rx(hammer2_iocom_t *iocom) { hammer2_msg_t *msg; @@ -196,7 +224,7 @@ node_master_recv(hammer2_iocom_t *iocom) } if (iocom->ioq_rx.error) { fprintf(stderr, - "node_master_recv: comm error %d\n", + "master_recv: comm error %d\n", iocom->ioq_rx.error); } } @@ -207,7 +235,7 @@ node_master_recv(hammer2_iocom_t *iocom) */ static void -node_master_send(hammer2_iocom_t *iocom) +master_link_tx(hammer2_iocom_t *iocom) { hammer2_iocom_flush(iocom); } diff --git a/sbin/hammer2/crypto.c b/sbin/hammer2/crypto.c new file mode 100644 index 0000000000..947a0cf700 --- /dev/null +++ b/sbin/hammer2/crypto.c @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * by Venkatesh Srinivas + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "hammer2.h" + +#include +#include +#include + +/* + * Synchronously negotiate crypto for a new session. This must occur + * within 10 seconds or the connection is error'd out. + * + * We work off the IP address and/or reverse DNS. The IP address is + * checked first, followed by the IP address at various levels of granularity, + * followed by the full domain name and domain names at various levels of + * granularity. + * + * /etc/hammer2/remote/.pub - Contains a public key + * /etc/hammer2/remote/.none - Indicates no encryption (empty file) + * (e.g. localhost.none). 
+ *
+ * We first attempt to locate a public key file based on the peer address or
+ * peer FQDN.
+ *
+ *	<name>.none	- No further negotiation is needed.  We simply return.
+ *			  All communication proceeds without encryption.
+ *			  No public key handshake occurs in this situation.
+ *			  (both ends must match).
+ *
+ *	<name>.pub	- We have located the public key for the peer.  Both
+ *			  sides transmit a block encrypted with their private
+ *			  keys and the peer's public key.
+ *
+ *			  Both sides receive a block and decrypt it.
+ *
+ *			  Both sides formulate a reply using the decrypted
+ *			  block and transmit it.
+ *
+ *			  communication proceeds with the negotiated session
+ *			  key (typically AES-256-CBC).
+ *
+ * If we fail to locate the appropriate file and no floating.db exists the
+ * connection is terminated without further action.
+ *
+ * If floating.db exists the connection proceeds with a floating negotiation.
+ */
+typedef union {
+	struct sockaddr sa;
+	struct sockaddr_in sa_in;
+	struct sockaddr_in6 sa_in6;
+} sockaddr_any_t;
+
+/*
+ * Mark the connection as failed: record the HAMMER2_IOQ_ERROR_* code in
+ * the receive queue, flag EOF on the iocom and, in debug mode, print why.
+ */
+static void
+hammer2_crypto_fail(hammer2_iocom_t *iocom, int error, const char *why)
+{
+	iocom->ioq_rx.error = error;
+	iocom->flags |= HAMMER2_IOCOMF_EOF;
+	if (DebugOpt)
+		fprintf(stderr, "%s\n", why);
+}
+
+/*
+ * Run the handshake on iocom->sock_fd using blocking reads and writes.
+ *
+ * Nothing is returned.  On failure iocom->ioq_rx.error is set to a
+ * HAMMER2_IOQ_ERROR_* code and HAMMER2_IOCOMF_EOF is set in iocom->flags;
+ * on success neither is modified.
+ *
+ * NOTE: the received handshake is only validated here.  The session key
+ *	 is not yet stored or applied to the connection.
+ */
+void
+hammer2_crypto_negotiate(hammer2_iocom_t *iocom)
+{
+	sockaddr_any_t sa;
+	socklen_t salen = sizeof(sa);
+	char peername[128];
+	char realname[128];
+	/* room for "<remote dir>/<peername>.none" */
+	char path[sizeof(HAMMER2_PATH_REMOTE) + sizeof(peername) + 8];
+	hammer2_handshake_t handtx;
+	hammer2_handshake_t handrx;
+	unsigned char buf[sizeof(handtx)];
+	unsigned char *ptr;
+	uint32_t nonce;
+	struct stat st;
+	FILE *fp;
+	RSA *keys[3] = { NULL, NULL, NULL };	/* remote pub, our pub, our prv */
+	size_t i;
+	size_t blksize;
+	size_t blkmask;
+	ssize_t n;
+	int decfail = 0;
+	int fd;
+
+	/*
+	 * Get the peer IP address for the connection as a string.
+	 */
+	if (getpeername(iocom->sock_fd, &sa.sa, &salen) < 0) {
+		hammer2_crypto_fail(iocom, HAMMER2_IOQ_ERROR_NOPEER,
+				    "accept: getpeername() failed");
+		goto done;
+	}
+	/* getnameinfo() reports failure with a non-zero EAI_* code */
+	if (getnameinfo(&sa.sa, salen, peername, sizeof(peername),
+			NULL, 0, NI_NUMERICHOST) != 0) {
+		hammer2_crypto_fail(iocom, HAMMER2_IOQ_ERROR_NOPEER,
+				    "accept: cannot decode sockaddr");
+		goto done;
+	}
+	if (DebugOpt) {
+		if (realhostname_sa(realname, sizeof(realname),
+				    &sa.sa, salen) == HOSTNAME_FOUND) {
+			fprintf(stderr, "accept from %s (%s)\n",
+				peername, realname);
+		} else {
+			fprintf(stderr, "accept from %s\n", peername);
+		}
+	}
+
+	/*
+	 * Find the remote host's public key.  Without one, the peer must
+	 * be explicitly whitelisted as unencrypted by a .none file.
+	 */
+	snprintf(path, sizeof(path), "%s/%s.pub",
+		 HAMMER2_PATH_REMOTE, peername);
+	if ((fp = fopen(path, "r")) == NULL) {
+		snprintf(path, sizeof(path), "%s/%s.none",
+			 HAMMER2_PATH_REMOTE, peername);
+		if (stat(path, &st) < 0) {
+			hammer2_crypto_fail(iocom, HAMMER2_IOQ_ERROR_NORKEY,
+					    "auth failure: unknown host");
+			goto done;
+		}
+		if (DebugOpt)
+			fprintf(stderr, "auth succeeded, unencrypted link\n");
+	} else {
+		keys[0] = PEM_read_RSA_PUBKEY(fp, NULL, NULL, NULL);
+		fclose(fp);
+		if (keys[0] == NULL) {
+			hammer2_crypto_fail(iocom, HAMMER2_IOQ_ERROR_KEYFMT,
+					    "auth failure: bad key format");
+			goto done;
+		}
+	}
+
+	/*
+	 * Get our public and private keys
+	 */
+	if ((fp = fopen(HAMMER2_DEFAULT_DIR "/rsa.pub", "r")) == NULL) {
+		hammer2_crypto_fail(iocom, HAMMER2_IOQ_ERROR_NOLKEY,
+				    "auth failure: cannot open host public key");
+		goto done;
+	}
+	keys[1] = PEM_read_RSA_PUBKEY(fp, NULL, NULL, NULL);
+	fclose(fp);
+	if (keys[1] == NULL) {
+		hammer2_crypto_fail(iocom, HAMMER2_IOQ_ERROR_KEYFMT,
+				    "auth failure: bad host key format");
+		goto done;
+	}
+
+	if ((fp = fopen(HAMMER2_DEFAULT_DIR "/rsa.prv", "r")) == NULL) {
+		hammer2_crypto_fail(iocom, HAMMER2_IOQ_ERROR_NOLKEY,
+				    "auth failure: cannot open host private key");
+		goto done;
+	}
+	keys[2] = PEM_read_RSAPrivateKey(fp, NULL, NULL, NULL);
+	fclose(fp);
+	if (keys[2] == NULL) {
+		hammer2_crypto_fail(iocom, HAMMER2_IOQ_ERROR_KEYFMT,
+				    "auth failure: bad host key format");
+		goto done;
+	}
+
+	/*
+	 * public key encrypt/decrypt block size.  The handshake must be a
+	 * whole number of blocks, which also guarantees that buf[] can
+	 * hold one block.
+	 */
+	if (keys[0]) {
+		blksize = (size_t)RSA_size(keys[0]);
+		if (blksize != (size_t)RSA_size(keys[1]) ||
+		    blksize != (size_t)RSA_size(keys[2]) ||
+		    sizeof(handtx) % blksize != 0) {
+			hammer2_crypto_fail(iocom, HAMMER2_IOQ_ERROR_KEYFMT,
+					    "auth failure: key size mismatch");
+			goto done;
+		}
+	} else {
+		blksize = sizeof(handtx);
+	}
+	blkmask = blksize - 1;
+
+	bzero(&handrx, sizeof(handrx));
+	bzero(&handtx, sizeof(handtx));
+
+	/*
+	 * Fill all unused fields (particular all junk fields) with random
+	 * data, and also set the session key.  Refuse to continue if the
+	 * source is a regular file or hands back all zeros.
+	 */
+	fd = open("/dev/urandom", O_RDONLY);
+	if (fd < 0 ||
+	    fstat(fd, &st) < 0 ||	/* something wrong */
+	    S_ISREG(st.st_mode) ||	/* supposed to be a RNG dev! */
+	    read(fd, &handtx, sizeof(handtx)) != sizeof(handtx) ||
+	    bcmp(&handrx, &handtx, sizeof(handtx)) == 0) {
+		if (fd >= 0)
+			close(fd);
+		hammer2_crypto_fail(iocom, HAMMER2_IOQ_ERROR_BADURANDOM,
+				    "auth failure: bad rng");
+		goto done;
+	}
+	close(fd);
+	ERR_load_crypto_strings();
+
+	/*
+	 * Handshake with the remote.
+	 *
+	 *	Encrypt with my private and remote's public
+	 *	Decrypt with my private and remote's public
+	 *
+	 * When encrypting we have to make sure our buffer fits within the
+	 * modulus, which typically requires bit 7 of the first byte to be
+	 * zero.  To be safe make sure that bit 7 and bit 6 is zero.
+	 */
+	snprintf(handtx.quickmsg, sizeof(handtx.quickmsg), "Testing 1 2 3");
+	handtx.magic = HAMMER2_MSGHDR_MAGIC;
+	handtx.version = 1;
+	handtx.flags = 0;
+	assert(sizeof(handtx.verf) * 4 == sizeof(handtx.sess));
+	bzero(handtx.verf, sizeof(handtx.verf));
+
+	handtx.pad1[0] &= 0x3f;	/* message must fit within modulus */
+	handtx.pad2[0] &= 0x3f;	/* message must fit within modulus */
+
+	/* verifier: XOR of each group of 4 session key bytes */
+	for (i = 0; i < sizeof(handtx.sess); ++i)
+		handtx.verf[i / 4] ^= handtx.sess[i];
+
+	/*
+	 * Write handshake buffer to remote
+	 */
+	for (i = 0; i < sizeof(handtx); i += blksize) {
+		ptr = (unsigned char *)&handtx + i;
+		if (keys[0]) {
+			/*
+			 * Since we are double-encrypting we have to make
+			 * sure that the result of the first stage does
+			 * not blow out the modulus for the second stage.
+			 *
+			 * The pointer is pointing to the pad*[] area so
+			 * we can mess with that until the first stage
+			 * is legal.  The 32-bit counter at offset 4 is
+			 * bumped through a copy so that random pad bytes
+			 * can neither overflow a signed int nor be
+			 * accessed through a mistyped pointer.
+			 *
+			 * A block that failed to encrypt is never sent;
+			 * it would still hold the plaintext session key.
+			 */
+			do {
+				bcopy(ptr + 4, &nonce, sizeof(nonce));
+				++nonce;
+				bcopy(&nonce, ptr + 4, sizeof(nonce));
+				if (RSA_private_encrypt((int)blksize, ptr, buf,
+					    keys[2], RSA_NO_PADDING) < 0) {
+					hammer2_crypto_fail(iocom,
+					    HAMMER2_IOQ_ERROR_KEYXCHGFAIL,
+					    "auth failure: key exchange "
+					    "failure during encryption");
+					goto done;
+				}
+			} while (buf[0] & 0xC0);
+
+			if (RSA_public_encrypt((int)blksize, buf, ptr,
+					    keys[0], RSA_NO_PADDING) < 0) {
+				hammer2_crypto_fail(iocom,
+				    HAMMER2_IOQ_ERROR_KEYXCHGFAIL,
+				    "auth failure: key exchange "
+				    "failure during encryption");
+				goto done;
+			}
+		}
+		if (write(iocom->sock_fd, ptr, blksize) != (ssize_t)blksize) {
+			fprintf(stderr, "WRITE ERROR\n");
+			hammer2_crypto_fail(iocom,
+			    HAMMER2_IOQ_ERROR_KEYXCHGFAIL,
+			    "auth failure: handshake write failed");
+			goto done;
+		}
+	}
+
+	/*
+	 * Read handshake buffer from remote, decrypting each block in
+	 * place as soon as it is complete.  A decryption error is only
+	 * latched here and acted on after the loop.
+	 */
+	i = 0;
+	while (i < sizeof(handrx)) {
+		ptr = (unsigned char *)&handrx + i;
+		n = read(iocom->sock_fd, ptr, blksize - (i & blkmask));
+		if (n <= 0)
+			break;
+		ptr -= (i & blkmask);	/* back up to the block start */
+		i += n;
+		if (keys[0] && (i & blkmask) == 0) {
+			if (RSA_private_decrypt((int)blksize, ptr, buf,
+					    keys[2], RSA_NO_PADDING) < 0)
+				decfail = 1;
+			if (RSA_public_decrypt((int)blksize, buf, ptr,
+					    keys[0], RSA_NO_PADDING) < 0)
+				decfail = 1;
+		}
+	}
+	if (decfail) {
+		hammer2_crypto_fail(iocom, HAMMER2_IOQ_ERROR_KEYXCHGFAIL,
+				    "auth failure: key exchange failure "
+				    "during decryption");
+		goto done;
+	}
+
+	/*
+	 * Validate the received data.  Try to make this a constant-time
+	 * algorithm.
+	 */
+	if (i != sizeof(handrx)) {
+		hammer2_crypto_fail(iocom, HAMMER2_IOQ_ERROR_KEYXCHGFAIL,
+				    "auth failure: key exchange failure");
+		goto done;
+	}
+
+	if (handrx.magic == HAMMER2_MSGHDR_MAGIC_REV) {
+		handrx.version = bswap16(handrx.version);
+		handrx.flags = bswap32(handrx.flags);
+	}
+	/* folding sess back into verf must leave all zeros */
+	for (i = 0; i < sizeof(handrx.sess); ++i)
+		handrx.verf[i / 4] ^= handrx.sess[i];
+	n = 0;
+	for (i = 0; i < sizeof(handrx.verf); ++i)
+		n += handrx.verf[i];
+	if (handrx.version != 1)
+		++n;
+	if (n != 0) {
+		hammer2_crypto_fail(iocom, HAMMER2_IOQ_ERROR_KEYXCHGFAIL,
+				    "auth failure: key exchange failure");
+		goto done;
+	}
+
+	if (DebugOpt) {
+		/* quickmsg comes from the peer and may lack a terminator */
+		fprintf(stderr, "Remote data: %.*s\n",
+			(int)sizeof(handrx.quickmsg), handrx.quickmsg);
+	}
+done:
+	for (i = 0; i < sizeof(keys) / sizeof(keys[0]); ++i) {
+		if (keys[i])
+			RSA_free(keys[i]);
+	}
+}
diff --git a/sbin/hammer2/hammer2.h b/sbin/hammer2/hammer2.h index d3a21f8909..fc3fc051b8 100644 --- a/sbin/hammer2/hammer2.h +++ b/sbin/hammer2/hammer2.h @@ -72,8 +72,13 @@ #include #include +#include + #include "network.h" +#define HAMMER2_DEFAULT_DIR "/etc/hammer2" +#define HAMMER2_PATH_REMOTE HAMMER2_DEFAULT_DIR "/remote" + extern int DebugOpt; extern int NormalExit; @@ -90,9 +95,12 @@ int cmd_pfs_create(const char *sel_path, const char *name, uint8_t pfs_type, const char *uuid_str); int cmd_pfs_delete(const char *sel_path, const char *name); -int cmd_node(void); +int cmd_service(void); int cmd_leaf(const char *sel_path); -int cmd_debug(void); +int cmd_debug(const char *hostname); +int cmd_rsainit(const char *dir_path); +int cmd_rsaenc(const char **keys, int nkeys); +int cmd_rsadec(const char **keys, int nkeys); void hammer2_ioq_init(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq); void hammer2_ioq_done(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq); @@ -116,5 +124,7 @@ void hammer2_ioq_stream(hammer2_msg_t *msg, int reply); void hammer2_iocom_drain(hammer2_iocom_t *iocom); void hammer2_iocom_flush(hammer2_iocom_t *iocom); +void 
hammer2_crypto_negotiate(hammer2_iocom_t *iocom); + void hammer2_debug_remote(hammer2_msg_t *msg); void msg_printf(hammer2_msg_t *msg, const char *ctl, ...); diff --git a/sbin/hammer2/main.c b/sbin/hammer2/main.c index 570948b827..2be46622fd 100644 --- a/sbin/hammer2/main.c +++ b/sbin/hammer2/main.c @@ -45,12 +45,15 @@ main(int ac, char **av) { const char *sel_path = NULL; const char *uuid_str = NULL; + const char *arg; int pfs_type = HAMMER2_PFSTYPE_NONE; int quick_opt = 0; int all_opt = 0; int ecode = 0; int ch; + srandomdev(); + /* * Core options */ @@ -172,14 +175,14 @@ main(int ac, char **av) * Create snapshot with optional pfs_type and optional * label override. */ - } else if (strcmp(av[0], "node") == 0) { + } else if (strcmp(av[0], "service") == 0) { /* - * Start the master node daemon. This daemon accepts - * connections from local and remote clients, implements - * and maintains the spanning tree protocol, and manages - * the core messaging protocol. + * Start the service daemon. This daemon accepts + * connections from local and remote clients, handles + * the security handshake, and manages the core messaging + * protocol. */ - ecode = cmd_node(); + ecode = cmd_service(); } else if (strcmp(av[0], "leaf") == 0) { /* * Start the management daemon for a specific PFS. @@ -209,7 +212,52 @@ main(int ac, char **av) * Connect to the command line monitor in the hammer2 master * node for the machine using HAMMER2_DBG_SHELL messages. */ - ecode = cmd_debug(); + ecode = cmd_debug((ac < 2) ? NULL : av[1]); + } else if (strcmp(av[0], "rsainit") == 0) { + /* + * Initialize a RSA keypair. If no target directory is + * specified we default to "/etc/hammer2". + */ + arg = (ac < 2) ? HAMMER2_DEFAULT_DIR : av[1]; + ecode = cmd_rsainit(arg); + } else if (strcmp(av[0], "rsaenc") == 0) { + /* + * Encrypt the input symmetrically by running it through + * the specified public and/or private key files. 
+ * + * If no key files are specified data is encoded using + * "/etc/hammer2/rsa.pub". + * + * WARNING: no padding is added, data stream must contain + * random padding for this to be secure. + * + * Used for debugging only + */ + if (ac == 1) { + const char *rsapath = HAMMER2_DEFAULT_DIR "/rsa.pub"; + ecode = cmd_rsaenc(&rsapath, 1); + } else { + ecode = cmd_rsaenc((const char **)&av[1], ac - 1); + } + } else if (strcmp(av[0], "rsadec") == 0) { + /* + * Decrypt the input symmetrically by running it through + * the specified public and/or private key files. + * + * If no key files are specified data is decoded using + * "/etc/hammer2/rsa.prv". + * + * WARNING: no padding is added, data stream must contain + * random padding for this to be secure. + * + * Used for debugging only + */ + if (ac == 1) { + const char *rsapath = HAMMER2_DEFAULT_DIR "/rsa.prv"; + ecode = cmd_rsadec(&rsapath, 1); + } else { + ecode = cmd_rsadec((const char **)&av[1], ac - 1); + } } else { fprintf(stderr, "Unrecognized command: %s\n", av[0]); usage(1); diff --git a/sbin/hammer2/msg.c b/sbin/hammer2/msg.c index e0ac85c714..322ef779ec 100644 --- a/sbin/hammer2/msg.c +++ b/sbin/hammer2/msg.c @@ -77,6 +77,15 @@ hammer2_iocom_init(hammer2_iocom_t *iocom, int sock_fd, int alt_fd) hammer2_ioq_init(iocom, &iocom->ioq_rx); hammer2_ioq_init(iocom, &iocom->ioq_tx); + /* + * Negotiate session crypto synchronously. This will mark the + * connection as error'd if it fails. + */ + hammer2_crypto_negotiate(iocom); + + /* + * Make sure our fds are set to non-blocking for the iocom core. + */ if (sock_fd >= 0) fcntl(sock_fd, F_SETFL, O_NONBLOCK); #if 0 diff --git a/sbin/hammer2/network.h b/sbin/hammer2/network.h index a97078380a..6167998102 100644 --- a/sbin/hammer2/network.h +++ b/sbin/hammer2/network.h @@ -33,6 +33,46 @@ * SUCH DAMAGE. 
*/ +/*************************************************************************** + * CRYPTO HANDSHAKE * + *************************************************************************** + * + * The initial public-key exchange is implemented by transmitting a + * 512-byte buffer to the other side in a symmetrical fashion. This + * buffer contains the following: + * + * (1) A random session key. + * + * (2) A verifier to determine that the decode was successful. It encodes + * an XOR of each group of 4 bytes from the session key. + * + * (3) Additional configuration and additional random data. + * + * - The hammer2 message header magic for endian detect + * + * - The hammer2 protocol version. The two sides agree on the + * smaller of the two. + * + * - All unused fields (junk*) are filled with random data. + * + * This structure must be exactly 512 bytes and expects to use 256-byte + * RSA keys. + */ +struct hammer2_handshake { + char pad1[8]; /* 000 */ + uint16_t magic; /* 008 HAMMER2_MSGHDR_MAGIC for endian detect */ + uint16_t version; /* 00A hammer2 protocol version */ + uint32_t flags; /* 00C protocol extension flags */ + uint8_t sess[64]; /* 010 512-bit session key */ + uint8_t verf[16]; /* 050 verifier = ~sess */ + char quickmsg[32]; /* 060 reason for connecting */ + char junk080[128]; /* 080-0FF */ + char pad2[8]; /* 100-107 */ + char junk100[256-8]; /* 108-1FF */ +}; + +typedef struct hammer2_handshake hammer2_handshake_t; + /*************************************************************************** * LOW LEVEL MESSAGING * *************************************************************************** @@ -92,14 +132,20 @@ struct hammer2_ioq { typedef struct hammer2_ioq hammer2_ioq_t; -#define HAMMER2_IOQ_ERROR_SYNC 1 /* bad magic / out of sync */ -#define HAMMER2_IOQ_ERROR_EOF 2 /* unexpected EOF */ -#define HAMMER2_IOQ_ERROR_SOCK 3 /* read() error on socket */ -#define HAMMER2_IOQ_ERROR_FIELD 4 /* invalid field */ -#define HAMMER2_IOQ_ERROR_HCRC 5 /* core header
crc bad */ -#define HAMMER2_IOQ_ERROR_XCRC 6 /* ext header crc bad */ -#define HAMMER2_IOQ_ERROR_ACRC 7 /* aux data crc bad */ -#define HAMMER2_IOQ_ERROR_STATE 8 /* bad state */ +#define HAMMER2_IOQ_ERROR_SYNC 1 /* bad magic / out of sync */ +#define HAMMER2_IOQ_ERROR_EOF 2 /* unexpected EOF */ +#define HAMMER2_IOQ_ERROR_SOCK 3 /* read() error on socket */ +#define HAMMER2_IOQ_ERROR_FIELD 4 /* invalid field */ +#define HAMMER2_IOQ_ERROR_HCRC 5 /* core header crc bad */ +#define HAMMER2_IOQ_ERROR_XCRC 6 /* ext header crc bad */ +#define HAMMER2_IOQ_ERROR_ACRC 7 /* aux data crc bad */ +#define HAMMER2_IOQ_ERROR_STATE 8 /* bad state */ +#define HAMMER2_IOQ_ERROR_NOPEER 9 /* bad socket peer */ +#define HAMMER2_IOQ_ERROR_NORKEY 10 /* no remote keyfile found */ +#define HAMMER2_IOQ_ERROR_NOLKEY 11 /* no local keyfile found */ +#define HAMMER2_IOQ_ERROR_KEYXCHGFAIL 12 /* key exchange failed */ +#define HAMMER2_IOQ_ERROR_KEYFMT 13 /* key file format problem */ +#define HAMMER2_IOQ_ERROR_BADURANDOM 14 /* /dev/urandom is bad */ #define HAMMER2_IOQ_MAXIOVEC 16 @@ -117,6 +163,8 @@ struct hammer2_iocom { int sock_fd; /* comm socket or pipe */ int alt_fd; /* thread signal, tty, etc */ int flags; + int rxmisc; + int txmisc; char rxbuf[HAMMER2_MSGBUF_SIZE]; /* for ioq_rx only */ }; diff --git a/sbin/newfs_hammer2/newfs_hammer2.c b/sbin/newfs_hammer2/newfs_hammer2.c index 1be34e6725..5d39337db5 100644 --- a/sbin/newfs_hammer2/newfs_hammer2.c +++ b/sbin/newfs_hammer2/newfs_hammer2.c @@ -233,7 +233,7 @@ main(int ac, char **av) uuid_to_string(&Hammer2_RPFSId, &rpfsidstr, &status); /* - * Calculate the amount of reserved space. HAMMER2_RESERVE_SEG (4MB) + * Calculate the amount of reserved space. HAMMER2_ZONE_SEG (4MB) * is reserved at the beginning of every 2GB of storage, rounded up. * Thus a 200MB filesystem will still have a 4MB reserve area. * @@ -241,8 +241,8 @@ main(int ac, char **av) * reserve is used to help 'df' calculate the amount of available * space. 
*/ - reserved_space = ((total_space + HAMMER2_RESERVE_MASK64) / - HAMMER2_RESERVE_BYTES64) * HAMMER2_RESERVE_SEG64; + reserved_space = ((total_space + HAMMER2_ZONE_MASK64) / + HAMMER2_ZONE_BYTES64) * HAMMER2_ZONE_SEG64; free_space = total_space - reserved_space - BootAreaSize - AuxAreaSize; @@ -449,7 +449,7 @@ format_hammer2(int fd, hammer2_off_t total_space, hammer2_off_t free_space) hammer2_blockref_t root_blockref; uint64_t now; hammer2_off_t volu_base = 0; - hammer2_off_t boot_base = HAMMER2_RESERVE_SEG; + hammer2_off_t boot_base = HAMMER2_ZONE_SEG; hammer2_off_t aux_base = boot_base + BootAreaSize; hammer2_off_t alloc_base = aux_base + AuxAreaSize; hammer2_off_t tmp_base; @@ -462,7 +462,7 @@ format_hammer2(int fd, hammer2_off_t total_space, hammer2_off_t free_space) */ bzero(buf, HAMMER2_PBUFSIZE); tmp_base = volu_base; - for (i = 0; i < HAMMER2_RESERVE_BLOCKS; ++i) { + for (i = 0; i < HAMMER2_ZONE_BLOCKS_SEG; ++i) { n = pwrite(fd, buf, HAMMER2_PBUFSIZE, tmp_base); if (n != HAMMER2_PBUFSIZE) { perror("write"); @@ -649,10 +649,10 @@ format_hammer2(int fd, hammer2_off_t total_space, hammer2_off_t free_space) * Write the volume header and all alternates. */ for (i = 0; i < HAMMER2_NUM_VOLHDRS; ++i) { - if (i * HAMMER2_RESERVE_BYTES64 >= total_space) + if (i * HAMMER2_ZONE_BYTES64 >= total_space) break; n = pwrite(fd, buf, HAMMER2_PBUFSIZE, - volu_base + i * HAMMER2_RESERVE_BYTES64); + volu_base + i * HAMMER2_ZONE_BYTES64); if (n != HAMMER2_PBUFSIZE) { perror("write"); exit(1); diff --git a/sys/vfs/hammer2/DESIGN b/sys/vfs/hammer2/DESIGN index 5e87f526f0..fe10627779 100644 --- a/sys/vfs/hammer2/DESIGN +++ b/sys/vfs/hammer2/DESIGN @@ -77,9 +77,10 @@ This allows the writing of 0's to create holes and will be the default compression algorithm for HAMMER2. -* Copies support for redundancy. The media blockref structure would - have become too bloated but I found a clean way to do copies using the - blockset structure (which is a set of 8 fully associative blockref's). 
+* Copies support for redundancy. Each copy has its own blockref. The + blockrefs representing the copies must exist within the same blockset + (set of 8 blockrefs), though I may relax this requirement in the + implementation. The design is such that the filesystem should be able to function at full speed even if disks are pulled or inserted, as long as at least one @@ -87,6 +88,9 @@ missing copies (or remove excessive copies in the case where the copies value is reduced on a live filesystem). + Copies are specified using the same copyinfo[] array that is used to + specify cluster interconnections for PFS's. + * Clusterable with MESI cache coherency and dynamic granularity. The media format for HAMMER1 was less condusive to logical clustering than I had hoped so I was never able to get that aspect of my personal goals @@ -229,23 +233,71 @@ structures generally follow the kernel's filesystem hiearchy. Second, HAMMER2's writable snapshots make it possible to implement several forms of multi-master clustering. -This is important: The mount device path you specify serves to bootstrap -your entry into the cluster, but your mount will make active connections -to ALL copy elements in the hammer2_copy_data[] array (stored in the volume -header) which match the PFSID of the directory in the super-root that you -specified. The local media path does not have to be mentioned in this -array but becomes part of the cluster based on its type and access -rights. ALL ELEMENTS ARE TREATED ACCORDING TO TYPE NO MATTER WHICH ONE -YOU MOUNT FROM. - -The actual cluster may be far larger than the elements you list in the -hammer2_copy_data[] array. You list only the elements you wish to -directly connect to and you are able to access the rest of the cluster -indirectly through those connections. - -All nodes in the cluster may act as administrative proxies. 
All nodes -in the cluster, including your mount point, are classified as one of the -following as specified in the inode's structure: +The mount device path you specify serves to bootstrap your entry into +the cluster. This can be local media or directly specify a network +cluster connection (or several). When a local media mount is used the +volume header is scanned for local copies and the best volume header is +selected from all available copies. Multiple devices may be specified for +redundancy. + +The volume header on local media also contains cluster connection +specifications keyed by super-root pfsid. Network connections are +maintained to all targets. ALL ELEMENTS ARE TREATED ACCORDING TO TYPE +NO MATTER WHICH ONE YOU MOUNT FROM. + +The actual networked cluster may be far larger than the elements you list +in the hammer2_copy_data[] array, but your machine will only make direct +connections as specified by the array. + +In the simplest case you simply network a few machines together as ring 0 +masters and each client connects directly to all the masters (and/or are +the masters themselves). Thus any quorum operation is straight-forward. +These master nodes are labeled 'ring 0'. + +If you have too many clients to reasonably connect directly you set up +sub-clusters as satellites. This is called 'ring 1'. Ring 1 may contain +several sub-clusters. A client then connects to all the nodes in a +particular sub-cluster (typically 3). The quorum protocol runs as per +normal except that once the operation is resolved against the sub-cluster +an aggregation must be resolved against the master nodes (ring 0). The +sub-cluster does this for the client... all the client sees is the normal +quorum operation against the sub-cluster. + +Since each node in the sub-cluster connects to all master nodes we get +a multiplication. 
If we set a reasonable upper limit of, say, 256 +connections at each master node then ring 1 may contain 85 sub-clusters x 3 +nodes in each sub-cluster. + +In the most complex case when one wishes to support potentially millions +of clients then further fan-out is required into ring 2, ring 3, and +so forth. However, each sub-cluster in ring 2 must only connect to +1 sub-cluster in ring 1 (otherwise the cache state will become mightily +confused). Using reasonable metrics this will allow ring 2 to contain +85 * 85 = 7225 sub-clusters. At this point you could have 1000 clients +connect to each sub-cluster and support 7.2 million clients, but if that +isn't enough going to another ring will support 614M clients, and so forth. + +Each ring imposes additional latencies for cache operations but the key +to making this work efficiently is that the satellite clusters can negotiate +coarse-grained cache coherency locks with the next lower ring and then +fan-out finer-grained locks to the next higher ring. Since caching can +occur anywhere (including on the connecting client), it is the cache +coherency lock that ultimately dictates efficiency and allows a client +(or satellite) to access large amounts of data from local storage. + +Modifying operations, particularly commits, also have higher latencies +when multiple rings are in use. In this situation it is possible to +short-cut localized operations by having competing clients connect +to sub-clusters which are near each other topologically... having the +competing clients connect to the same sub-cluster would be the most optimal. + +In addition, sub-clusters (typically in ring 1) can act in SOFT_MASTER mode +which allows the sub-cluster to acknowledge a full commit within its own +quorum only, and then resolve asynchronously to the masters in ring 0.
+ +The nodes in these intermediate rings can be pure proxies with only memory +caches, use local media for persistent cache, or use local media to +completely slave the filesystem. ADMIN - Media does not participate, administrative proxy only CACHE - Media only acts as a persistent cache diff --git a/sys/vfs/hammer2/hammer2_disk.h b/sys/vfs/hammer2/hammer2_disk.h index c3f5f98273..6b4329af51 100644 --- a/sys/vfs/hammer2/hammer2_disk.h +++ b/sys/vfs/hammer2/hammer2_disk.h @@ -163,11 +163,11 @@ #define HAMMER2_NEWFS_ALIGNMASK (HAMMER2_VOLUME_ALIGN - 1) #define HAMMER2_NEWFS_ALIGNMASK64 ((hammer2_off_t)HAMMER2_NEWFS_ALIGNMASK) -#define HAMMER2_RESERVE_BYTES64 (2LLU * 1024 * 1024 * 1024) -#define HAMMER2_RESERVE_MASK64 (HAMMER2_RESERVE_BYTES64 - 1) -#define HAMMER2_RESERVE_SEG (4 * 1024 * 1024) -#define HAMMER2_RESERVE_SEG64 ((hammer2_off_t)HAMMER2_RESERVE_SEG) -#define HAMMER2_RESERVE_BLOCKS (HAMMER2_RESERVE_SEG / HAMMER2_PBUFSIZE) +#define HAMMER2_ZONE_BYTES64 (2LLU * 1024 * 1024 * 1024) +#define HAMMER2_ZONE_MASK64 (HAMMER2_ZONE_BYTES64 - 1) +#define HAMMER2_ZONE_SEG (4 * 1024 * 1024) +#define HAMMER2_ZONE_SEG64 ((hammer2_off_t)HAMMER2_ZONE_SEG) +#define HAMMER2_ZONE_BLOCKS_SEG (HAMMER2_ZONE_SEG / HAMMER2_PBUFSIZE) /* * Two linear areas can be reserved after the initial 2MB segment in the base @@ -620,7 +620,7 @@ struct hammer2_copy_data { uint16_t flags; /* 04-05 flags field */ uint8_t error; /* 06 last operational error */ uint8_t priority; /* 07 priority and round-robin flag */ - uint8_t remote_pfstype; /* 08 probed direct remote PFS type */ + uint8_t remote_pfs_type;/* 08 probed direct remote PFS type */ uint8_t reserved08[23]; /* 09-1F */ uuid_t pfs_id; /* 20-2F copy target must match this uuid */ uint8_t label[16]; /* 30-3F import/export label */ @@ -657,10 +657,13 @@ typedef struct hammer2_copy_data hammer2_copy_data_t; * the tree which are stored in the volumeh eader and must be tracked on * the fly. 
* - * COPIES: Multiple copies may be specified on the mount line AND/OR you - * just specify one and the mount code tries to pick up the others - * from copyinfo[]. The copyid field in the volume header along - * with the fsid validates the copies. + * NOTE: The copyinfo[] array contains the configuration for both the + * cluster connections and any local media copies. The volume + * header will be replicated for each local media copy. + * + * The mount command may specify multiple medias or just one and + * allow HAMMER2 to pick up the others when it checks the copyinfo[] + * array on mount. * * NOTE: root_blockref points to the super-root directory, not the root * directory. The root directory will be a subdirectory under the @@ -698,7 +701,7 @@ struct hammer2_volume_data { uint32_t flags; /* 0034 */ uint8_t copyid; /* 0038 copyid of phys vol */ uint8_t freemap_version; /* 0039 freemap algorithm */ - uint8_t reserved003A; /* 003A */ + uint8_t pfstype; /* 003A local media pfstype */ uint8_t reserved003B; /* 003B */ uint32_t reserved003C; /* 003C */ diff --git a/sys/vfs/hammer2/hammer2_vfsops.c b/sys/vfs/hammer2/hammer2_vfsops.c index 9d4958a732..fb2180fa34 100644 --- a/sys/vfs/hammer2/hammer2_vfsops.c +++ b/sys/vfs/hammer2/hammer2_vfsops.c @@ -857,7 +857,7 @@ hammer2_install_volume_header(hammer2_mount_t *hmp) * block device's EOF. */ for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) { - error = bread(hmp->devvp, i * HAMMER2_RESERVE_BYTES64, + error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64, HAMMER2_VOLUME_BYTES, &bp); if (error) { brelse(bp); -- 2.41.0