/*
 * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/***************************************************************************
 *				CRYPTO HANDSHAKE			   *
 ***************************************************************************
 *
 * The initial public-key exchange is implemented by transmitting a
 * 512-byte buffer to the other side in a symmetrical fashion. This
 * buffer contains the following:
 *
 * (1) A random session key.
 *
 * (2) A verifier to determine that the decode was successful. It encodes
 *     an XOR of each group of 4 bytes from the session key.
 *
 * (3) Additional configuration and additional random data.
 *
 *     - The hammer2 message header magic for endian detect
 *
 *     - The hammer2 protocol version. The two sides agree on the
 *	 smaller of the two.
 *
 *     - All unused fields (junk*) are filled with random data.
 *
 * This structure must be exactly 512 bytes and expects to use 256-byte
 * RSA keys.
 */
/*
 * hammer2_handshake - fixed-size record exchanged during the crypto
 *		       handshake.
 *
 * The hex number leading each field comment is that field's byte offset
 * within the record.  The record must be exactly 512 bytes; this is
 * enforced at compile time by the size check following the typedef.
 */
struct hammer2_handshake {
	char		pad1[8];	/* 000 */
	uint16_t	magic;		/* 008 HAMMER2_MSGHDR_MAGIC for endian detect */
	uint16_t	version;	/* 00A hammer2 protocol version */
	uint32_t	flags;		/* 00C protocol extension flags */
	uint8_t		sess[64];	/* 010 512-bit session key */
	uint8_t		verf[16];	/* 050 verifier: one byte per 4-byte
					 *     group of sess (64 / 4 = 16),
					 *     the XOR of that group */
	char		quickmsg[32];	/* 060 reason for connecting */
	char		junk080[128];	/* 080-0FF */
	char		pad2[8];	/* 100-107 */
	char		junk100[256-8];	/* 108-1FF */
};

typedef struct hammer2_handshake hammer2_handshake_t;

/*
 * Compile-time size check: the array size becomes -1 (a hard compile
 * error) if padding or a field change makes the record anything other
 * than 512 bytes.  Written as a typedef so it works on pre-C11 compilers
 * and generates no object code.
 */
typedef char hammer2_handshake_size_check[
	(sizeof(struct hammer2_handshake) == 512) ? 1 : -1];

/***************************************************************************
 *				LOW LEVEL MESSAGING			   *
 ***************************************************************************
 *
 * hammer2_msg - A standalone copy of a message, typically referenced by
 *		 or embedded in other structures, or used with I/O queues.
 *
 * These structures are strictly temporary, so they do not have to be
 * particularly optimized for size. All possible message headers are
 * directly embedded (any), and the message may contain a reference
 * to allocated auxiliary data. The structure is recycled quite often
 * by a connection.
 *
 * This structure is typically not used for storing persistent message
 * state (see hammer2_persist for that).
 */
| 92 | struct hammer2_iocom; |
| 93 | struct hammer2_persist; |
| 94 | |
| 95 | struct hammer2_msg { |
| 96 | struct hammer2_iocom *iocom; |
| 97 | struct hammer2_persist *persist; |
| 98 | TAILQ_ENTRY(hammer2_msg) entry; /* queue */ |
| 99 | char *aux_data; /* aux-data if any */ |
| 100 | int aux_size; |
| 101 | int flags; |
| 102 | hammer2_any_t any; /* raw extended msg header */ |
| 103 | }; |
| 104 | |
| 105 | typedef struct hammer2_msg hammer2_msg_t; |
| 106 | |
| 107 | TAILQ_HEAD(hammer2_msg_queue, hammer2_msg); |
| 108 | typedef struct hammer2_msg_queue hammer2_msg_queue_t; |
| 109 | |
| 110 | #define HAMMER2_MSGX_BSWAPPED 0x0001 |
| 111 | |
| 112 | /* |
| 113 | * hammer2_ioq - An embedded component of hammer2_connect, holds state |
| 114 | * for the buffering and parsing of incoming and outgoing messages. |
| 115 | */ |
| 116 | struct hammer2_ioq { |
| 117 | enum { HAMMER2_MSGQ_STATE_HEADER1, |
| 118 | HAMMER2_MSGQ_STATE_HEADER2, |
| 119 | HAMMER2_MSGQ_STATE_AUXDATA1, |
| 120 | HAMMER2_MSGQ_STATE_AUXDATA2, |
| 121 | HAMMER2_MSGQ_STATE_ERROR } state; |
| 122 | int fifo_beg; /* buffered data */ |
| 123 | int fifo_end; |
| 124 | int hbytes; /* header size */ |
| 125 | int abytes; /* aux_data size */ |
| 126 | int error; |
| 127 | int seq; /* salt sequencer */ |
| 128 | int msgcount; |
| 129 | hammer2_msg_t *msg; |
| 130 | hammer2_msg_queue_t msgq; |
| 131 | }; |
| 132 | |
| 133 | typedef struct hammer2_ioq hammer2_ioq_t; |
| 134 | |
/*
 * I/O queue error codes.
 * NOTE(review): presumably the values stored in hammer2_ioq.error - confirm.
 */
#define HAMMER2_IOQ_ERROR_SYNC		1	/* bad magic / out of sync */
#define HAMMER2_IOQ_ERROR_EOF		2	/* unexpected EOF */
#define HAMMER2_IOQ_ERROR_SOCK		3	/* read() error on socket */
#define HAMMER2_IOQ_ERROR_FIELD		4	/* invalid field */
#define HAMMER2_IOQ_ERROR_HCRC		5	/* core header crc bad */
#define HAMMER2_IOQ_ERROR_XCRC		6	/* ext header crc bad */
#define HAMMER2_IOQ_ERROR_ACRC		7	/* aux data crc bad */
#define HAMMER2_IOQ_ERROR_STATE		8	/* bad state */
#define HAMMER2_IOQ_ERROR_NOPEER	9	/* bad socket peer */
#define HAMMER2_IOQ_ERROR_NORKEY	10	/* no remote keyfile found */
#define HAMMER2_IOQ_ERROR_NOLKEY	11	/* no local keyfile found */
#define HAMMER2_IOQ_ERROR_KEYXCHGFAIL	12	/* key exchange failed */
#define HAMMER2_IOQ_ERROR_KEYFMT	13	/* key file format problem */
#define HAMMER2_IOQ_ERROR_BADURANDOM	14	/* /dev/urandom is bad */

#define HAMMER2_IOQ_MAXIOVEC		16

| 152 | /* |
| 153 | * hammer2_iocom - governs a messaging stream connection |
| 154 | */ |
| 155 | struct hammer2_iocom { |
| 156 | hammer2_ioq_t ioq_rx; |
| 157 | hammer2_ioq_t ioq_tx; |
| 158 | hammer2_msg_queue_t freeq; /* free msgs hdr only */ |
| 159 | hammer2_msg_queue_t freeq_aux; /* free msgs w/aux_data */ |
| 160 | void (*recvmsg_callback)(struct hammer2_iocom *); |
| 161 | void (*sendmsg_callback)(struct hammer2_iocom *); |
| 162 | void (*altmsg_callback)(struct hammer2_iocom *); |
| 163 | int sock_fd; /* comm socket or pipe */ |
| 164 | int alt_fd; /* thread signal, tty, etc */ |
| 165 | int flags; |
| 166 | int rxmisc; |
| 167 | int txmisc; |
| 168 | char rxbuf[HAMMER2_MSGBUF_SIZE]; /* for ioq_rx only */ |
| 169 | }; |
| 170 | |
| 171 | typedef struct hammer2_iocom hammer2_iocom_t; |
| 172 | |
/*
 * HAMMER2_IOCOMF_* flag bits.
 * NOTE(review): presumably for hammer2_iocom.flags - confirm.
 */
#define HAMMER2_IOCOMF_EOF	0x00000001	/* EOF or ERROR on desc */
#define HAMMER2_IOCOMF_RREQ	0x00000002	/* request read-data event */
#define HAMMER2_IOCOMF_WREQ	0x00000004	/* request write-avail event */
/*
 * NOTE(review): the WIDLE description below is identical to WREQ's and
 * looks copy-pasted; confirm the intended meaning of WIDLE.
 */
#define HAMMER2_IOCOMF_WIDLE	0x00000008	/* request write-avail event */
#define HAMMER2_IOCOMF_SIGNAL	0x00000010

/***************************************************************************
 *				HIGH LEVEL MESSAGING			   *
 ***************************************************************************
 *
 * Persistent state is stored via the hammer2_persist structure, which
 * currently holds two 32-bit fields.
 */
struct hammer2_persist {
	uint32_t	lcmd;		/* recent command direction */
	uint32_t	lrep;		/* recent reply direction */
};

typedef struct hammer2_persist hammer2_persist_t;

#if 0

/*
 * Everything down to the matching #endif is compiled out.
 *
 * NOTE(review): hammer2_span_t, hammer2_reg_t, hammer2_link_t and
 * hammer2_pmsg_splay_head_t are used below but are not declared in the
 * visible part of this file; they must be supplied before this region
 * can be enabled.
 */

/*
 * The global registration structure consolidates information accumulated
 * via the spanning tree algorithm and tells us which connection (link)
 * is the best path to get to any given registration.
 *
 * glob_node	- Splay entry for this registration in the global index
 *		  of all registrations.
 *
 * glob_entry	- tailq entry when this registration's best_span element
 *		  has changed state.
 *
 * span_list	- Head of a simple list of spanning tree entries which
 *		  we use to determine the best link.
 *
 * best_span	- Which of the span structure on span_list is the best
 *		  one.
 *
 * source_root	- Splay tree root indexing all messages sent from this
 *		  registration.  The messages are indexed by
 *		  {linkid,msgid} XXX
 *
 * target_root	- Splay tree root indexing all messages being sent to
 *		  this registration.  The messages are indexed by
 *		  {linkid,msgid}. XXX
 *
 *
 * Whenever spanning tree data causes a registration's best_link field to
 * change that registration is transmitted as spanning tree data to every
 * active link.  Note that pure clients to the cluster, of which there can
 * be millions, typically do not transmit spanning tree data to each other.
 * (NOTE(review): the structure below has no best_link field; best_span
 * appears to be the field meant here - confirm.)
 *
 * Each registration is assigned a unique linkid local to the node (another
 * node might assign a different linkid to the same registration).  This
 * linkid must be persistent as long as messages are active and is used
 * to identify the message source and target.
 */
TAILQ_HEAD(hammer2_span_list, hammer2_span);
typedef struct hammer2_span_list hammer2_span_list_t;

struct hammer2_reg {
	SPLAY_ENTRY(hammer2_reg) glob_node;	/* index of registrations */
	TAILQ_ENTRY(hammer2_reg) glob_entry;	/* when modified */
	hammer2_span_list_t	span_list;	/* list of hammer2_span's */
	hammer2_span_t		*best_span;	/* best span entry */
	hammer2_pmsg_splay_head_t source_root;	/* msgs sent from reg */
	hammer2_pmsg_splay_head_t target_root;	/* msgs sent to reg */
	uuid_t			pfs_id;		/* key field */
	uuid_t			pfs_fsid;	/* key field */
	uint32_t		linkid;
	int			flags;
	int			refs;
};

#define HAMMER2_PROTO_REGF_MODIFIED	0x0001

/*
 * Each link (connection) collects spanning tree data received via the
 * link and stores it in these span structures.
 */
struct hammer2_span {
	TAILQ_ENTRY(hammer2_span) span_entry;	/* from hammer2_reg */
	SPLAY_ENTRY(hammer2_span) span_node;	/* from hammer2_link */
	hammer2_reg_t		*reg;
	hammer2_link_t		*link;
	int			weight;
};

/*
 * Most hammer2 messages represent transactions and have persistent state
 * which must be recorded.  Some messages, such as cache states and inode
 * representations are very long-lasting transactions.
 *
 * Each node in the graph must keep track of the message state in order
 * to perform the proper action when a connection is lost.  To do this
 * the message is indexed on the source and target (global) registration,
 * and the actual span element the message was received on and transmitted
 * to is recorded (allowing us to retrieve the physical links involved).
 *
 * The {source_reg, target_reg, msgid} uniquely identifies a message.  Any
 * streaming operations using the same msgid use the same rendezvous.
 *
 * It is important to note that recorded state must use the same physical
 * link (and thus the same chain of links across the graph) as was 'forged'
 * by the initial message for that msgid.  If the source span a message is
 * received on does not match the recorded source, or the recorded target
 * is no longer routeable, the message will be returned or generate an ABORT
 * with LINKFAIL as appropriate.
 */
struct hammer2_pmsg {
	SPLAY_ENTRY(hammer2_pmsg) source_reg;
	SPLAY_ENTRY(hammer2_pmsg) target_reg;
	hammer2_span_t		*source;
	hammer2_span_t		*target;
	uint16_t		msgid;
};

#endif