| Commit | Line | Data |
|---|---|---|
| 9ab15106 MD |
1 | /* |
| 2 | * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved. | |
| 3 | * | |
| 4 | * This code is derived from software contributed to The DragonFly Project | |
| 5 | * by Matthew Dillon <dillon@dragonflybsd.org> | |
| 6 | * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org> | |
| 7 | * | |
| 8 | * Redistribution and use in source and binary forms, with or without | |
| 9 | * modification, are permitted provided that the following conditions | |
| 10 | * are met: | |
| 11 | * | |
| 12 | * 1. Redistributions of source code must retain the above copyright | |
| 13 | * notice, this list of conditions and the following disclaimer. | |
| 14 | * 2. Redistributions in binary form must reproduce the above copyright | |
| 15 | * notice, this list of conditions and the following disclaimer in | |
| 16 | * the documentation and/or other materials provided with the | |
| 17 | * distribution. | |
| 18 | * 3. Neither the name of The DragonFly Project nor the names of its | |
| 19 | * contributors may be used to endorse or promote products derived | |
| 20 | * from this software without specific, prior written permission. | |
| 21 | * | |
| 22 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
| 23 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
| 24 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | |
| 25 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | |
| 26 | * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | |
| 27 | * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, | |
| 28 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
| 29 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | |
| 30 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
| 31 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | |
| 32 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 33 | * SUCH DAMAGE. | |
| 34 | */ | |
| 35 | #ifndef VFS_HAMMER2_NETWORK_H_ | |
| 36 | #define VFS_HAMMER2_NETWORK_H_ | |
| 37 | ||
| 38 | #ifndef _VFS_HAMMER2_DISK_H_ | |
| 39 | #include "hammer2_disk.h" | |
| 40 | #endif | |
| 41 | ||
| 42 | /* | |
| 43 | * Mesh network protocol structures. | |
| 44 | * | |
| 10c86c4e MD |
45 | * SPAN PROTOCOL |
| 46 | * | |
| 9ab15106 | 47 | * The mesh is constructed from point-to-point streaming links with varying |
| 10c86c4e MD |
48 | * levels of interconnectedness, forming a graph. Termini in the graph |
| 49 | * are entities such as a HAMMER2 PFS or a network mount or other types | |
| 50 | * of nodes. | |
| 51 | * | |
| 52 | * The spanning tree protocol runs symmetrically on every node. Each node | |
| 53 | * transmits a representative LNK_SPAN out all available connections. Nodes | |
| 54 | * also receive LNK_SPANs from other nodes (obviously), and must aggregate, | |
| 55 | * reduce, and relay those LNK_SPANs out all available connections, thus | |
| 56 | * propagating the spanning tree. Any connection failure or topology change | |
| 57 | * causes changes in the LNK_SPAN propagation. | |
| 58 | * | |
| 59 | * Each LNK_SPAN or LNK_SPAN relay represents a virtual circuit for routing | |
| 60 | * purposes. In addition, each relay is chained in one direction, | |
| 61 | * representing a 1:N fan-out (i.e. one received LNK_SPAN can be relayed out | |
| 62 | * multiple connections). In order to be able to route a message via a | |
| 63 | * LNK_SPAN over a deterministic route THE MESSAGE CAN ONLY FLOW FROM A | |
| 64 | * REMOTE NODE TOWARDS OUR NODE (N:1 fan-in). | |
| 65 | * | |
| 66 | * This supports the requirement that we have both message serialization | |
| 67 | * and positive feedback if a topology change breaks the chain of VCs | |
| 68 | * the message is flowing over. A remote node sending a message to us | |
| 69 | * will get positive feedback that the route was broken and can take suitable | |
| 70 | * action to terminate the transaction with an error. | |
| 71 | * | |
| 72 | * TRANSACTIONAL REPLIES | |
| 73 | * | |
| 74 | * However, when we receive a command message from a remote node and we want | |
| 75 | * to reply to it, we have a problem. We want the remote node to have | |
| 76 | * positive feedback if our reply fails to make it, but if we use a virtual | |
| 77 | * circuit based on the remote node's LNK_SPAN to us it will be a DIFFERENT | |
| 78 | * virtual circuit than the one the remote node used to message us. That's | |
| 79 | * a problem because it means we have no reliable way to notify the remote | |
| 80 | * node if we get notified that our reply has failed. | |
| 81 | * | |
| 82 | * The solution is to first note the fact that the remote chose an optimal | |
| 83 | * route to get to us, so the reverse should be true. The reason the VC | |
| 84 | * might not exist over the same route in the reverse is because there may | |
| 85 | * be multiple paths available with the same distance metric. | |
| 86 | * | |
| 87 | * But this also means that we can adjust the messaging protocols to | |
| 88 | * propagate a LNK_SPAN from the remote to us WHILE the remote's command | |
| 89 | * message is being sent to us, and it will not only likely be optimal but | |
| 90 | * it might also already exist, and it will also guarantee that a reply | |
| 91 | * failure will propagate back to both sides (because even though each | |
| 92 | * direction is using a different VC chain, the two chains are still | |
| 93 | * going along the same path). | |
| 94 | * | |
| 95 | * We communicate the return VC by having the relay adjust both the target | |
| 96 | * and the source fields in the message, rather than just the target, on | |
| 97 | * each relay. By the time the message gets to us the 'source' field will | |
| 98 | * represent the VC for the return direction (and of course also identify | |
| 99 | * the node the message came from). | |
| 100 | * | |
| 101 | * This way both sides get positive feedback if a topology change disrupts | |
| 102 | * the VC for the transaction. We also get one additional guarantee, and | |
| 103 | * that is no spurious messages. Messages simply die when the VC they are | |
| 104 | * traveling over is broken, in either direction, simple as that. | |
| 105 | * It makes managing message transactional states very easy. | |
| 8c280d5d MD |
106 | * |
| 107 | * MESSAGE TRANSACTIONAL STATES | |
| 9ab15106 MD |
108 | * |
| 109 | * Message state is handled by the CREATE, DELETE, REPLY, and ABORT | |
| 110 | * flags. Message state is typically recorded at the end points and | |
| 111 | * at each hop until a DELETE is received from both sides. | |
| 112 | * | |
| 113 | * One-way messages such as those used by spanning tree commands are not | |
| 26bf1a36 MD |
114 | * recorded. These are sent without the CREATE, DELETE, or ABORT flags set. |
| 115 | * ABORT is not supported for one-off messages. The REPLY bit can be used | |
| 116 | * to distinguish between command and status if desired. | |
| 117 | * | |
| 118 | * Persistent-state messages are messages which require a reply to be | |
| 119 | * returned. These messages can also consist of multiple message elements | |
| 120 | * for the command or reply or both (or neither). The command message | |
| 121 | * sequence sets CREATE on the first message and DELETE on the last message. | |
| 122 | * A single message command sets both (CREATE|DELETE). The reply message | |
| 123 | * sequence works the same way but of course also sets the REPLY bit. | |
| 124 | * | |
| 125 | * Persistent-state messages can be aborted by sending a message element | |
| 126 | * with the ABORT flag set. This flag can be combined with either or both | |
| 127 | * the CREATE and DELETE flags. When combined with the CREATE flag the | |
| 128 | * command is treated as non-blocking but still executes. When combined | |
| 129 | * with the DELETE flag no additional message elements are required. | |
| 130 | * | |
| 131 | * ABORT SPECIAL CASE - Mid-stream aborts. A mid-stream abort can be sent | |
| 132 | * when supported by the sender by sending an ABORT message with neither | |
| 133 | * CREATE nor DELETE set. This effectively turns the message into a | |
| 134 | * non-blocking message (but depending on what is being represented can also | |
| 135 | * cut short prior data elements in the stream). | |
| 136 | * | |
| 137 | * ABORT SPECIAL CASE - Abort-after-DELETE. Persistent messages have to be | |
| 138 | * abortable if the stream/pipe/whatever is lost. In this situation any | |
| 139 | * forwarding relay needs to unconditionally abort commands and replies that | |
| 140 | * are still active. This is done by sending an ABORT|DELETE even in | |
| 141 | * situations where a DELETE has already been sent in that direction. This | |
| 142 | * is done, for example, when links are in a half-closed state. In this | |
| 143 | * situation it is possible for the abort request to race a transition to the | |
| 144 | * fully closed state. ABORT|DELETE messages which race the fully closed | |
| 145 | * state are expected to be discarded by the other end. | |
| 9ab15106 | 146 | * |
| 9ab15106 MD |
147 | * -- |
| 148 | * | |
| 8c280d5d MD |
149 | * All base and extended message headers are 64-byte aligned, and all |
| 150 | * transports must support extended message headers up to HAMMER2_MSGHDR_MAX. | |
| 151 | * Currently we allow extended message headers up to 2048 bytes. Note | |
| 152 | * that the extended header size is encoded in the 'cmd' field of the header. | |
| 9ab15106 | 153 | * |
| 8c280d5d | 154 | * Any in-band data is padded to a 64-byte alignment and placed directly |
| 9ab15106 MD |
155 | * after the extended header (after the higher-level cmd/rep structure). |
| 156 | * The actual unaligned size of the in-band data is encoded in the aux_bytes | |
| 157 | * field in this case. Maximum data sizes are negotiated during registration. | |
| 158 | * | |
| 8c280d5d MD |
159 | * Auxiliary data can be in-band or out-of-band. In-band data sets aux_descr |
| 160 | * equal to 0. Any out-of-band data must be negotiated by the SPAN protocol. | |
| 161 | * | |
| 162 | * Auxiliary data, whether in-band or out-of-band, must be at least 64-byte | |
| 163 | * aligned. The aux_bytes field contains the actual byte-granular length | |
| 164 | * and not the aligned length. | |
| 165 | * | |
| 166 | * hdr_crc is calculated over the entire, ALIGNED extended header. For | |
| 167 | * the purposes of calculating the crc, the hdr_crc field is 0. That is, | |
| 168 | * if calculating the crc in HW a 32-bit '0' must be inserted in place of | |
| 169 | * the hdr_crc field when reading the entire header and compared at the | |
| 170 | * end (but the actual hdr_crc must be left intact in memory). A simple | |
| 171 | * counter to replace the field going into the CRC generator does the job | |
| 172 | * in HW. The CRC endian is based on the magic number field and may have | |
| 173 | * to be byte-swapped, too (which is also easy to do in HW). | |
| 174 | * | |
| 175 | * aux_crc is calculated over the entire, ALIGNED auxiliary data. | |
| 176 | * | |
| 177 | * SHARED MEMORY IMPLEMENTATIONS | |
| 178 | * | |
| 179 | * Shared-memory implementations typically use a pipe to transmit the extended | |
| 180 | * message header and shared memory to store any auxiliary data. Auxiliary | |
| 181 | * data in one-way (non-transactional) messages is typically required to be | |
| 182 | * inline. CRCs are still recommended and required at the beginning, but | |
| 183 | * may be negotiated away later. | |
| 184 | * | |
| 185 | * MULTI-PATH MESSAGE DUPLICATION | |
| 186 | * | |
| 187 | * Redundancy can be negotiated but is not required in the current spec. | |
| 188 | * Basically you send the same message, with the same msgid, via several | |
| 189 | * paths to the target. The msgid is the rendezvous. The first copy that | |
| 190 | * makes it to the target is used, the second is ignored. Similarly for | |
| 191 | * replies. This can improve performance during span flapping. Only | |
| 192 | * transactional messages will be serialized. The target might receive | |
| 193 | * multiple copies of one-way messages in higher protocol layers (potentially | |
| 194 | * out of order, too). | |
| 9ab15106 MD |
195 | */ |
| 196 | struct hammer2_msg_hdr { /* base message header; 0x40 (64) bytes per the offsets below */ | |
| 8c280d5d | 197 | uint16_t magic; /* 00 sanity, synchro, endian */ |
| 10c86c4e | 198 | uint16_t reserved02; /* 02 reserved */ |
| 8c280d5d MD |
199 | uint32_t salt; /* 04 random salt helps w/crypto */ |
| 200 | ||
| 201 | uint64_t msgid; /* 08 message transaction id */ | |
| 10c86c4e MD |
202 | uint64_t source; /* 10 originator or 0 */ |
| 203 | uint64_t target; /* 18 destination or 0 */ | |
| 8c280d5d | 204 | |
| 10c86c4e MD |
205 | uint32_t cmd; /* 20 flags | cmd | hdr_size / ALIGN */ |
| 206 | uint32_t aux_crc; /* 24 auxiliary data crc */ | |
| 207 | uint32_t aux_bytes; /* 28 auxiliary data length (bytes, unaligned) */ | |
| 208 | uint32_t error; /* 2C error code or 0 */ | |
| 209 | uint64_t aux_descr; /* 30 negotiated OOB data descr */ | |
| 8c280d5d MD |
210 | uint32_t reserved38; /* 38 reserved */ |
| 211 | uint32_t hdr_crc; /* 3C (aligned) extended header crc */ | |
| 9ab15106 MD |
212 | }; |
| 213 | ||
| 214 | typedef struct hammer2_msg_hdr hammer2_msg_hdr_t; | |
| 215 | ||
| 216 | #define HAMMER2_MSGHDR_MAGIC 0x4832 /* expected value of hdr.magic */ | |
| 217 | #define HAMMER2_MSGHDR_MAGIC_REV 0x3248 /* MAGIC with its two bytes swapped */ | |
| 218 | #define HAMMER2_MSGHDR_CRCOFF offsetof(hammer2_msg_hdr_t, salt) /* byte offset of salt */ | |
| 219 | #define HAMMER2_MSGHDR_CRCBYTES (sizeof(hammer2_msg_hdr_t) - \ | |
| 220 | HAMMER2_MSGHDR_CRCOFF) /* bytes from salt to end of base header */ | |
| 221 | ||
| 222 | /* | |
| 223 | * Administrative protocol limits. | |
| 224 | */ | |
| 8c280d5d MD |
225 | #define HAMMER2_MSGHDR_MAX 2048 /* <= 65535 */ |
| 226 | #define HAMMER2_MSGAUX_MAX 65536 /* <= 1MB */ | |
| 9ab15106 MD |
227 | #define HAMMER2_MSGBUF_SIZE (HAMMER2_MSGHDR_MAX * 4) /* 8192 bytes */ |
| 228 | #define HAMMER2_MSGBUF_MASK (HAMMER2_MSGBUF_SIZE - 1) /* usable as a mask: SIZE is a power of 2 */ | |
| 229 | ||
| 230 | /* | |
| 231 | * The message (cmd) field also encodes various flags and the total size | |
| 232 | * of the message header. This allows the protocol processors to validate | |
| 233 | * persistency and structural settings for every command simply by | |
| 234 | * switch()ing on the (cmd) field. | |
| 235 | */ | |
| 236 | #define HAMMER2_MSGF_CREATE 0x80000000U /* msg start */ | |
| 237 | #define HAMMER2_MSGF_DELETE 0x40000000U /* msg end */ | |
| 238 | #define HAMMER2_MSGF_REPLY 0x20000000U /* reply path */ | |
| 239 | #define HAMMER2_MSGF_ABORT 0x10000000U /* abort req */ | |
| 240 | #define HAMMER2_MSGF_AUXOOB 0x08000000U /* aux-data is OOB */ | |
| 241 | #define HAMMER2_MSGF_FLAG2 0x04000000U | |
| 242 | #define HAMMER2_MSGF_FLAG1 0x02000000U | |
| 243 | #define HAMMER2_MSGF_FLAG0 0x01000000U | |
| 244 | ||
| 245 | #define HAMMER2_MSGF_FLAGS 0xFF000000U /* all flags */ | |
| 246 | #define HAMMER2_MSGF_PROTOS 0x00F00000U /* all protos */ | |
| 247 | #define HAMMER2_MSGF_CMDS 0x000FFF00U /* all cmds */ | |
| 248 | #define HAMMER2_MSGF_SIZE 0x000000FFU /* N*64 (header size in HAMMER2_MSG_ALIGN units) */ | |
| 249 | ||
| 250 | #define HAMMER2_MSGF_CMDSWMASK (HAMMER2_MSGF_CMDS | \ | |
| 251 | HAMMER2_MSGF_SIZE | \ | |
| 252 | HAMMER2_MSGF_PROTOS | \ | |
| 253 | HAMMER2_MSGF_REPLY) /* proto + cmd + size, plus the REPLY bit */ | |
| 42e2a62e | 254 | |
| f2e07ffb MD |
255 | #define HAMMER2_MSGF_BASECMDMASK (HAMMER2_MSGF_CMDS | \ |
| 256 | HAMMER2_MSGF_SIZE | \ | |
| 257 | HAMMER2_MSGF_PROTOS) /* proto + cmd + size only, no flag bits */ | |
| 9ab15106 | 258 | |
| 42e2a62e MD |
259 | #define HAMMER2_MSGF_TRANSMASK (HAMMER2_MSGF_CMDS | \ |
| 260 | HAMMER2_MSGF_SIZE | \ | |
| 261 | HAMMER2_MSGF_PROTOS | \ | |
| 262 | HAMMER2_MSGF_REPLY | \ | |
| 263 | HAMMER2_MSGF_CREATE | \ | |
| 264 | HAMMER2_MSGF_DELETE) /* proto + cmd + size, plus REPLY, CREATE and DELETE */ | |
| 265 | ||
| 9ab15106 MD |
266 | #define HAMMER2_MSG_PROTO_LNK 0x00000000U /* link layer */ |
| 267 | #define HAMMER2_MSG_PROTO_DBG 0x00100000U /* debug layer */ | |
| 9b8b748f MD |
268 | #define HAMMER2_MSG_PROTO_DOM 0x00200000U /* domain layer */ |
| 269 | #define HAMMER2_MSG_PROTO_CAC 0x00300000U /* cache layer */ | |
| 270 | #define HAMMER2_MSG_PROTO_QRM 0x00400000U /* quorum layer */ | |
| 271 | #define HAMMER2_MSG_PROTO_BLK 0x00500000U | |
| 272 | #define HAMMER2_MSG_PROTO_VOP 0x00600000U | |
| 9ab15106 MD |
273 | |
| 274 | /* | |
| 275 | * Message command constructors, sans flags | |
| 276 | */ | |
| 8c280d5d | 277 | #define HAMMER2_MSG_ALIGN 64 /* header / aux-data alignment in bytes */ |
| 9ab15106 MD |
278 | #define HAMMER2_MSG_ALIGNMASK (HAMMER2_MSG_ALIGN - 1) |
| 279 | #define HAMMER2_MSG_DOALIGN(bytes) (((bytes) + HAMMER2_MSG_ALIGNMASK) & \ | |
| 280 | ~HAMMER2_MSG_ALIGNMASK) /* round bytes up to a multiple of ALIGN */ | |
| f2e07ffb | 281 | #define HAMMER2_MSG_HDR_ENCODE(elm) (((uint32_t)sizeof(struct elm) + \ |
| 9ab15106 MD |
282 | HAMMER2_MSG_ALIGNMASK) / \ |
| 283 | HAMMER2_MSG_ALIGN) /* sizeof(struct elm) in ALIGN units, rounded up */ | |
| 284 | ||
| 285 | #define HAMMER2_MSG_LNK(cmd, elm) (HAMMER2_MSG_PROTO_LNK | \ | |
| 286 | ((cmd) << 8) | \ | |
| 287 | HAMMER2_MSG_HDR_ENCODE(elm)) /* proto | cmd << 8 | encoded size of struct elm */ | |
| 288 | ||
| 289 | #define HAMMER2_MSG_DBG(cmd, elm) (HAMMER2_MSG_PROTO_DBG | \ | |
| 290 | ((cmd) << 8) | \ | |
| 291 | HAMMER2_MSG_HDR_ENCODE(elm)) /* same layout, DBG proto */ | |
| 292 | ||
| 9b8b748f MD |
293 | #define HAMMER2_MSG_DOM(cmd, elm) (HAMMER2_MSG_PROTO_DOM | \ |
| 294 | ((cmd) << 8) | \ | |
| 295 | HAMMER2_MSG_HDR_ENCODE(elm)) /* same layout, DOM proto */ | |
| 296 | ||
| 9ab15106 MD |
297 | #define HAMMER2_MSG_CAC(cmd, elm) (HAMMER2_MSG_PROTO_CAC | \ |
| 298 | ((cmd) << 8) | \ | |
| 299 | HAMMER2_MSG_HDR_ENCODE(elm)) /* same layout, CAC proto */ | |
| 300 | ||
| 301 | #define HAMMER2_MSG_QRM(cmd, elm) (HAMMER2_MSG_PROTO_QRM | \ | |
| 302 | ((cmd) << 8) | \ | |
| 303 | HAMMER2_MSG_HDR_ENCODE(elm)) /* same layout, QRM proto */ | |
| 304 | ||
| 305 | #define HAMMER2_MSG_BLK(cmd, elm) (HAMMER2_MSG_PROTO_BLK | \ | |
| 306 | ((cmd) << 8) | \ | |
| 307 | HAMMER2_MSG_HDR_ENCODE(elm)) /* same layout, BLK proto */ | |
| 308 | ||
| 309 | #define HAMMER2_MSG_VOP(cmd, elm) (HAMMER2_MSG_PROTO_VOP | \ | |
| 310 | ((cmd) << 8) | \ | |
| 311 | HAMMER2_MSG_HDR_ENCODE(elm)) /* same layout, VOP proto */ | |
| 312 | ||
| 313 | /* | |
| 314 | * Link layer ops basically talk to just the other side of a direct | |
| 315 | * connection. | |
| 316 | * | |
| 1a34728c | 317 | * LNK_PAD - One-way message on link-0, ignored by target. Used to |
| 9ab15106 MD |
318 | * pad message buffers on shared-memory transports. Not |
| 319 | * typically used with TCP. | |
| 320 | * | |
| 1a34728c | 321 | * LNK_PING - One-way message on link-0, keep-alive, run by both sides |
| 8c280d5d MD |
322 | * typically 1/sec on idle link, link is lost after 10 seconds |
| 323 | * of inactivity. | |
| 324 | * | |
| 1a34728c | 325 | * LNK_AUTH - Authenticate the connection, negotiate administrative |
| 9ab15106 MD |
326 | * rights & encryption, protocol class, etc. Only PAD and |
| 327 | * AUTH messages (not even PING) are accepted until | |
| 328 | * authentication is complete. This message also identifies | |
| 329 | * the host. | |
| 330 | * | |
| 1a34728c | 331 | * LNK_CONN - Enable the SPAN protocol on link-0, possibly also installing |
| 8c280d5d MD |
332 | * a PFS filter (by cluster id, unique id, and/or wildcarded |
| 333 | * name). | |
| 9ab15106 | 334 | * |
| 1a34728c | 335 | * LNK_SPAN - A SPAN transaction on link-0 enables messages to be relayed |
| 8c280d5d MD |
336 | * to/from a particular cluster node. SPANs are received, |
| 337 | * sorted, aggregated, and retransmitted back out across all | |
| 338 | * applicable connections. | |
| 9ab15106 MD |
339 | * |
| 340 | * The leaf protocol also uses this to make a PFS available | |
| 341 | * to the cluster (e.g. on-mount). | |
| 1a34728c MD |
342 | * |
| 343 | * LNK_VOLCONF - Volume header configuration change. All hammer2 | |
| 344 | * connections (hammer2 connect ...) stored in the volume | |
| 345 | * header are spammed at the link level to the hammer2 | |
| 346 | * service daemon, and any live configuration change | |
| 347 | * thereafter. | |
| 9ab15106 MD |
348 | */ |
| 349 | #define HAMMER2_LNK_PAD HAMMER2_MSG_LNK(0x000, hammer2_msg_hdr) /* one-way pad, ignored by target */ | |
| 350 | #define HAMMER2_LNK_PING HAMMER2_MSG_LNK(0x001, hammer2_msg_hdr) /* one-way keep-alive */ | |
| 351 | #define HAMMER2_LNK_AUTH HAMMER2_MSG_LNK(0x010, hammer2_lnk_auth) /* authenticate the connection */ | |
| 8c280d5d MD |
352 | #define HAMMER2_LNK_CONN HAMMER2_MSG_LNK(0x011, hammer2_lnk_conn) /* enable the SPAN protocol */ |
| 353 | #define HAMMER2_LNK_SPAN HAMMER2_MSG_LNK(0x012, hammer2_lnk_span) /* SPAN registration / relay */ | |
| 1a34728c | 354 | #define HAMMER2_LNK_VOLCONF HAMMER2_MSG_LNK(0x020, hammer2_lnk_volconf) /* volume header config change */ |
| 9ab15106 MD |
355 | #define HAMMER2_LNK_ERROR HAMMER2_MSG_LNK(0xFFF, hammer2_msg_hdr) |
| 356 | ||
| 357 | /* | |
| 8c280d5d MD |
358 | * LNK_CONN - Register connection for SPAN (transaction, left open) |
| 359 | * | |
| 360 | * One LNK_CONN transaction may be opened on a stream connection, registering | |
| 361 | * the connection with the SPAN subsystem and allowing the subsystem to | |
| 362 | * accept and relay SPANs to this connection. | |
| 363 | * | |
| 364 | * The LNK_CONN message may contain a filter, limiting the desirable SPANs. | |
| 365 | * | |
| 366 | * This message contains a lot of the same info that a SPAN message contains, | |
| 367 | * but is not a SPAN. That is, without this message the SPAN subprotocol will | |
| 368 | * not be executed on the connection, nor is this message a promise that the | |
| 369 | * sending end is a client or node of a cluster. | |
| 370 | */ | |
| 81666e1b MD |
371 | struct hammer2_lnk_auth { /* extended header used by HAMMER2_LNK_AUTH */ |
| 372 | hammer2_msg_hdr_t head; | |
| 373 | char dummy[64]; /* placeholder; no auth fields are defined here */ | |
| 374 | }; | |
| 375 | ||
| 8c280d5d MD |
376 | struct hammer2_lnk_conn { /* extended header used by HAMMER2_LNK_CONN */ |
| 377 | hammer2_msg_hdr_t head; | |
| 1a34728c | 378 | uuid_t mediaid; /* media configuration id */ |
| 8c280d5d MD |
379 | uuid_t pfs_clid; /* rendezvous pfs uuid */ |
| 380 | uuid_t pfs_fsid; /* unique pfs uuid */ | |
| 381 | uint8_t pfs_type; /* peer type */ | |
| 382 | uint8_t reserved01; | |
| 383 | uint16_t proto_version; /* high level protocol support */ | |
| 384 | uint32_t status; /* status flags */ | |
| 385 | uint8_t reserved02[8]; | |
| 32d51501 | 386 | int32_t dist; /* span distance */ |
| 8c280d5d MD |
387 | uint32_t reserved03[15]; |
| 388 | char label[256]; /* PFS label (can be wildcard) */ | |
| 389 | }; /* NOTE(review): assuming a 16-byte uuid_t the fields sum to 448 bytes (7 * 64) - confirm */ | |
| 390 | ||
| 391 | typedef struct hammer2_lnk_conn hammer2_lnk_conn_t; | |
| 392 | ||
| 393 | /* | |
| 394 | * LNK_SPAN - Relay a SPAN (transaction, left open) | |
| 9b8b748f MD |
395 | * |
| 396 | * This message registers a PFS/PFS_TYPE with the other end of the connection, | |
| 397 | * telling the other end who we are and what we can provide or what we want | |
| 398 | * to consume. Multiple registrations can be maintained as open transactions | |
| 399 | * with each one specifying a unique {source} linkid. | |
| 400 | * | |
| 401 | * Registrations are sent from {source}=S {1...n} to {target}=0 and maintained | |
| 402 | * as open transactions. Registrations are also received and maintained as | |
| 403 | * open transactions, creating a matrix of linkid's. | |
| 404 | * | |
| 405 | * While these transactions are open additional transactions can be executed | |
| 406 | * between any two linkid's {source}=S (registrations we sent) to {target}=T | |
| 407 | * (registrations we received). | |
| 408 | * | |
| 409 | * Closure of any registration transaction will automatically abort any open | |
| 410 | * transactions using the related linkids. Closure can be initiated | |
| 411 | * voluntarily from either side with either end issuing a DELETE, or they | |
| 412 | * can be ABORTed. | |
| 413 | * | |
| 414 | * Status updates are performed via the open transaction. | |
| 415 | * | |
| 416 | * -- | |
| 417 | * | |
| 418 | * A registration identifies a node and its various PFS parameters including | |
| 419 | * the PFS_TYPE. For example, a diskless HAMMER2 client typically identifies | |
| 420 | * itself as PFSTYPE_CLIENT. | |
| 421 | * | |
| 422 | * Any node may serve as a cluster controller, aggregating and passing | |
| 423 | * on received registrations, but end-points do not have to implement this | |
| 424 | * ability. Most end-points typically implement a single client-style or | |
| 425 | * server-style PFS_TYPE and rendezvous at a cluster controller. | |
| 426 | * | |
| 427 | * The cluster controller does not aggregate/pass-on all received | |
| 428 | * registrations. It typically filters what gets passed on based on | |
| 429 | * what it receives. | |
| 430 | * | |
| 431 | * STATUS UPDATES: Status updates use the same structure but typically | |
| 432 | * only contain incremental changes to pfs_type, with the | |
| 433 | * label field containing a text status. | |
| 434 | */ | |
| 435 | struct hammer2_lnk_span { /* extended header used by HAMMER2_LNK_SPAN */ | |
| 436 | hammer2_msg_hdr_t head; | |
| 8c280d5d | 437 | uuid_t pfs_clid; /* rendezvous pfs uuid */ |
| 9b8b748f MD |
438 | uuid_t pfs_fsid; /* unique pfs uuid */ |
| 439 | uint8_t pfs_type; /* peer type */ | |
| 440 | uint8_t reserved01; | |
| 441 | uint16_t proto_version; /* high level protocol support */ | |
| 442 | uint32_t status; /* status flags */ | |
| 443 | uint8_t reserved02[8]; | |
| 32d51501 | 444 | int32_t dist; /* span distance */ |
| 8c280d5d | 445 | uint32_t reserved03[15]; |
| 9b8b748f MD |
446 | char label[256]; /* PFS label (can be wildcard) */ |
| 447 | }; /* NOTE(review): assuming a 16-byte uuid_t the fields sum to 432 bytes, which is not a multiple of 64 - confirm against the 64-byte alignment rule */ | |
| 448 | ||
| 42e2a62e MD |
449 | typedef struct hammer2_lnk_span hammer2_lnk_span_t; |
| 450 | ||
| 451 | #define HAMMER2_SPAN_PROTO_1 1 | |
| 452 | ||
| 9b8b748f | 453 | /* |
| 1a34728c MD |
454 | * LNK_VOLCONF |
| 455 | */ | |
| 456 | struct hammer2_lnk_volconf { /* extended header used by HAMMER2_LNK_VOLCONF */ | |
| 457 | hammer2_msg_hdr_t head; | |
| 458 | hammer2_copy_data_t copy; /* copy spec */ | |
| 459 | int32_t index; /* NOTE(review): presumably the slot of 'copy' in the volume header - confirm */ | |
| 460 | int32_t unused01; | |
| 461 | uuid_t mediaid; /* NOTE(review): presumably the media configuration id, as in hammer2_lnk_conn - confirm */ | |
| 462 | int64_t reserved02[32]; | |
| 463 | }; | |
| 464 | ||
| 465 | typedef struct hammer2_lnk_volconf hammer2_lnk_volconf_t; | |
| 466 | ||
| 467 | /* | |
| 9ab15106 MD |
468 | * Debug layer ops operate on any link |
| 469 | * | |
| 470 | * SHELL - Persist stream, access the debug shell on the target | |
| 471 | * registration. Multiple shells can be operational. | |
| 472 | */ | |
| 473 | #define HAMMER2_DBG_SHELL HAMMER2_MSG_DBG(0x001, hammer2_dbg_shell) /* debug shell command */ | |
| 474 | ||
| 475 | struct hammer2_dbg_shell { | |
| 476 | hammer2_msg_hdr_t head; /* base header only; no extended fields */ | |
| 477 | }; | |
| 478 | typedef struct hammer2_dbg_shell hammer2_dbg_shell_t; | |
| 479 | ||
| 480 | /* | |
| 9b8b748f MD |
481 | * Domain layer ops operate on any link, link-0 may be used when the |
| 482 | * directly connected target is the desired registration. | |
| 483 | * | |
| 484 | * (nothing defined) | |
| 485 | */ | |
| 486 | ||
| 487 | /* | |
| 9ab15106 MD |
488 | * Cache layer ops operate on any link, link-0 may be used when the |
| 489 | * directly connected target is the desired registration. | |
| 490 | * | |
| 491 | * LOCK - Persist state, blockable, abortable. | |
| 492 | * | |
| 493 | * Obtain cache state (MODIFIED, EXCLUSIVE, SHARED, or INVAL) | |
| 494 | * in any of four domains (TREE, INUM, ATTR, DIRENT) for a | |
| 495 | * particular key relative to cache state already owned. | |
| 496 | * | |
| 497 | * TREE - Affects the entire sub-tree at the specified element | |
| 498 | * and will cause existing cache state owned by | |
| 499 | * other nodes to be adjusted such that the request | |
| 500 | * can be granted. | |
| 501 | * | |
| 502 | * INUM - Only affects inode creation/deletion of an existing | |
| 503 | * element or a new element, by inumber and/or name. | |
| 504 | * typically can be held for very long periods of time | |
| 505 | * (think the vnode cache), directly relates to | |
| 506 | * hammer2_chain structures representing inodes. | |
| 507 | * | |
| 508 | * ATTR - Only affects an inode's attributes, such as | |
| 509 | * ownership, modes, etc. Used for lookups, chdir, | |
| 510 | * open, etc. mtime has no effect. | |
| 511 | * | |
| 512 | * DIRENT - Only affects an inode's attributes plus the | |
| 513 | * attributes or names related to any directory entry | |
| 514 | * directly under this inode (non-recursively). Can | |
| 515 | * be retained for medium periods of time when doing | |
| 516 | * directory scans. | |
| 517 | * | |
| 518 | * This function may block and can be aborted. You may be | |
| 519 | * granted cache state that is more broad than the state you | |
| 520 | * requested (e.g. a different set of domains and/or an element | |
| 521 | * at a higher layer in the tree). When quorum operations | |
| 522 | * are used you may have to reconcile these grants to the | |
| 523 | * lowest common denominator. | |
| 524 | * | |
| 525 | * In order to grant your request either you or the target | |
| 526 | * (or both) may have to obtain a quorum agreement. Deadlock | |
| 527 | * resolution may be required. When doing it yourself you | |
| 528 | * will typically maintain an active message to each master | |
| 529 | * node in the system. You can only grant the cache state | |
| 530 | * when a quorum of nodes agree. | |
| 531 | * | |
| 532 | * The cache state includes transaction id information which | |
| 533 | * can be used to resolve data requests. | |
| 534 | */ | |
| 535 | #define HAMMER2_CAC_LOCK HAMMER2_MSG_CAC(0x001, hammer2_cac_lock) /* NOTE(review): struct hammer2_cac_lock is not defined in the visible part of this header - confirm before use */ | |
| 536 | ||
| 537 | /* | |
| 538 | * Quorum layer ops operate on any link, link-0 may be used when the | |
| 539 | * directly connected target is the desired registration. | |
| 540 | * | |
| 541 | * COMMIT - Persist state, blockable, abortable | |
| 542 | * | |
| 543 | * Issue a COMMIT in two phases. A quorum must acknowledge | |
| 544 | * the operation to proceed to phase-2. Message-update to | |
| 545 | * proceed to phase-2. | |
| 546 | */ | |
| 547 | #define HAMMER2_QRM_COMMIT HAMMER2_MSG_QRM(0x001, hammer2_qrm_commit) /* NOTE(review): struct hammer2_qrm_commit is not defined in the visible part of this header - confirm before use */ | |
| 548 | ||
| 549 | /* | |
| 8c280d5d MD |
550 | * NOTE!!!! ALL EXTENDED HEADER STRUCTURES MUST BE 64-BYTE ALIGNED!!! |
| 551 | * | |
| 9ab15106 MD |
552 | * General message errors |
| 553 | * | |
| 554 | * 0x00 - 0x1F Local iocomm errors | |
| 555 | * 0x20 - 0x2F Global errors | |
| 556 | */ | |
| 81666e1b | 557 | #define HAMMER2_MSG_ERR_NOSUPP 0x20 /* first code in the global error range (0x20 - 0x2F) */ |
| 9ab15106 | 558 | |
| 42e2a62e | 559 | union hammer2_msg_any { /* overlay of the base header and the extended headers listed below */ |
| 9ab15106 MD |
560 | char buf[HAMMER2_MSGHDR_MAX]; /* makes the union at least HAMMER2_MSGHDR_MAX bytes */ |
| 561 | hammer2_msg_hdr_t head; | |
| 42e2a62e | 562 | hammer2_lnk_span_t lnk_span; |
| 8c280d5d | 563 | hammer2_lnk_conn_t lnk_conn; |
| 1a34728c | 564 | hammer2_lnk_volconf_t lnk_volconf; |
| 9ab15106 MD |
565 | }; |
| 566 | ||
| 42e2a62e | 567 | typedef union hammer2_msg_any hammer2_msg_any_t; |
| 9ab15106 MD |
568 | |
| 569 | #endif |