2 * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
18 * 3. Neither the name of The DragonFly Project nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific, prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 #ifndef VFS_HAMMER2_NETWORK_H_
36 #define VFS_HAMMER2_NETWORK_H_
38 #ifndef _VFS_HAMMER2_DISK_H_
39 #include "hammer2_disk.h"
43 * Mesh network protocol structures.
45 * The mesh is constructed from point-to-point streaming links with varying
46 * levels of interconnectedness, forming a graph. When a link is established
47 * link id #0 is reserved for link-level communications. This link is used
48 * for authentication, registration, ping, further link id negotiations,
49 * spanning tree, and so on.
51 * The spanning tree forms a weighted shortest-path-first graph amongst
52 * those nodes with sufficient administrative rights to relay between
53 * registrations. Each link maintains a full reachability set, aggregates
54 * it, and retransmits via the shortest path. However, leaf nodes (even leaf
55 * nodes with multiple connections) can opt not to be part of the spanning
56 * tree and typically (due to administrative rights) their registrations
57 * are not reported to other leafs.
59 * All message responses follow the SAME PATH that the original message
60 * followed, but in reverse. This is an absolute requirement since messages
61 * expecting replies record persistent state at each hop.
63 * Message state is handled by the CREATE, DELETE, REPLY, and ABORT
64 * flags. Message state is typically recorded at the end points and
65 * at each hop until a DELETE is received from both sides.
67 * One-way messages such as those used by spanning tree commands are not
68 * recorded. These are sent without the CREATE, DELETE, or ABORT flags set.
69 * ABORT is not supported for one-off messages. The REPLY bit can be used
70 * to distinguish between command and status if desired.
72 * Persistent-state messages are messages which require a reply to be
73 * returned. These messages can also consist of multiple message elements
74 * for the command or reply or both (or neither). The command message
75 * sequence sets CREATE on the first message and DELETE on the last message.
76 * A single message command sets both (CREATE|DELETE). The reply message
77 * sequence works the same way but of course also sets the REPLY bit.
79 * Persistent-state messages can be aborted by sending a message element
80 * with the ABORT flag set. This flag can be combined with either or both
81 * the CREATE and DELETE flags. When combined with the CREATE flag the
82 * command is treated as non-blocking but still executes. When combined
83 * with the DELETE flag no additional message elements are required.
85 * ABORT SPECIAL CASE - Mid-stream aborts. A mid-stream abort can be sent
86 * when supported by the sender by sending an ABORT message with neither
87 * CREATE nor DELETE set. This effectively turns the message into a
88 * non-blocking message (but depending on what is being represented can also
89 * cut short prior data elements in the stream).
91 * ABORT SPECIAL CASE - Abort-after-DELETE. Persistent messages have to be
92 * abortable if the stream/pipe/whatever is lost. In this situation any
93 * forwarding relay needs to unconditionally abort commands and replies that
94 * are still active. This is done by sending an ABORT|DELETE even in
95 * situations where a DELETE has already been sent in that direction. This
96 * is done, for example, when links are in a half-closed state. In this
97 * situation it is possible for the abort request to race a transition to the
98 * fully closed state. ABORT|DELETE messages which race the fully closed
99 * state are expected to be discarded by the other end.
102 * NEGOTIATION OF {source} AND {target}
104 * In this discussion 'originator' describes the original sender of a message
105 * and not the relays in between, while 'sender' describes the last relay.
106 * The two mean the same thing only when the originator IS the last relay.
108 * The {source} field is sender-localized. The sender assigns this field
109 * based on which connection the message originally came from. The initial
110 * message as sent by the originator sets source=0. This also means that a
111 * leaf connection will always send messages with source=0.
113 * The {source} field must be re-localized at each hop, since messages
114 * coming from multiple connections to a node will use conflicting
115 * {source} values. This can lead to linkid exhaustion which is discussed
116 * a few paragraphs down.
118 * The {target} field is sender-allocated. Messages sent to {target} are
119 * preceded by a FORGE message to {target} which associates a registration
120 * with {target}, or UNFORGE to delete the association.
122 * The msgid field is 32 bits (remember some messages have long-lived
123 * persistent state so this is important!). One-way messages always use
128 * Because {source} must be re-localized at each hop it is possible to run
129 * out of link identifiers. At the same time we want to allow millions of
130 * client/leaf connections, and 'millions' is a lot bigger than 65535.
132 * We also have a problem with the persistent message state... If a single
133 * client's vnode cache has a million vnodes that can represent a million
134 * persistent cache states. Multiply by a million clients and ... oops!
136 * To solve these problems leafs connect into protocol-aggregators rather
137 * than directly to the cluster. The linkid and core message protocols only
138 * occur within the cluster and not by the leafs. A leaf can still connect
139 * to multiple aggregators for redundancy if it desires but may have to
140 * pick and choose which inodes go where since acquiring a cache state lock
141 * over one connection will cause conflicts to be invalidated on the other.
142 * In other words, there are limitations to this approach.
144 * A protocol aggregator takes any number of connections and aggregates
145 * the operations down to a single linkid. For example, this means that
146 * the protocol aggregator is responsible for maintaining all the cache
147 * state and performing crunches to reduce the overall amount of state
148 * down to something the cluster core can handle.
152 * All message headers are 32-byte aligned and sized (all command and
153 * response structures must be 32-byte aligned), and all transports must
154 * support message headers up to HAMMER2_MSGHDR_MAX. The msg structure
155 * can handle up to 8160 bytes but to keep things fairly clean we limit
156 * message headers to 2048 bytes.
158 * Any in-band data is padded to a 32-byte alignment and placed directly
159 * after the extended header (after the higher-level cmd/rep structure).
160 * The actual unaligned size of the in-band data is encoded in the aux_bytes
161 * field in this case. Maximum data sizes are negotiated during registration.
163 * Use of out-of-band data must be negotiated. In this case the
164 * HAMMER2_MSGF_AUXOOB flag is set in cmd and aux_bytes contains information
165 * specific to the out-of-band transfer (such as DMA channel, slot, etc).
167 * (must be 32 bytes exactly to match the alignment requirement and to
168 * support pad records in shared-memory FIFO schemes)
170 struct hammer2_msg_hdr {
171 uint16_t magic; /* sanity, synchronization, endian */
172 uint16_t icrc1; /* base header crc &salt on */
173 uint32_t salt; /* random salt helps crypto/replay */
175 uint16_t source; /* command originator linkid */
176 uint16_t target; /* reply originator linkid */
177 uint32_t msgid; /* {source,target,msgid} unique */
179 uint32_t cmd; /* flags | cmd | hdr_size / 32 */
180 uint16_t error; /* error field */
183 uint16_t icrc2; /* extended header crc (after base) */
184 uint16_t aux_bytes; /* aux data descriptor or size / 32 */
185 uint32_t aux_icrc; /* aux data iscsi crc */
188 typedef struct hammer2_msg_hdr hammer2_msg_hdr_t;
190 #define HAMMER2_MSGHDR_MAGIC 0x4832
191 #define HAMMER2_MSGHDR_MAGIC_REV 0x3248
192 #define HAMMER2_MSGHDR_CRCOFF offsetof(hammer2_msg_hdr_t, salt)
193 #define HAMMER2_MSGHDR_CRCBYTES (sizeof(hammer2_msg_hdr_t) - \
194 HAMMER2_MSGHDR_CRCOFF)
197 * Administrative protocol limits.
199 #define HAMMER2_MSGHDR_MAX 2048 /* msg struct max is 8192-32 */
200 #define HAMMER2_MSGAUX_MAX 65536 /* msg struct max is 2MB-32 */
201 #define HAMMER2_MSGBUF_SIZE (HAMMER2_MSGHDR_MAX * 4)
202 #define HAMMER2_MSGBUF_MASK (HAMMER2_MSGBUF_SIZE - 1)
205 * The message (cmd) field also encodes various flags and the total size
206 * of the message header. This allows the protocol processors to validate
207 * persistency and structural settings for every command simply by
208 * switch()ing on the (cmd) field.
210 #define HAMMER2_MSGF_CREATE 0x80000000U /* msg start */
211 #define HAMMER2_MSGF_DELETE 0x40000000U /* msg end */
212 #define HAMMER2_MSGF_REPLY 0x20000000U /* reply path */
213 #define HAMMER2_MSGF_ABORT 0x10000000U /* abort req */
214 #define HAMMER2_MSGF_AUXOOB 0x08000000U /* aux-data is OOB */
215 #define HAMMER2_MSGF_FLAG2 0x04000000U
216 #define HAMMER2_MSGF_FLAG1 0x02000000U
217 #define HAMMER2_MSGF_FLAG0 0x01000000U
219 #define HAMMER2_MSGF_FLAGS 0xFF000000U /* all flags */
220 #define HAMMER2_MSGF_PROTOS 0x00F00000U /* all protos */
221 #define HAMMER2_MSGF_CMDS 0x000FFF00U /* all cmds */
222 #define HAMMER2_MSGF_SIZE 0x000000FFU /* N*32 */
224 #define HAMMER2_MSGF_CMDSWMASK (HAMMER2_MSGF_CMDS | \
225 HAMMER2_MSGF_SIZE | \
226 HAMMER2_MSGF_PROTOS | \
229 #define HAMMER2_MSGF_BASECMDMASK (HAMMER2_MSGF_CMDS | \
230 HAMMER2_MSGF_SIZE | \
233 #define HAMMER2_MSGF_TRANSMASK (HAMMER2_MSGF_CMDS | \
234 HAMMER2_MSGF_SIZE | \
235 HAMMER2_MSGF_PROTOS | \
236 HAMMER2_MSGF_REPLY | \
237 HAMMER2_MSGF_CREATE | \
240 #define HAMMER2_MSG_PROTO_LNK 0x00000000U
241 #define HAMMER2_MSG_PROTO_DBG 0x00100000U
242 #define HAMMER2_MSG_PROTO_DOM 0x00200000U
243 #define HAMMER2_MSG_PROTO_CAC 0x00300000U
244 #define HAMMER2_MSG_PROTO_QRM 0x00400000U
245 #define HAMMER2_MSG_PROTO_BLK 0x00500000U
246 #define HAMMER2_MSG_PROTO_VOP 0x00600000U
249 * Message command constructors, sans flags
251 #define HAMMER2_MSG_ALIGN 32
252 #define HAMMER2_MSG_ALIGNMASK (HAMMER2_MSG_ALIGN - 1)
253 #define HAMMER2_MSG_DOALIGN(bytes) (((bytes) + HAMMER2_MSG_ALIGNMASK) & \
254 ~HAMMER2_MSG_ALIGNMASK)
255 #define HAMMER2_MSG_HDR_ENCODE(elm) (((uint32_t)sizeof(struct elm) + \
256 HAMMER2_MSG_ALIGNMASK) / \
259 #define HAMMER2_MSG_LNK(cmd, elm) (HAMMER2_MSG_PROTO_LNK | \
261 HAMMER2_MSG_HDR_ENCODE(elm))
263 #define HAMMER2_MSG_DBG(cmd, elm) (HAMMER2_MSG_PROTO_DBG | \
265 HAMMER2_MSG_HDR_ENCODE(elm))
267 #define HAMMER2_MSG_DOM(cmd, elm) (HAMMER2_MSG_PROTO_DOM | \
269 HAMMER2_MSG_HDR_ENCODE(elm))
271 #define HAMMER2_MSG_CAC(cmd, elm) (HAMMER2_MSG_PROTO_CAC | \
273 HAMMER2_MSG_HDR_ENCODE(elm))
275 #define HAMMER2_MSG_QRM(cmd, elm) (HAMMER2_MSG_PROTO_QRM | \
277 HAMMER2_MSG_HDR_ENCODE(elm))
279 #define HAMMER2_MSG_BLK(cmd, elm) (HAMMER2_MSG_PROTO_BLK | \
281 HAMMER2_MSG_HDR_ENCODE(elm))
283 #define HAMMER2_MSG_VOP(cmd, elm) (HAMMER2_MSG_PROTO_VOP | \
285 HAMMER2_MSG_HDR_ENCODE(elm))
288 * Link layer ops basically talk to just the other side of a direct
291 * PAD - One-way message on link-0, ignored by target. Used to
292 * pad message buffers on shared-memory transports. Not
293 * typically used with TCP.
295 * AUTH - Authenticate the connection, negotiate administrative
296 * rights & encryption, protocol class, etc. Only PAD and
297 * AUTH messages (not even PING) are accepted until
298 * authentication is complete. This message also identifies
301 * PING - One-way message on link-0, keep-alive, run by both sides
302 * typically 1/sec on idle link, link is lost after 10 seconds
305 * STATUS - One-way message on link-0, host-spanning tree message.
306 * Connection and authentication status is propagated using
307 * these messages on a per-connection basis. Works like SPAN
308 * but is only used for general status. See the hammer2
311 * SPAN - One-way message on link-0, spanning tree message adds,
312 * drops, or updates a remote registration. Sent by both
313 * sides, delta changes only. Visibility into remote
314 * registrations may be limited and received registrations
315 * may be filtered depending on administrative controls.
317 * A multiply-connected node maintains SPAN information on
318 * each link independently and then retransmits an aggregation
319 * of the shortest-weighted path for each registration to
320 * all links when a received change adjusts the path.
322 * The leaf protocol also uses this to make a PFS available
323 * to the cluster (e.g. on-mount).
325 #define HAMMER2_LNK_PAD HAMMER2_MSG_LNK(0x000, hammer2_msg_hdr)
326 #define HAMMER2_LNK_PING HAMMER2_MSG_LNK(0x001, hammer2_msg_hdr)
327 #define HAMMER2_LNK_AUTH HAMMER2_MSG_LNK(0x010, hammer2_lnk_auth)
328 #define HAMMER2_LNK_SPAN HAMMER2_MSG_LNK(0x011, hammer2_lnk_span)
329 #define HAMMER2_LNK_ERROR HAMMER2_MSG_LNK(0xFFF, hammer2_msg_hdr)
332 * SPAN - Registration (transaction, left open)
334 * This message registers a PFS/PFS_TYPE with the other end of the connection,
335 * telling the other end who we are and what we can provide or what we want
336 * to consume. Multiple registrations can be maintained as open transactions
337 * with each one specifying a unique {source} linkid.
339 * Registrations are sent from {source}=S {1...n} to {target}=0 and maintained
340 * as open transactions. Registrations are also received and maintained as
341 * open transactions, creating a matrix of linkid's.
343 * While these transactions are open additional transactions can be executed
344 * between any two linkid's {source}=S (registrations we sent) to {target}=T
345 * (registrations we received).
347 * Closure of any registration transaction will automatically abort any open
348 * transactions using the related linkids. Closure can be initiated
349 * voluntarily from either side with either end issuing a DELETE, or they
352 * Status updates are performed via the open transaction.
356 * A registration identifies a node and its various PFS parameters including
357 * the PFS_TYPE. For example, a diskless HAMMER2 client typically identifies
358 * itself as PFSTYPE_CLIENT.
360 * Any node may serve as a cluster controller, aggregating and passing
361 * on received registrations, but end-points do not have to implement this
362 * ability. Most end-points typically implement a single client-style or
363 * server-style PFS_TYPE and rendezvous at a cluster controller.
365 * The cluster controller does not aggregate/pass-on all received
366 * registrations. It typically filters what gets passed on based on
369 * STATUS UPDATES: Status updates use the same structure but typically
370 * only contain incremental changes to pfs_type, with the
371 * label field containing a text status.
373 struct hammer2_lnk_span {
374 hammer2_msg_hdr_t head;
375 uuid_t pfs_id; /* rendezvous pfs uuid */
376 uuid_t pfs_fsid; /* unique pfs uuid */
377 uint8_t pfs_type; /* peer type */
379 uint16_t proto_version; /* high level protocol support */
380 uint32_t status; /* status flags */
381 uint8_t reserved02[8];
382 uint32_t reserved03[16];
383 char label[256]; /* PFS label (can be wildcard) */
386 typedef struct hammer2_lnk_span hammer2_lnk_span_t;
388 #define HAMMER2_SPAN_PROTO_1 1
391 * Debug layer ops operate on any link
393 * SHELL - Persist stream, access the debug shell on the target
394 * registration. Multiple shells can be operational.
396 #define HAMMER2_DBG_SHELL HAMMER2_MSG_DBG(0x001, hammer2_dbg_shell)
398 struct hammer2_dbg_shell {
399 hammer2_msg_hdr_t head;
401 typedef struct hammer2_dbg_shell hammer2_dbg_shell_t;
404 * Domain layer ops operate on any link, link-0 may be used when the
405 * directly connected target is the desired registration.
411 * Cache layer ops operate on any link, link-0 may be used when the
412 * directly connected target is the desired registration.
414 * LOCK - Persist state, blockable, abortable.
416 * Obtain cache state (MODIFIED, EXCLUSIVE, SHARED, or INVAL)
417 * in any of four domains (TREE, INUM, ATTR, DIRENT) for a
418 * particular key relative to cache state already owned.
420 * TREE - Affects entire sub-tree at the specified element
421 * and will cause existing cache state owned by
422 * other nodes to be adjusted such that the request
425 * INUM - Only affects inode creation/deletion of an existing
426 * element or a new element, by inumber and/or name.
427 * typically can be held for very long periods of time
428 * (think the vnode cache), directly relates to
429 * hammer2_chain structures representing inodes.
431 * ATTR - Only affects an inode's attributes, such as
432 * ownership, modes, etc. Used for lookups, chdir,
433 * open, etc. mtime has no effect.
435 * DIRENT - Only affects an inode's attributes plus the
436 * attributes or names related to any directory entry
437 * directly under this inode (non-recursively). Can
438 * be retained for medium periods of time when doing
441 * This function may block and can be aborted. You may be
442 * granted cache state that is more broad than the state you
443 * requested (e.g. a different set of domains and/or an element
444 * at a higher layer in the tree). When quorum operations
445 * are used you may have to reconcile these grants to the
446 * lowest common denominator.
448 * In order to grant your request either you or the target
449 * (or both) may have to obtain a quorum agreement. Deadlock
450 * resolution may be required. When doing it yourself you
451 * will typically maintain an active message to each master
452 * node in the system. You can only grant the cache state
453 * when a quorum of nodes agree.
455 * The cache state includes transaction id information which
456 * can be used to resolve data requests.
458 #define HAMMER2_CAC_LOCK HAMMER2_MSG_CAC(0x001, hammer2_cac_lock)
461 * Quorum layer ops operate on any link, link-0 may be used when the
462 * directly connected target is the desired registration.
464 * COMMIT - Persist state, blockable, abortable
466 * Issue a COMMIT in two phases. A quorum must acknowledge
467 * the operation to proceed to phase-2. Message-update to
468 * proceed to phase-2.
470 #define HAMMER2_QRM_COMMIT HAMMER2_MSG_QRM(0x001, hammer2_qrm_commit)
473 * General message errors
475 * 0x00 - 0x1F Local iocomm errors
476 * 0x20 - 0x2F Global errors
478 #define HAMMER2_MSG_ERR_UNKNOWN 0x20
480 union hammer2_msg_any {
481 char buf[HAMMER2_MSGHDR_MAX];
482 hammer2_msg_hdr_t head;
483 hammer2_lnk_span_t lnk_span;
486 typedef union hammer2_msg_any hammer2_msg_any_t;