cluster - Major kernel component work (diskiocom, xdisk, kdmsg)
[dragonfly.git] / sys / sys / dmsg.h
CommitLineData
9ab15106
MD
1/*
2 * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
9ab15106
MD
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
9ab15106 34
5bc5bca2
MD
35#ifndef _SYS_DMSG_H_
36#define _SYS_DMSG_H_
37
38#ifndef _SYS_MALLOC_H_
39#include <sys/malloc.h>
40#endif
41#ifndef _SYS_TREE_H_
42#include <sys/tree.h>
43#endif
3a5aa68f
MD
44#ifndef _SYS_THREAD_H_
45#include <sys/thread.h>
46#endif
5bc5bca2
MD
47#ifndef _SYS_UUID_H_
48#include <sys/uuid.h>
9ab15106
MD
49#endif
50
51/*
52 * Mesh network protocol structures.
53 *
03d99ea4 54 * CONN PROTOCOL
10c86c4e 55 *
9ab15106 56 * The mesh is constructed from point-to-point streaming links with varying
10c86c4e
MD
57 * levels of interconnectedness, forming a graph. Termini in the graph
58 * are entities such as a HAMMER2 PFS or a network mount or other types
59 * of nodes.
60 *
03d99ea4
MD
61 * Upon connecting and after authentication, a LNK_CONN transaction is opened
62 * on circuit 0 by both ends. This configures and enables the SPAN protocol.
63 * The LNK_CONN transaction remains open for the life of the connection.
64 *
65 * SPAN PROTOCOL
66 *
67 * Once enabled, each terminus transmits a representative LNK_SPAN out all
68 * available connections advertising what it is. Nodes maintaining multiple
69 * connections will relay received LNK_SPANs out available connections
70 * with some filtering based on the CONN configuration. A distance metric
71 * and per-node random value (rnss) is aggregated.
72 *
73 * Since LNK_SPANs can rapidly multiply in a complex graph, not all incoming
74 * LNK_SPANs will be relayed. Only the top N over all collected LNK_SPANs for
75 * any given advertisement are relayed.
76 *
77 * It is possible to code the SPANning tree algorithm to guarantee that
78 * symmetrical spans will be generated after stabilization. The RNSS field
79 * is used to help distinguish and reduce paths in complex graphs when
80 * symmetric spans are desired. We always generate RNSS but we currently do
81 * not implement symmetrical SPAN guarantees.
82 *
83 * CIRC PROTOCOL
84 *
85 * We aren't done yet. Before transactions can be relayed, symmetric paths
86 * must be formed via the LNK_CIRC protocol. The LNK_CIRC protocol
87 * establishes a virtual circuit from any node to any other node, creating
88 * a circuit id which is stored in dmsg_hdr.circuit. Messages received on
89 * one side are forwarded to the other. Forwarded messages bypass normal
90 * state tracking.
91 *
92 * A virtual circuit is forged by working the propagated SPANs backwards.
93 * Each node in the graph helps propagate the virtual circuit by attaching the
94 * LNK_CIRC transaction it receives to a LNK_CIRC transaction it initiates
95 * out the other interface.
96 *
97 * Since SPANs are link-state transactions any change in related span(s)
98 * will also force-terminate VC's using those spans.
8c280d5d
MD
99 *
100 * MESSAGE TRANSACTIONAL STATES
9ab15106
MD
101 *
102 * Message state is handled by the CREATE, DELETE, REPLY, and ABORT
103 * flags. Message state is typically recorded at the end points and
104 * at each hop until a DELETE is received from both sides.
105 *
106 * One-way messages such as those used by spanning tree commands are not
26bf1a36
MD
107 * recorded. These are sent without the CREATE, DELETE, or ABORT flags set.
108 * ABORT is not supported for one-off messages. The REPLY bit can be used
109 * to distinguish between command and status if desired.
110 *
111 * Persistent-state messages are messages which require a reply to be
112 * returned. These messages can also consist of multiple message elements
113 * for the command or reply or both (or neither). The command message
114 * sequence sets CREATE on the first message and DELETE on the last message.
115 * A single message command sets both (CREATE|DELETE). The reply message
116 * sequence works the same way but of course also sets the REPLY bit.
117 *
118 * Persistent-state messages can be aborted by sending a message element
119 * with the ABORT flag set. This flag can be combined with either or both
120 * the CREATE and DELETE flags. When combined with the CREATE flag the
121 * command is treated as non-blocking but still executes. When combined
122 * with the DELETE flag no additional message elements are required.
123 *
124 * ABORT SPECIAL CASE - Mid-stream aborts. A mid-stream abort can be sent
125 * when supported by the sender by sending an ABORT message with neither
126 * CREATE nor DELETE set. This effectively turns the message into a
127 * non-blocking message (but depending on what is being represented can also
128 * cut short prior data elements in the stream).
129 *
130 * ABORT SPECIAL CASE - Abort-after-DELETE. Persistent messages have to be
131 * abortable if the stream/pipe/whatever is lost. In this situation any
132 * forwarding relay needs to unconditionally abort commands and replies that
133 * are still active. This is done by sending an ABORT|DELETE even in
134 * situations where a DELETE has already been sent in that direction. This
135 * is done, for example, when links are in a half-closed state. In this
136 * situation it is possible for the abort request to race a transition to the
137 * fully closed state. ABORT|DELETE messages which race the fully closed
138 * state are expected to be discarded by the other end.
9ab15106 139 *
9ab15106
MD
140 * --
141 *
8c280d5d 142 * All base and extended message headers are 64-byte aligned, and all
5bc5bca2 143 * transports must support extended message headers up to DMSG_HDR_MAX.
8c280d5d
MD
144 * Currently we allow extended message headers up to 2048 bytes. Note
145 * that the extended header size is encoded in the 'cmd' field of the header.
9ab15106 146 *
8c280d5d 147 * Any in-band data is padded to a 64-byte alignment and placed directly
9ab15106
MD
148 * after the extended header (after the higher-level cmd/rep structure).
149 * The actual unaligned size of the in-band data is encoded in the aux_bytes
150 * field in this case. Maximum data sizes are negotiated during registration.
151 *
8c280d5d
MD
152 * Auxiliary data can be in-band or out-of-band. In-band data sets aux_descr
153 * equal to 0. Any out-of-band data must be negotiated by the SPAN protocol.
154 *
155 * Auxiliary data, whether in-band or out-of-band, must be at least 64-byte
156 * aligned. The aux_bytes field contains the actual byte-granular length
8d6d37b8
MD
157 * and not the aligned length. The crc is against the aligned length (so
158 * a faster crc algorithm can be used, theoretically).
8c280d5d
MD
159 *
160 * hdr_crc is calculated over the entire, ALIGNED extended header. For
161 * the purposes of calculating the crc, the hdr_crc field is 0. That is,
162 * if calculating the crc in HW a 32-bit '0' must be inserted in place of
163 * the hdr_crc field when reading the entire header and compared at the
164 * end (but the actual hdr_crc must be left intact in memory). A simple
165 * counter to replace the field going into the CRC generator does the job
166 * in HW. The CRC endian is based on the magic number field and may have
167 * to be byte-swapped, too (which is also easy to do in HW).
168 *
169 * aux_crc is calculated over the entire, ALIGNED auxiliary data.
170 *
171 * SHARED MEMORY IMPLEMENTATIONS
172 *
173 * Shared-memory implementations typically use a pipe to transmit the extended
174 * message header and shared memory to store any auxiliary data. Auxiliary
175 * data in one-way (non-transactional) messages is typically required to be
176 * inline. CRCs are still recommended and required at the beginning, but
177 * may be negotiated away later.
9ab15106 178 */
5bc5bca2 179struct dmsg_hdr {
8c280d5d 180 uint16_t magic; /* 00 sanity, synchro, endian */
10c86c4e 181 uint16_t reserved02; /* 02 */
8c280d5d
MD
182 uint32_t salt; /* 04 random salt helps w/crypto */
183
184 uint64_t msgid; /* 08 message transaction id */
03d99ea4
MD
185 uint64_t circuit; /* 10 circuit id or 0 */
186 uint64_t reserved18; /* 18 */
8c280d5d 187
10c86c4e
MD
188 uint32_t cmd; /* 20 flags | cmd | hdr_size / ALIGN */
189 uint32_t aux_crc; /* 24 auxillary data crc */
190 uint32_t aux_bytes; /* 28 auxillary data length (bytes) */
191 uint32_t error; /* 2C error code or 0 */
192 uint64_t aux_descr; /* 30 negotiated OOB data descr */
8c280d5d
MD
193 uint32_t reserved38; /* 38 */
194 uint32_t hdr_crc; /* 3C (aligned) extended header crc */
9ab15106
MD
195};
196
5bc5bca2 197typedef struct dmsg_hdr dmsg_hdr_t;
9ab15106 198
5bc5bca2
MD
199#define DMSG_HDR_MAGIC 0x4832
200#define DMSG_HDR_MAGIC_REV 0x3248
201#define DMSG_HDR_CRCOFF offsetof(dmsg_hdr_t, salt)
202#define DMSG_HDR_CRCBYTES (sizeof(dmsg_hdr_t) - DMSG_HDR_CRCOFF)
9ab15106
MD
203
204/*
205 * Administrative protocol limits.
206 */
5bc5bca2
MD
207#define DMSG_HDR_MAX 2048 /* <= 65535 */
208#define DMSG_AUX_MAX 65536 /* <= 1MB */
209#define DMSG_BUF_SIZE (DMSG_HDR_MAX * 4)
210#define DMSG_BUF_MASK (DMSG_BUF_SIZE - 1)
9ab15106
MD
211
212/*
213 * The message (cmd) field also encodes various flags and the total size
214 * of the message header. This allows the protocol processors to validate
215 * persistency and structural settings for every command simply by
216 * switch()ing on the (cmd) field.
217 */
5bc5bca2
MD
218#define DMSGF_CREATE 0x80000000U /* msg start */
219#define DMSGF_DELETE 0x40000000U /* msg end */
220#define DMSGF_REPLY 0x20000000U /* reply path */
221#define DMSGF_ABORT 0x10000000U /* abort req */
222#define DMSGF_AUXOOB 0x08000000U /* aux-data is OOB */
223#define DMSGF_FLAG2 0x04000000U
224#define DMSGF_FLAG1 0x02000000U
225#define DMSGF_FLAG0 0x01000000U
226
227#define DMSGF_FLAGS 0xFF000000U /* all flags */
228#define DMSGF_PROTOS 0x00F00000U /* all protos */
229#define DMSGF_CMDS 0x000FFF00U /* all cmds */
230#define DMSGF_SIZE 0x000000FFU /* N*64 (hdr size in DMSG_ALIGN units) */
231
232#define DMSGF_CMDSWMASK (DMSGF_CMDS | \
233 DMSGF_SIZE | \
234 DMSGF_PROTOS | \
235 DMSGF_REPLY)
236
237#define DMSGF_BASECMDMASK (DMSGF_CMDS | \
238 DMSGF_SIZE | \
239 DMSGF_PROTOS)
240
241#define DMSGF_TRANSMASK (DMSGF_CMDS | \
242 DMSGF_SIZE | \
243 DMSGF_PROTOS | \
244 DMSGF_REPLY | \
245 DMSGF_CREATE | \
246 DMSGF_DELETE)
247
248#define DMSG_PROTO_LNK 0x00000000U
249#define DMSG_PROTO_DBG 0x00100000U
250#define DMSG_PROTO_DOM 0x00200000U
251#define DMSG_PROTO_CAC 0x00300000U
252#define DMSG_PROTO_QRM 0x00400000U
253#define DMSG_PROTO_BLK 0x00500000U
254#define DMSG_PROTO_VOP 0x00600000U
9ab15106
MD
255
256/*
257 * Message command constructors, sans flags
258 */
5bc5bca2
MD
259#define DMSG_ALIGN 64
260#define DMSG_ALIGNMASK (DMSG_ALIGN - 1)
261#define DMSG_DOALIGN(bytes) (((bytes) + DMSG_ALIGNMASK) & \
262 ~DMSG_ALIGNMASK)
263
264#define DMSG_HDR_ENCODE(elm) (((uint32_t)sizeof(struct elm) + \
265 DMSG_ALIGNMASK) / \
266 DMSG_ALIGN)
267
268#define DMSG_LNK(cmd, elm) (DMSG_PROTO_LNK | \
9ab15106 269 ((cmd) << 8) | \
5bc5bca2 270 DMSG_HDR_ENCODE(elm))
9ab15106 271
5bc5bca2 272#define DMSG_DBG(cmd, elm) (DMSG_PROTO_DBG | \
9ab15106 273 ((cmd) << 8) | \
5bc5bca2 274 DMSG_HDR_ENCODE(elm))
9ab15106 275
5bc5bca2 276#define DMSG_DOM(cmd, elm) (DMSG_PROTO_DOM | \
9b8b748f 277 ((cmd) << 8) | \
5bc5bca2 278 DMSG_HDR_ENCODE(elm))
9b8b748f 279
5bc5bca2 280#define DMSG_CAC(cmd, elm) (DMSG_PROTO_CAC | \
9ab15106 281 ((cmd) << 8) | \
5bc5bca2 282 DMSG_HDR_ENCODE(elm))
9ab15106 283
5bc5bca2 284#define DMSG_QRM(cmd, elm) (DMSG_PROTO_QRM | \
9ab15106 285 ((cmd) << 8) | \
5bc5bca2 286 DMSG_HDR_ENCODE(elm))
9ab15106 287
5bc5bca2 288#define DMSG_BLK(cmd, elm) (DMSG_PROTO_BLK | \
9ab15106 289 ((cmd) << 8) | \
5bc5bca2 290 DMSG_HDR_ENCODE(elm))
9ab15106 291
5bc5bca2 292#define DMSG_VOP(cmd, elm) (DMSG_PROTO_VOP | \
9ab15106 293 ((cmd) << 8) | \
5bc5bca2 294 DMSG_HDR_ENCODE(elm))
9ab15106
MD
295
296/*
297 * Link layer ops basically talk to just the other side of a direct
298 * connection.
299 *
03d99ea4 300 * LNK_PAD - One-way message on circuit 0, ignored by target. Used to
9ab15106
MD
301 * pad message buffers on shared-memory transports. Not
302 * typically used with TCP.
303 *
03d99ea4 304 * LNK_PING - One-way message on circuit-0, keep-alive, run by both sides
8c280d5d
MD
305 * typically 1/sec on idle link, link is lost after 10 seconds
306 * of inactivity.
307 *
1a34728c 308 * LNK_AUTH - Authenticate the connection, negotiate administrative
9ab15106
MD
309 * rights & encryption, protocol class, etc. Only PAD and
310 * AUTH messages (not even PING) are accepted until
311 * authentication is complete. This message also identifies
312 * the host.
313 *
03d99ea4
MD
314 * LNK_CONN - Enable the SPAN protocol on circuit-0, possibly also
315 * installing a PFS filter (by cluster id, unique id, and/or
316 * wildcarded name).
9ab15106 317 *
03d99ea4
MD
318 * LNK_SPAN - A SPAN transaction on circuit-0 enables messages to be
319 * relayed to/from a particular cluster node. SPANs are
320 * received, sorted, aggregated, filtered, and retransmitted
321 * back out across all applicable connections.
9ab15106
MD
322 *
323 * The leaf protocol also uses this to make a PFS available
324 * to the cluster (e.g. on-mount).
1a34728c 325 *
03d99ea4
MD
326 * LNK_CIRC - a CIRC transaction establishes a circuit from source to
327 * target by creating pairs of open transactions across each
328 * hop.
329 *
1a34728c
MD
330 * LNK_VOLCONF - Volume header configuration change. All hammer2
331 * connections (hammer2 connect ...) stored in the volume
03d99ea4 332 * header are spammed on circuit 0 to the hammer2
1a34728c
MD
333 * service daemon, and any live configuration change
334 * thereafter.
9ab15106 335 */
5bc5bca2
MD
336#define DMSG_LNK_PAD DMSG_LNK(0x000, dmsg_hdr)
337#define DMSG_LNK_PING DMSG_LNK(0x001, dmsg_hdr)
338#define DMSG_LNK_AUTH DMSG_LNK(0x010, dmsg_lnk_auth)
339#define DMSG_LNK_CONN DMSG_LNK(0x011, dmsg_lnk_conn)
340#define DMSG_LNK_SPAN DMSG_LNK(0x012, dmsg_lnk_span)
03d99ea4 341#define DMSG_LNK_CIRC DMSG_LNK(0x013, dmsg_lnk_circ)
5bc5bca2
MD
342#define DMSG_LNK_VOLCONF DMSG_LNK(0x020, dmsg_lnk_volconf)
343#define DMSG_LNK_ERROR DMSG_LNK(0xFFF, dmsg_hdr)
9ab15106
MD
344
345/*
03d99ea4 346 * LNK_AUTH - Authentication (often omitted)
8c280d5d 347 */
5bc5bca2
MD
348struct dmsg_lnk_auth {
349 dmsg_hdr_t head;
81666e1b
MD
350 char dummy[64];
351};
352
2063f4d7 353/*
03d99ea4
MD
354 * LNK_CONN - Register connection info for SPAN protocol
355 * (transaction, left open, circuit 0 only).
2063f4d7 356 *
03d99ea4
MD
357 * LNK_CONN identifies a streaming connection into the cluster and serves
358 * to identify, enable, and specify filters for the SPAN protocol.
359 *
360 * peer_mask serves to filter the SPANs we receive by peer_type. A cluster
361 * controller typically sets this to (uint64_t)-1, indicating that it wants
362 * everything. A block devfs interface might set it to 1 << DMSG_PEER_BLOCK,
363 * and a hammer2 mount might set it to 1 << DMSG_PEER_HAMMER2.
2063f4d7
MD
364 *
365 * mediaid allows multiple (e.g. HAMMER2) connections belonging to the same
03d99ea4
MD
366 * media to transmit duplicative LNK_VOLCONF updates without causing
367 * confusion in the cluster controller.
2063f4d7
MD
368 *
369 * pfs_clid, pfs_fsid, pfs_type, and label are peer-specific and must be
370 * left empty (zero-fill) if not supported by a particular peer.
371 *
5bc5bca2 372 * DMSG_PEER_CLUSTER filter: none
0c3a8cd0 373 * DMSG_PEER_BLOCK filter: label
5bc5bca2 374 * DMSG_PEER_HAMMER2 filter: pfs_clid if not empty, and label
2063f4d7 375 */
5bc5bca2
MD
376struct dmsg_lnk_conn {
377 dmsg_hdr_t head;
1a34728c 378 uuid_t mediaid; /* media configuration id */
8c280d5d
MD
379 uuid_t pfs_clid; /* rendezvous pfs uuid */
380 uuid_t pfs_fsid; /* unique pfs uuid */
2063f4d7 381 uint64_t peer_mask; /* PEER mask for SPAN filtering */
5bc5bca2 382 uint8_t peer_type; /* see DMSG_PEER_xxx */
2063f4d7 383 uint8_t pfs_type; /* pfs type */
8c280d5d
MD
384 uint16_t proto_version; /* high level protocol support */
385 uint32_t status; /* status flags */
03d99ea4 386 uint32_t rnss; /* node's generated rnss */
8c280d5d 387 uint8_t reserved02[8];
03d99ea4
MD
388 uint32_t reserved03[12];
389 uint64_t pfs_mask; /* PFS mask for SPAN filtering */
ddfbb283
MD
390 char cl_label[128]; /* cluster label (for PEER_BLOCK) */
391 char fs_label[128]; /* PFS label (for PEER_HAMMER2) */
8c280d5d
MD
392};
393
5bc5bca2 394typedef struct dmsg_lnk_conn dmsg_lnk_conn_t;
8c280d5d 395
ddfbb283
MD
396#define DMSG_PFSTYPE_NONE 0
397#define DMSG_PFSTYPE_ADMIN 1
398#define DMSG_PFSTYPE_CLIENT 2
399#define DMSG_PFSTYPE_CACHE 3
400#define DMSG_PFSTYPE_COPY 4
401#define DMSG_PFSTYPE_SLAVE 5
402#define DMSG_PFSTYPE_SOFT_SLAVE 6
403#define DMSG_PFSTYPE_SOFT_MASTER 7
404#define DMSG_PFSTYPE_MASTER 8
03d99ea4
MD
405#define DMSG_PFSTYPE_SERVER 9
406#define DMSG_PFSTYPE_MAX 10 /* 0-9 */
ddfbb283 407
0c3a8cd0
MD
408#define DMSG_PEER_NONE 0
409#define DMSG_PEER_CLUSTER 1 /* a cluster controller */
410#define DMSG_PEER_BLOCK 2 /* block devices */
411#define DMSG_PEER_HAMMER2 3 /* hammer2-mounted volumes */
412
8c280d5d 413/*
03d99ea4
MD
414 * Structures embedded in LNK_SPAN
415 */
416struct dmsg_media_block {
417 uint64_t bytes; /* media size in bytes */
418 uint32_t blksize; /* media block size */
419};
420
421typedef struct dmsg_media_block dmsg_media_block_t;
422
423/*
424 * LNK_SPAN - Initiate or relay a SPAN
425 * (transaction, left open, circuit 0 only)
9b8b748f 426 *
03d99ea4
MD
427 * This message registers an end-point with the other end of the connection,
428 * telling the other end who we are and what we can provide or intend to
429 * consume. Multiple registrations can be maintained as open transactions
430 * with each one specifying a unique end-point.
9b8b748f
MD
431 *
432 * Registrations are sent from {source}=S {1...n} to {target}=0 and maintained
433 * as open transactions. Registrations are also received and maintained as
434 * open transactions, creating a matrix of linkid's.
435 *
436 * While these transactions are open additional transactions can be executed
437 * between any two linkid's {source}=S (registrations we sent) to {target}=T
438 * (registrations we received).
439 *
440 * Closure of any registration transaction will automatically abort any open
441 * transactions using the related linkids. Closure can be initiated
442 * voluntarily from either side with either end issuing a DELETE, or they
443 * can be ABORTed.
444 *
445 * Status updates are performed via the open transaction.
446 *
447 * --
448 *
449 * A registration identifies a node and its various PFS parameters including
450 * the PFS_TYPE. For example, a diskless HAMMER2 client typically identifies
451 * itself as PFSTYPE_CLIENT.
452 *
453 * Any node may serve as a cluster controller, aggregating and passing
454 * on received registrations, but end-points do not have to implement this
455 * ability. Most end-points typically implement a single client-style or
456 * server-style PFS_TYPE and rendezvous at a cluster controller.
457 *
458 * The cluster controller does not aggregate/pass-on all received
03d99ea4
MD
459 * registrations. It typically filters what gets passed on based on what it
460 * receives, passing on only the best candidates.
461 *
462 * If a symmetric spanning tree is desired additional candidates whose
463 * {dist, rnss} fields match the last best candidate must also be propagated.
464 * This feature is not currently enabled.
9b8b748f
MD
465 *
466 * STATUS UPDATES: Status updates use the same structure but typically
03d99ea4
MD
467 * only contain incremental changes to e.g. pfs_type, with
468 * a text description sent as out-of-band data.
9b8b748f 469 */
5bc5bca2
MD
470struct dmsg_lnk_span {
471 dmsg_hdr_t head;
8c280d5d 472 uuid_t pfs_clid; /* rendezvous pfs uuid */
03d99ea4 473 uuid_t pfs_fsid; /* unique pfs id (differentiate node) */
2063f4d7
MD
474 uint8_t pfs_type; /* PFS type */
475 uint8_t peer_type; /* PEER type */
9b8b748f
MD
476 uint16_t proto_version; /* high level protocol support */
477 uint32_t status; /* status flags */
478 uint8_t reserved02[8];
03d99ea4
MD
479 uint32_t dist; /* span distance */
480 uint32_t rnss; /* random number sub-sort */
481 union {
482 uint32_t reserved03[14];
483 dmsg_media_block_t block;
484 } media;
485
486 /*
487 * NOTE: for PEER_HAMMER2 cl_label is typically empty and fs_label
488 * is the superroot directory name.
489 *
490 * for PEER_BLOCK cl_label is typically host/device and
491 * fs_label is typically the serial number string.
492 */
493 char cl_label[128]; /* cluster label */
494 char fs_label[128]; /* PFS label */
9b8b748f
MD
495};
496
5bc5bca2 497typedef struct dmsg_lnk_span dmsg_lnk_span_t;
42e2a62e 498
5bc5bca2 499#define DMSG_SPAN_PROTO_1 1
42e2a62e 500
9b8b748f 501/*
03d99ea4
MD
502 * LNK_CIRC - Establish a circuit
503 * (transaction, left open, circuit 0 only)
504 *
505 * Establish a circuit to the specified target. The msgid for the open
506 * transaction is used to transit messages in both directions.
507 *
508 * For circuit establishment the receiving entity looks up the outgoing
509 * relayed SPAN on the incoming iocom based on the target field and then
510 * creates peer circuit on the interface the SPAN originally came in on.
511 * Messages received on one side are forwarded to the other side and vice-versa.
512 * Any link state loss causes all related circuits to be lost.
1a34728c 513 */
03d99ea4
MD
514struct dmsg_lnk_circ {
515 dmsg_hdr_t head;
516 uint64_t reserved01;
517 uint64_t target;
518};
519
520typedef struct dmsg_lnk_circ dmsg_lnk_circ_t;
5bc5bca2
MD
521
522/*
03d99ea4
MD
523 * LNK_VOLCONF
524 *
5bc5bca2
MD
525 * All HAMMER2 directories directly under the super-root on your local
526 * media can be mounted separately, even if they share the same physical
527 * device.
528 *
529 * When you do a HAMMER2 mount you are effectively tying into a HAMMER2
530 * cluster via local media. The local media does not have to participate
3a5aa68f 531 * in the cluster, other than to provide the dmsg_vol_data[] array and
5bc5bca2
MD
532 * root inode for the mount.
533 *
534 * This is important: The mount device path you specify serves to bootstrap
535 * your entry into the cluster, but your mount will make active connections
3a5aa68f 536 * to ALL copy elements in the dmsg_vol_data[] array which match the
5bc5bca2
MD
537 * PFSID of the directory in the super-root that you specified. The local
538 * media path does not have to be mentioned in this array but becomes part
539 * of the cluster based on its type and access rights. ALL ELEMENTS ARE
540 * TREATED ACCORDING TO TYPE NO MATTER WHICH ONE YOU MOUNT FROM.
541 *
542 * The actual cluster may be far larger than the elements you list in the
3a5aa68f 543 * dmsg_vol_data[] array. You list only the elements you wish to
5bc5bca2
MD
544 * directly connect to and you are able to access the rest of the cluster
545 * indirectly through those connections.
546 *
547 * This structure must be exactly 128 bytes long.
548 *
549 * WARNING! dmsg_vol_data is embedded in the hammer2 media volume header
550 */
551struct dmsg_vol_data {
552 uint8_t copyid; /* 00 copyid 0-255 (must match slot) */
553 uint8_t inprog; /* 01 operation in progress, or 0 */
554 uint8_t chain_to; /* 02 operation chaining to, or 0 */
555 uint8_t chain_from; /* 03 operation chaining from, or 0 */
556 uint16_t flags; /* 04-05 flags field */
557 uint8_t error; /* 06 last operational error */
558 uint8_t priority; /* 07 priority and round-robin flag */
559 uint8_t remote_pfs_type;/* 08 probed direct remote PFS type */
560 uint8_t reserved08[23]; /* 09-1F */
561 uuid_t pfs_clid; /* 20-2F copy target must match this uuid */
562 uint8_t label[16]; /* 30-3F import/export label */
563 uint8_t path[64]; /* 40-7F target specification string or key */
564};
565
566typedef struct dmsg_vol_data dmsg_vol_data_t;
567
568#define DMSG_VOLF_ENABLED 0x0001
569#define DMSG_VOLF_INPROG 0x0002
570#define DMSG_VOLF_CONN_RR 0x80 /* round-robin at same priority */
571#define DMSG_VOLF_CONN_EF 0x40 /* media errors flagged */
572#define DMSG_VOLF_CONN_PRI 0x0F /* select priority 0-15 (15=best) */
573
0c3a8cd0
MD
574#define DMSG_COPYID_COUNT 256 /* WARNING! embedded in hammer2 vol */
575
5bc5bca2
MD
576struct dmsg_lnk_volconf {
577 dmsg_hdr_t head;
578 dmsg_vol_data_t copy; /* copy spec */
1a34728c
MD
579 int32_t index;
580 int32_t unused01;
581 uuid_t mediaid;
582 int64_t reserved02[32];
583};
584
5bc5bca2 585typedef struct dmsg_lnk_volconf dmsg_lnk_volconf_t;
1a34728c
MD
586
587/*
9ab15106
MD
588 * Debug layer ops operate on any link
589 *
590 * SHELL - Persist stream, access the debug shell on the target
591 * registration. Multiple shells can be operational.
592 */
5bc5bca2 593#define DMSG_DBG_SHELL DMSG_DBG(0x001, dmsg_dbg_shell)
9ab15106 594
5bc5bca2
MD
595struct dmsg_dbg_shell {
596 dmsg_hdr_t head;
9ab15106 597};
5bc5bca2 598typedef struct dmsg_dbg_shell dmsg_dbg_shell_t;
9ab15106
MD
599
600/*
9b8b748f
MD
601 * Domain layer ops operate on any link, link-0 may be used when the
602 * directly connected target is the desired registration.
603 *
604 * (nothing defined)
605 */
606
607/*
9ab15106
MD
608 * Cache layer ops operate on any link, link-0 may be used when the
609 * directly connected target is the desired registration.
610 *
611 * LOCK - Persist state, blockable, abortable.
612 *
613 * Obtain cache state (MODIFIED, EXCLUSIVE, SHARED, or INVAL)
614 * in any of four domains (TREE, INUM, ATTR, DIRENT) for a
615 * particular key relative to cache state already owned.
616 *
617 * TREE - Affects entire sub-tree at the specified element
618 * and will cause existing cache state owned by
619 * other nodes to be adjusted such that the request
620 * can be granted.
621 *
622 * INUM - Only affects inode creation/deletion of an existing
623 * element or a new element, by inumber and/or name.
624 * typically can be held for very long periods of time
625 * (think the vnode cache), directly relates to
626 * hammer2_chain structures representing inodes.
627 *
628 * ATTR - Only affects an inode's attributes, such as
629 * ownership, modes, etc. Used for lookups, chdir,
630 * open, etc. mtime has no effect.
631 *
632 * DIRENT - Only affects an inode's attributes plus the
633 * attributes or names related to any directory entry
634 * directly under this inode (non-recursively). Can
635 * be retained for medium periods of time when doing
636 * directory scans.
637 *
638 * This function may block and can be aborted. You may be
639 * granted cache state that is more broad than the state you
640 * requested (e.g. a different set of domains and/or an element
641 * at a higher layer in the tree). When quorum operations
642 * are used you may have to reconcile these grants to the
643 * lowest common denominator.
644 *
645 * In order to grant your request either you or the target
646 * (or both) may have to obtain a quorum agreement. Deadlock
647 * resolution may be required. When doing it yourself you
648 * will typically maintain an active message to each master
649 * node in the system. You can only grant the cache state
650 * when a quorum of nodes agree.
651 *
652 * The cache state includes transaction id information which
653 * can be used to resolve data requests.
654 */
5bc5bca2 655#define DMSG_CAC_LOCK DMSG_CAC(0x001, dmsg_cac_lock)
9ab15106
MD
656
657/*
658 * Quorum layer ops operate on any link, link-0 may be used when the
659 * directly connected target is the desired registration.
660 *
661 * COMMIT - Persist state, blockable, abortable
662 *
663 * Issue a COMMIT in two phases. A quorum must acknowledge
664 * the operation to proceed to phase-2. Message-update to
665 * proceed to phase-2.
666 */
5bc5bca2 667#define DMSG_QRM_COMMIT DMSG_QRM(0x001, dmsg_qrm_commit)
9ab15106
MD
668
669/*
03d99ea4
MD
670 * DMSG_PROTO_BLK Protocol
671 *
672 * BLK_OPEN - Open device. This transaction must be left open for the
673 * duration and the returned keyid passed in all associated
8d6d37b8
MD
674 * BLK commands. Multiple OPENs can be issued within the
675 * transaction.
03d99ea4 676 *
8d6d37b8
MD
677 * BLK_CLOSE - Close device. This can be used to close one of the opens
678 * within a BLK_OPEN transaction. It may NOT initiate a
679 * transaction. Note that a termination of the transaction
680 * (e.g. with LNK_ERROR or BLK_ERROR) closes all active OPENs
681 * for that transaction.
03d99ea4 682 *
8d6d37b8 683 * BLK_READ - Strategy read. Not typically streaming.
03d99ea4 684 *
8d6d37b8
MD
685 * BLK_WRITE - Strategy write. Not typically streaming.
686 *
687 * BLK_FLUSH - Strategy flush. Not typically streaming.
688 *
689 * BLK_FREEBLKS - Strategy freeblks. Not typically streaming.
03d99ea4
MD
690 */
691#define DMSG_BLK_OPEN DMSG_BLK(0x001, dmsg_blk_open)
8d6d37b8
MD
692#define DMSG_BLK_CLOSE DMSG_BLK(0x002, dmsg_blk_open)
693#define DMSG_BLK_READ DMSG_BLK(0x003, dmsg_blk_read)
694#define DMSG_BLK_WRITE DMSG_BLK(0x004, dmsg_blk_write)
695#define DMSG_BLK_FLUSH DMSG_BLK(0x005, dmsg_blk_flush)
696#define DMSG_BLK_FREEBLKS DMSG_BLK(0x006, dmsg_blk_freeblks)
03d99ea4
MD
697#define DMSG_BLK_ERROR DMSG_BLK(0xFFF, dmsg_blk_error)
698
699struct dmsg_blk_open {
700 dmsg_hdr_t head;
701 uint32_t modes;
702 uint32_t reserved01;
703};
704
705#define DMSG_BLKOPEN_RD 0x0001
706#define DMSG_BLKOPEN_WR 0x0002
707
708/*
709 * DMSG_LNK_ERROR is returned for simple results,
710 * DMSG_BLK_ERROR is returned for extended results.
711 */
712struct dmsg_blk_error {
713 dmsg_hdr_t head;
714 uint64_t keyid;
715 uint32_t resid;
716 uint32_t reserved02;
717 char buf[64];
718};
719
720struct dmsg_blk_read {
721 dmsg_hdr_t head;
722 uint64_t keyid;
723 uint64_t offset;
724 uint32_t bytes;
725 uint32_t flags;
726 uint32_t reserved01;
727 uint32_t reserved02;
728};
729
730struct dmsg_blk_write {
731 dmsg_hdr_t head;
732 uint64_t keyid;
733 uint64_t offset;
734 uint32_t bytes;
735 uint32_t flags;
736 uint32_t reserved01;
737 uint32_t reserved02;
738};
739
740struct dmsg_blk_flush {
741 dmsg_hdr_t head;
742 uint64_t keyid;
743 uint64_t offset;
744 uint32_t bytes;
745 uint32_t flags;
746 uint32_t reserved01;
747 uint32_t reserved02;
748};
749
750struct dmsg_blk_freeblks {
751 dmsg_hdr_t head;
752 uint64_t keyid;
753 uint64_t offset;
754 uint32_t bytes;
755 uint32_t flags;
756 uint32_t reserved01;
757 uint32_t reserved02;
758};
759
/* Short _t aliases for the BLK message structures. */
760typedef struct dmsg_blk_open dmsg_blk_open_t;
761typedef struct dmsg_blk_read dmsg_blk_read_t;
762typedef struct dmsg_blk_write dmsg_blk_write_t;
763typedef struct dmsg_blk_flush dmsg_blk_flush_t;
764typedef struct dmsg_blk_freeblks dmsg_blk_freeblks_t;
765typedef struct dmsg_blk_error dmsg_blk_error_t;
766
767/*
8c280d5d
MD
768 * NOTE!!!! ALL EXTENDED HEADER STRUCTURES MUST BE 64-BYTE ALIGNED!!!
769 *
9ab15106
MD
770 * General message errors
771 *
772 * 0x00 - 0x1F Local iocom errors
773 * 0x20 - 0x2F Global errors
774 */
5bc5bca2 775#define DMSG_ERR_NOSUPP 0x20 /* presumably "not supported" -- confirm */
03d99ea4 776#define DMSG_ERR_LOSTLINK 0x21 /* presumably "link lost" -- confirm */
8d6d37b8
MD
777#define DMSG_ERR_IO 0x22 /* generic */
778#define DMSG_ERR_PARAM 0x23 /* generic */
779#define DMSG_ERR_CANTCIRC 0x24 /* (typically means lost span) */
5bc5bca2
MD
780
/*
 * Overlay of the base header, the LNK and BLK message structures listed
 * below, and a raw buffer of DMSG_HDR_MAX bytes, so the union is at least
 * DMSG_HDR_MAX bytes in size.
 */
781union dmsg_any {
782 char buf[DMSG_HDR_MAX];
783 dmsg_hdr_t head;
03d99ea4 784
5bc5bca2 785 dmsg_lnk_conn_t lnk_conn;
03d99ea4
MD
786 dmsg_lnk_span_t lnk_span;
787 dmsg_lnk_circ_t lnk_circ;
5bc5bca2 788 dmsg_lnk_volconf_t lnk_volconf;
03d99ea4
MD
789
790 dmsg_blk_open_t blk_open;
791 dmsg_blk_error_t blk_error;
792 dmsg_blk_read_t blk_read;
793 dmsg_blk_write_t blk_write;
794 dmsg_blk_flush_t blk_flush;
795 dmsg_blk_freeblks_t blk_freeblks;
9ab15106
MD
796};
797
5bc5bca2 798typedef union dmsg_any dmsg_any_t;
9ab15106 799
3a5aa68f
MD
800/*
801 * Kernel iocom structures and prototypes for kern/kern_dmsg.c
802 */
840d1679 803#if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)
3a5aa68f
MD
804
/*
 * Forward declarations; the structures below refer to these types only
 * through pointers.
 */
805struct hammer2_pfsmount;
3a5aa68f
MD
806struct kdmsg_iocom;
807struct kdmsg_state;
808struct kdmsg_msg;
809
810/*
3a5aa68f
MD
811 * msg_ctl flags (atomic)
812 */
/* Each value is a distinct single bit, so the flags can be combined. */
813#define KDMSG_CLUSTERCTL_KILL 0x00000001
814#define KDMSG_CLUSTERCTL_KILLRX 0x00000002 /* staged helper exit */
815#define KDMSG_CLUSTERCTL_KILLTX 0x00000004 /* staged helper exit */
816#define KDMSG_CLUSTERCTL_SLEEPING 0x00000008 /* interlocked w/msglk */
817
818/*
03d99ea4
MD
819 * When the KDMSG_IOCOMF_AUTOFORGE flag is set the kdmsg code in
820 * the kernel automatically tries to forge a virtual circuit for
821 * any active SPAN state received.
822 *
823 * This is only done when the received SPANs are significantly filtered
824 * by the transmitted LNK_CONN. That is, it is done only by clients who
825 * connect to specific services over the cluster.
826 */
/*
 * Per-circuit tracking structure.  It can be linked into a red-black tree
 * (rbnode) and a tail queue (entry), and points at up to three kdmsg_state
 * structures.
 */
827struct kdmsg_circuit {
8d6d37b8
MD
828 RB_ENTRY(kdmsg_circuit) rbnode; /* indexed by msgid */
829 TAILQ_ENTRY(kdmsg_circuit) entry; /* written by shim */
830 struct kdmsg_iocom *iocom; /* written by shim */
03d99ea4 831 struct kdmsg_state *span_state; /* presumably the SPAN state the circuit belongs to -- confirm */
8d6d37b8
MD
832 struct kdmsg_state *circ_state; /* master circuit */
833 struct kdmsg_state *rcirc_state; /* slave circuit */
834 uint64_t msgid; /* the rbnode index key, per the rbnode comment */
03d99ea4 835 int weight;
8d6d37b8
MD
836 int recorded; /* written by shim */
837 int refs; /* written by shim */
03d99ea4
MD
838};
839
840typedef struct kdmsg_circuit kdmsg_circuit_t;
841
842/*
3a5aa68f
MD
843 * Transactional state structure, representing an open transaction. The
844 * transaction might represent a cache state (and thus have a chain
845 * association), or a VOP op, LNK_SPAN, or other things.
846 */
847struct kdmsg_state {
848 RB_ENTRY(kdmsg_state) rbnode; /* indexed by msgid */
03d99ea4 849 struct kdmsg_iocom *iocom; /* presumably the owning iocom -- confirm */
8d6d37b8 850 struct kdmsg_circuit *circ; /* separate from any.circ below */
03d99ea4 851 uint32_t icmd; /* record cmd creating state */
3a5aa68f
MD
852 uint32_t txcmd; /* mostly for CMDF flags */
853 uint32_t rxcmd; /* mostly for CMDF flags */
03d99ea4 854 uint64_t msgid; /* {circuit,msgid} uniq */
3a5aa68f
MD
855 int flags; /* presumably KDMSG_STATE_* bits -- confirm */
856 int error;
857 void *chain; /* (caller's state) */
858 struct kdmsg_msg *msg;
859 int (*func)(struct kdmsg_state *, struct kdmsg_msg *); /* callback taking (state, msg) */
860 union { /* one pointer, viewable as any of the three types below */
861 void *any;
862 struct hammer2_pfsmount *pmp;
03d99ea4 863 struct kdmsg_circuit *circ;
3a5aa68f
MD
864 } any;
865};
866
/* NOTE(review): assumed to be the bits stored in kdmsg_state.flags. */
867#define KDMSG_STATE_INSERTED 0x0001
868#define KDMSG_STATE_DYNAMIC 0x0002
869#define KDMSG_STATE_DELPEND 0x0004 /* transmit delete pending */
870
/*
 * A single message: queue linkage, pointers to its iocom, state and
 * circuit, size fields, an auxiliary data pointer, and the message body
 * stored inline as a dmsg_any_t.
 */
871struct kdmsg_msg {
872 TAILQ_ENTRY(kdmsg_msg) qentry; /* serialized queue */
03d99ea4 873 struct kdmsg_iocom *iocom;
3a5aa68f 874 struct kdmsg_state *state;
8d6d37b8 875 struct kdmsg_circuit *circ;
3a5aa68f
MD
876 size_t hdr_size; /* presumably bytes of 'any' in use -- confirm */
877 size_t aux_size; /* presumably the size of aux_data -- confirm */
878 char *aux_data;
03d99ea4 879 int flags; /* presumably KDMSG_FLAG_* bits -- confirm */
3a5aa68f
MD
880 dmsg_any_t any; /* inline header / extended header storage */
881};
882
03d99ea4
MD
883#define KDMSG_FLAG_AUXALLOC 0x0001 /* NOTE(review): name suggests aux_data was allocated -- confirm */
884
3a5aa68f
MD
/*
 * Short _t aliases.
 * NOTE(review): struct kdmsg_link is not defined anywhere in this section;
 * the typedef may be a leftover -- confirm before relying on it.
 */
885typedef struct kdmsg_link kdmsg_link_t;
886typedef struct kdmsg_state kdmsg_state_t;
887typedef struct kdmsg_msg kdmsg_msg_t;
888
/*
 * Red-black tree of kdmsg_state structures, linked through their rbnode
 * field and ordered by kdmsg_state_cmp().
 */
889struct kdmsg_state_tree;
3a5aa68f 890int kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2);
8d6d37b8 891RB_HEAD(kdmsg_state_tree, kdmsg_state);
3a5aa68f
MD
892RB_PROTOTYPE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);
893
8d6d37b8
MD
/*
 * Red-black tree of kdmsg_circuit structures, linked through their rbnode
 * field and ordered by kdmsg_circuit_cmp().
 */
894struct kdmsg_circuit_tree;
895int kdmsg_circuit_cmp(kdmsg_circuit_t *circ1, kdmsg_circuit_t *circ2);
896RB_HEAD(kdmsg_circuit_tree, kdmsg_circuit);
897RB_PROTOTYPE(kdmsg_circuit_tree, kdmsg_circuit, rbnode, kdmsg_circuit_cmp);
898
3a5aa68f
MD
899/*
900 * Structure embedded in e.g. mount, master control structure for
901 * DMSG stream handling.
902 */
903struct kdmsg_iocom {
904 struct malloc_type *mmsg; /* presumably the mmsg passed to kdmsg_iocom_init() -- confirm */
905 struct file *msg_fp; /* cluster pipe->userland */
906 thread_t msgrd_td; /* cluster thread */
907 thread_t msgwr_td; /* cluster thread */
908 int msg_ctl; /* wakeup flags */
909 int msg_seq; /* cluster msg sequence id */
03d99ea4 910 uint32_t flags; /* presumably KDMSG_IOCOMF_* bits -- confirm */
3a5aa68f
MD
911 struct lock msglk; /* lockmgr lock */
912 TAILQ_HEAD(, kdmsg_msg) msgq; /* transmit queue */
913 void *handle; /* presumably the handle passed to kdmsg_iocom_init() -- confirm */
03d99ea4
MD
914 void (*auto_callback)(kdmsg_msg_t *);
915 int (*rcvmsg)(kdmsg_msg_t *); /* same signature as the rcvmsg argument of kdmsg_iocom_init() */
ddfbb283 916 void (*exit_func)(struct kdmsg_iocom *);
3a5aa68f
MD
917 struct kdmsg_state *conn_state; /* active LNK_CONN state */
918 struct kdmsg_state *freerd_state; /* allocation cache */
919 struct kdmsg_state *freewr_state; /* allocation cache */
920 struct kdmsg_state_tree staterd_tree; /* active messages */
921 struct kdmsg_state_tree statewr_tree; /* active messages */
8d6d37b8 922 struct kdmsg_circuit_tree circ_tree; /* active circuits */
03d99ea4
MD
923 dmsg_lnk_conn_t auto_lnk_conn;
924 dmsg_lnk_span_t auto_lnk_span;
3a5aa68f
MD
925};
926
927typedef struct kdmsg_iocom kdmsg_iocom_t;
928
03d99ea4
MD
929#define KDMSG_IOCOMF_AUTOCONN 0x0001 /* handle received LNK_CONN */
930#define KDMSG_IOCOMF_AUTOSPAN 0x0002 /* handle received LNK_SPAN */
931#define KDMSG_IOCOMF_AUTOCIRC 0x0004 /* handle received LNK_CIRC */
932#define KDMSG_IOCOMF_AUTOFORGE 0x0008 /* auto initiate LNK_CIRC */
8d6d37b8 933#define KDMSG_IOCOMF_EXITNOACC 0x0010 /* cannot accept writes */
03d99ea4
MD
934
/* Mask of the four AUTO* bits; KDMSG_IOCOMF_EXITNOACC is not included. */
935#define KDMSG_IOCOMF_AUTOANY (KDMSG_IOCOMF_AUTOCONN | \
936 KDMSG_IOCOMF_AUTOSPAN | \
8d6d37b8
MD
937 KDMSG_IOCOMF_AUTOCIRC | \
938 KDMSG_IOCOMF_AUTOFORGE)
03d99ea4 939
3a5aa68f
MD
/*
 * CRC helpers over a buffer of 'size' bytes.  The second form takes an
 * additional 'crc' argument (presumably a running value to continue from --
 * confirm against the implementation).
 */
940uint32_t kdmsg_icrc32(const void *buf, size_t size);
941uint32_t kdmsg_icrc32c(const void *buf, size_t size, uint32_t crc);
942
943/*
944 * kern_dmsg.c
945 */
/*
 * iocom setup / teardown entry points.
 * NOTE(review): kdmsg_iocom_init() spells its flags type u_int32_t while the
 * rest of this header uses uint32_t; consider unifying.
 */
03d99ea4 946void kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, u_int32_t flags,
3a5aa68f 947 struct malloc_type *mmsg,
03d99ea4 948 int (*rcvmsg)(kdmsg_msg_t *msg));
3a5aa68f
MD
949void kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
950 const char *subsysname);
03d99ea4
MD
951void kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
952 void (*conn_callback)(kdmsg_msg_t *msg));
185ace93 953void kdmsg_iocom_uninit(kdmsg_iocom_t *iocom);
3a5aa68f
MD
954void kdmsg_drain_msgq(kdmsg_iocom_t *iocom);
955
/*
 * Message allocation, transmission and reply entry points.  The func
 * argument has the same signature as kdmsg_state.func; data is presumably
 * caller-private context for that callback -- confirm against kern_dmsg.c.
 */
3a5aa68f 956void kdmsg_msg_free(kdmsg_msg_t *msg);
8d6d37b8 957kdmsg_msg_t *kdmsg_msg_alloc(kdmsg_iocom_t *iocom, kdmsg_circuit_t *circ,
03d99ea4 958 uint32_t cmd,
3a5aa68f
MD
959 int (*func)(kdmsg_state_t *, kdmsg_msg_t *),
960 void *data);
8d6d37b8
MD
961kdmsg_msg_t *kdmsg_msg_alloc_state(kdmsg_state_t *state, uint32_t cmd,
962 int (*func)(kdmsg_state_t *, kdmsg_msg_t *),
963 void *data);
3a5aa68f
MD
964void kdmsg_msg_write(kdmsg_msg_t *msg);
965void kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error);
966void kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error);
03d99ea4
MD
967void kdmsg_state_reply(kdmsg_state_t *state, uint32_t error);
968void kdmsg_state_result(kdmsg_state_t *state, uint32_t error);
3a5aa68f 969
8d6d37b8
MD
/*
 * Circuit hold/drop pair (presumably adjusts kdmsg_circuit.refs -- confirm
 * against kern_dmsg.c).
 */
970void kdmsg_circ_hold(kdmsg_circuit_t *circ);
971void kdmsg_circ_drop(kdmsg_circuit_t *circ);
972
973
3a5aa68f
MD
974#endif
975
9ab15106 976#endif