2 * Copyright (c) 2004-2007 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/sys/syslink_msg.h,v 1.7 2007/04/26 02:11:00 dillon Exp $
37 * The syslink infrastructure implements an optimized RPC mechanism across a
38 * communications link. Endpoints, defined by a sysid, are typically
39 * associated with system structures but do not have to be.
41 * This header file is primarily responsible for the formatting of message
42 * traffic over a syslink.
45 #ifndef _SYS_SYSLINK_MSG_H_
46 #define _SYS_SYSLINK_MSG_H_
49 #include <sys/types.h>
51 #ifndef _MACHINE_ATOMIC_H_
52 #include <machine/atomic.h>
/*
 * Scalar types used by the syslink message and element headers.
 * sl_reclen_t is the type of the byte-length and payload-offset header fields.
 */
55 typedef u_int32_t sl_msgid_t; /* transaction sequencing */
56 typedef u_int32_t sl_auxdata_t; /* auxiliary data element */
57 typedef u_int16_t sl_cmd_t; /* command or error */
58 typedef u_int16_t sl_error_t; /* NOTE(review): presumably an error code; no user is visible here */
59 typedef u_int16_t sl_itemid_t; /* item id */
60 typedef u_int16_t sl_reclen_t; /* item length */
/*
 * Alignment, in bytes, of syslink messages and elements, and the matching
 * mask of low-order bits (SL_ALIGN - 1 == 7) that are clear in an aligned
 * size.
 */
62 #define SL_ALIGN 8 /* 8-byte alignment */
63 #define SL_ALIGNMASK (SL_ALIGN - 1)
66 * The msgid is used to control transaction sequencing within a session, but
67 * also has a special meaning to the transport layer. A msgid of 0 indicates
68 * a PAD syslink message, used to pad FIFO buffers to prevent messages from
69 * being bisected by the end of the buffer. Since all structures are 8-byte
70 * aligned, 8-byte PAD messages are allowed. All other messages must be
71 * at least sizeof(syslink_msg).
73 * The reclen is the actual record length in bytes prior to alignment.
74 * The reclen must be aligned to obtain the actual size of a syslink_msg
75 * or syslink_item structure. Note that the reclen includes structural
76 * headers (i.e. it does not represent just the data payload, it represents
77 * the entire structure).
79 * Syslink messages allow special treatment for large data payloads, allowing
80 * the transport mechanism to separate the data payload into its own buffer
81 * or DMA area (for example, its own page), facilitating DMA and page-mapping
82 * operations at the end points while allowing the message to be maximally
83 * compressed during transport. This is typically handled by special casing
84 * a readv() or writev().
86 * Sessions are identified with a session id. The session id is a rendezvous
87 * id that associates physical and logical routing information with a single
88 * sysid, allowing us to both avoid storing the source and target logical id
89 * in the syslink message AND ALSO providing a unique session id and validator
90 * which manages the abstracted 'connection' between two entities. This
93 * The target physical address is deconstructed as the message hops across
94 * the mesh. All 0's, or all 0's remaining indicates a link layer message
95 * to be processed by the syslink route node itself. All 1's indicates
96 * a broadcast message. Broadcast messages also require special attention.
97 * Sending a message to a target address of 0 basically sends it to the
98 * nearest route node as a link layer message.
100 * The source physical address normally starts out as 0 and is constructed
101 * as the message hops across the mesh. The target can use the constructed
102 * source address to respond to the originator of the message (as it must
103 * if it has no knowledge about the session). A target with knowledge
104 * of the session id has the option of forging its own return path.
106 * Checksums are the responsibility of higher layers but message checking
107 * elements can be negotiated or required as part of the syslink message's
111 sl_msgid_t sh_msgid; /* message transaction control */
112 sl_reclen_t sh_payloadoff; /* offset of payload as a DMA aid */
113 sl_reclen_t sh_bytes; /* unaligned size of message */
114 /* minimum syslink_msg size is 8 bytes (special PAD) */
115 sysid_t sh_sessid; /* session id */
116 sysid_t sh_srcphysid; /* transit routing */
117 sysid_t sh_dstphysid; /* transit routing */
118 /* 8-byte aligned structure */
119 /* followed by structured data */
123 * MSGID handling. This controls message transactions and PAD. Terminal
124 * nodes, such as filesystems, are state driven entities whose syslink
125 * message transactions are directly supported by the local on-machine route
126 * nodes they connect to. The route nodes use various fields in the header,
127 * particularly sh_msgid, sh_sessid, and sh_payloadoff, to optimally present
128 * syslink messages to the terminal node. In particular, a route node may
129 * present the payload for a syslink message or the message itself through
130 * some out-of-band means, such as by mapping it into memory.
132 * These route nodes also handle timeout and retry processing, providing
133 * appropriate response messages to terminal nodes if the target never replies
134 * to a transaction or some other exceptional condition occurs. The route
135 * node does not handle RETRY and other exceptional conditions itself..
136 * that is, the route node is not responsible for storing the message, only
137 * routing it. The route node only tracks the related session(s).
139 * A route node only directly supports terminal nodes directly connected to
140 * it. Intermediate route nodes ignore the MSGID (other than the all 0's PAD
141 * case) and do not track indirect sessions. For example, a piece of
142 * hardware doing syslink message routing does not have to mess with
145 * A session id establishes a session between two entities. One terminal node
146 * is considered to be the originator of the session, the other terminal node
147 * is the target. However, once established, EITHER ENTITY may initiate
148 * a transaction (or both simultaneously). SH_MSGID_CMD_ORIGINATOR is used
149 * in all messages and replies related to a transaction initiated by the
150 * session originator, and SH_MSGID_CMD_TARGET is used in all messages and
151 * replies related to a transaction initiated by the session target.
152 * Establishment of new sessions uses SH_MSGID_CMD_ESTABLISH.
154 * Parallel transactions are supported by using different transaction ids
155 * amongst the parallel transactions. Once a transaction id is used, it
156 * may not be reused until after the timeout period is exceeded. With 23
157 * transaction id bits we have 8 million transaction ids, supporting around
158 * 26000 transactions per second with a 5 minute timeout. Note that
159 * multiple sessions may be established between any two entities, giving us
160 * essentially an unlimited number of transactions per second.
162 * ENDIANNESS - syslink messages may be transported with any endianness. This
163 * includes all fields including the syslink header and syslink element
164 * header fields. If upon reception SH_MSGID_ENDIAN_NORM is set in the msgid
165 * both end-points will have the same endianness and no translation is
166 * required. If SH_MSGID_ENDIAN_REV is set then the two end-points have
167 * different endianness and translation is required. Only little endian and
168 * big endian transport is supported (that is, a simple reversal of bytes for
171 * Intermediate route nodes (i.e. those not tracking the session) may NOT
172 * translate the endianness of the message in any fashion. The management
173 * node that talks to the actual resource is responsible for doing the
174 * endian translations for all the above fields... everything except the
175 * syslink_elm payload, which is described later.
/*
 * Smallest legal syslink message: the portion of the header that precedes
 * sh_sessid (sh_msgid, sh_payloadoff and sh_bytes).  Only PAD messages may
 * be this short; every other message must be at least
 * sizeof(struct syslink_msg).
 *
 * The member named here must be sh_sessid; struct syslink_msg has no
 * sm_sessid member, so the previous spelling could not compile when used.
 */
#define SL_MIN_MESSAGE_SIZE offsetof(struct syslink_msg, sh_sessid)

/*
 * Round a byte count up to the next multiple of 8, the syslink alignment.
 * The argument is evaluated exactly once.
 */
#define SL_MSG_ALIGN(bytes) (((bytes) + 7) & ~7)
/*
 * Bit layout of the 32-bit message id (sl_msgid_t):
 *
 *   0xF0000000  command code (SH_MSGID_CMD_*)
 *   0x08000000  SH_MSGID_REPLY
 *   0x01000000  SH_MSGID_ENDIAN_NORM
 *   0x00FFFFFE  transaction id (bits 1-23)
 *   0x00000001  SH_MSGID_ENDIAN_REV
 *
 * SH_MSGID_ENDIAN_NORM and SH_MSGID_ENDIAN_REV are byte-swapped images of
 * each other, so a receiver sees NORM set when the sender's byte order
 * matches its own and REV set when the bytes arrive reversed.
 */
180 #define SH_MSGID_CMD_MASK 0xF0000000
181 #define SH_MSGID_CMD_HEARTBEAT 0x60000000 /* seed heartbeat broadcast */
182 #define SH_MSGID_CMD_TIMESYNC 0x50000000 /* timesync broadcast */
183 #define SH_MSGID_CMD_ALLOCATE 0x40000000 /* allocate session id space */
184 #define SH_MSGID_CMD_ORIGINATOR 0x30000000 /* origin initiated trans */
185 #define SH_MSGID_CMD_TARGET 0x20000000 /* target initiated trans */
186 #define SH_MSGID_CMD_ESTABLISH 0x10000000 /* establish session */
187 #define SH_MSGID_CMD_PAD 0x00000000
/* NOTE(review): SH_MSGID_REPLY presumably marks a reply to a transaction - confirm */
189 #define SH_MSGID_REPLY 0x08000000
190 #define SH_MSGID_ENDIAN_NORM 0x01000000
191 #define SH_MSGID_ENDIAN_REV 0x00000001
/* NOTE(review): SM_ prefix differs from the SH_ prefix used by the other msgid macros */
192 #define SM_MSGID_TRANS_MASK 0x00FFFFFE /* 23 bits */
195 * A syslink message is broken up into three pieces: (1) The headers, (2) The
196 * message elements, and (3) DMA payload.
198 * A non-PAD syslink message contains a single top-level message element.
199 * Unlike recursive message elements which can be iterated, the top level
200 * element is never iterated. There is always only one. The top level
201 * element is usually structured but does not have to be. The top level
202 * element's aux field represents the RPC protocol id for the command.
204 * A PAD syslink message contains no message elements. The entire syslink
205 * message is considered pad based on the header.
207 * A structured syslink message element may be specified by setting
208 * SE_CMDF_STRUCTURED. The data payload for a structured message element
209 * is a sequence of ZERO or MORE message elements until the payload size is
210 * reached. Each message element may be opaque or structured. Fully
211 * recursive message elements are supported in this manner.
213 * A syslink message element with SE_CMDF_MASTERPAYLOAD set is associated
214 * with the master payload for the syslink message as a whole. This field
215 * is only interpreted by terminal nodes and does not have to be used this
216 * way, but it's a good idea to do so for debugging purposes.
218 * Syslink message elements are always 8-byte aligned. In order to
219 * guarantee an 8-byte alignment for our extended data, a 32 bit auxiliary
220 * field is always included as part of the official syslink_elm structure
221 * definition. This field is actually part of the element command's data
222 * and its use, if any, depends on the element command.
224 * Syslink message elements do not have to be validated by intermediate
225 * route nodes but must ALWAYS be validated by the route node that connects
226 * to the terminal node intended to receive the syslink message.
228 * Only the header fields of a syslink_elm are translated for endianness
229 * by the management node. If the management node does have to do an
230 * endian conversion it will also set SE_CMDF_UNTRANSLATED in se_cmd (all
231 * of them, recursively, since it has to validate and translate the entire
232 * hierarchy anyway) and the rpc mechanism will be responsible for doing
233 * the conversion and clearing the flag. The seu_proto field IS always
234 * translated, which means that when used as aux data it must be referenced
237 * As a fringe benefit, since the RPC command is the entire se_cmd field,
238 * flags and all, an untranslated element will wind up with an unrecognized
239 * command code and be reported as an error rather than being mis-executed.
243 sl_reclen_t se_bytes;
245 sl_auxdata_t seu_aux; /* aux data */
246 sl_auxdata_t seu_proto; /* protocol field */
248 /* extended by data */
/*
 * Flag bits carried in the high bits of an element's se_cmd field.  The
 * flags are part of the command code itself.
 */
251 #define SE_CMDF_STRUCTURED 0x8000 /* structured, else opaque */
252 #define SE_CMDF_RESERVED4000 0x4000 /* reserved bit, no meaning assigned here */
253 #define SE_CMDF_MASTERPAYLOAD 0x2000 /* DMA payload association */
254 #define SE_CMDF_UNTRANSLATED 0x1000 /* needs endian translation */
256 #define SE_CMD_PAD 0x0000 /* CMD 0 is always PAD */
/* Pointer typedefs for the syslink message header and message element. */
258 typedef struct syslink_msg *syslink_msg_t;
259 typedef struct syslink_elm *syslink_elm_t;