2 * Copyright (c) 2004 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/sys/mountctl.h,v 1.4 2004/12/31 23:48:06 dillon Exp $
41 #define JIDMAX 32 /* id string buf[] size (incls \0) */
44 * Data structures for the journaling API
47 #define MOUNTCTL_INSTALL_VFS_JOURNAL 1
48 #define MOUNTCTL_REMOVE_VFS_JOURNAL 2
49 #define MOUNTCTL_RESYNC_VFS_JOURNAL 3
50 #define MOUNTCTL_JOURNAL_VFS_STATUS 4
52 #define MOUNTCTL_INSTALL_BLK_JOURNAL 8
53 #define MOUNTCTL_REMOVE_BLK_JOURNAL 9
54 #define MOUNTCTL_RESYNC_BLK_JOURNAL 10
55 #define MOUNTCTL_JOURNAL_BLK_STATUS 11
57 struct mountctl_install_journal {
59 int flags; /* journaling flags */
61 int64_t membufsize; /* backing store */
62 int64_t swapbufsize; /* backing store */
63 int64_t transid; /* starting with specified transaction id */
65 int stallwarn; /* stall warning (seconds) */
66 int stallerror; /* stall error (seconds) */
71 #define MC_JOURNAL_ACTIVE 0x00000001 /* journal is active */
72 #define MC_JOURNAL_STOP_REQ 0x00000002 /* stop request pending */
73 #define MC_JOURNAL_STOP_IMM 0x00000004 /* STOP+trash fifo */
74 #define MC_JOURNAL_WWAIT 0x00000040 /* write stall */
75 #define MC_JOURNAL_WANT_AUDIT 0x00010000 /* audit trail */
76 #define MC_JOURNAL_WANT_REVERSABLE 0x00020000 /* reversible stream */
78 struct mountctl_remove_journal {
83 #define MC_JOURNAL_REMOVE_TRASH 0x00000001 /* data -> trash */
84 #define MC_JOURNAL_REMOVE_ASSYNC 0x00000002 /* asynchronous op */
86 struct mountctl_journal_status {
94 int64_t swapbufqueued;
96 int64_t transidcurrent;
97 int64_t transidqueued;
101 struct timeval lastack;
104 #define MC_JOURNAL_STATUS_NEXT 0x80000000 /* find next id */
107 * Physical file format (binary)
109 * All raw records are 128-bit aligned, but all record sizes are actual.
110 * This means that any scanning code must 16-byte-align the recsize field
111 * when calculating skips. The top level raw record has a header and a
112 * trailer to allow both forwards and backwards scanning of the journal.
113 * The alignment requirement allows the worker thread FIFO reservation
114 * API to operate efficiently, among other things.
116 * Logical data stream records are usually no larger than the journal's
117 * in-memory FIFO, since the journal's transactional APIs return contiguous
118 * blocks of buffer space and since logical stream records are used to avoid
119 * stalls when concurrent blocking operations are being written to the journal.
120 * Programs can depend on a logical stream record being a 'reasonable' size.
122 * Multiple logical data streams may operate concurrently in the journal,
123 * reflecting the fact that the system may be executing multiple blocking
124 * operations on the filesystem all at the same time. These logical data
125 * streams are short-lived transactional entities which use a 13 bit id
126 * plus a transaction start bit, end bit, and abort bit.
128 * Stream identifiers in the 0x00-0xFF range are special and not used for
129 * normal transactional commands.
131 * Stream id 0x00 indicates that no other streams should be active at that
132 * point in the journal, which helps the journaling code detect corruption.
134 * Stream id 0x01 is used for pad. Pads are used to align data on convenient
135 * boundaries and to deal with dead space.
137 * Stream id 0x02 indicates a discontinuity in the streamed data and typically
138 * contains information relating to the reason for the discontinuity.
139 * JTYPE_ASSOCIATE and JTYPE_DISASSOCIATE are usually emplaced in stream 0x02.
141 * Stream id 0x03 may be used to annotate the journal with text comments
142 * via mountctl commands. This can be extremely useful to note situations
143 * that may help with later recovery or audit operations.
145 * Stream id 0x04-0x7F are reserved by DragonFly for future protocol expansion.
147 * Stream id 0x80-0xFF may be used for third-party protocol expansion.
149 * Stream id's 0x0100-0x1FFF typically represent short-lived transactions
150 * (i.e. an id may be reused once the previous use has completed). The
151 * journaling system runs through these id's sequentially which means that
152 * the journaling code can handle up to 8192-256 = 7936 simultaneous
153 * transactions at any given moment.
155 * The sequence number field is context-sensitive. It is typically used by
156 * a journaling stream to provide an incrementing counter and/or timestamp
157 * so recovery utilities can determine if any data is missing.
159 * The check word in the trailer may be used to provide an integrity check
160 * on the journaled data. A value of 0 always means that no check word
161 * has been calculated.
163 * The journal_rawrecbeg structure MUST be a multiple of 16 bytes.
164 * The journal_rawrecend structure MUST be a multiple of 8 bytes.
166 * NOTE: PAD RECORD SPECIAL CASE. Pad records are 16 bytes and have the
167 * rawrecend structure overlaid on the sequence number field of the
168 * rawrecbeg structure. This is necessary because stream records are
169 * 16 byte aligned, not 24 byte aligned, and dead space is not allowed.
170 * So the pad record must fit into any dead space.
172 struct journal_rawrecbeg {
173 u_int16_t begmagic; /* recovery scan, endianess detection */
174 u_int16_t streamid; /* start/stop bits and stream identifier */
175 int32_t recsize; /* stream data block (incls beg & end) */
176 int64_t seqno; /* sequence number or transaction id */
177 /* ADDITIONAL DATA */
180 struct journal_rawrecend {
181 u_int16_t endmagic; /* recovery scan, endianess detection */
182 u_int16_t check; /* check word or 0 */
183 int32_t recsize; /* same as rawrecbeg->recsize, for rev scan */
187 * Constants for stream record magic numbers. The incomplete magic
188 * number code is used internally by the memory FIFO reservation API
189 * and worker thread, allowing a block of space in the journaling
190 * stream (aka a stream block) to be reserved and then populated without
191 * stalling other threads doing their own reservation and population.
193 #define JREC_BEGMAGIC 0x1234
194 #define JREC_ENDMAGIC 0xCDEF
195 #define JREC_INCOMPLETEMAGIC 0xFFFF
198 * Stream ids are 13 bits. The top 3 bits flag when a new logical stream
199 * is being created (BEGIN), an existing logical stream is being terminated
200 * (END), or a stream was aborted (ABORTED). A single raw stream record will
201 * set both the BEGIN and END bits if the entire transaction is encapsulated in it.
203 #define JREC_STREAMCTL_MASK 0xE000 /* BEGIN|END|ABORTED control bits */
204 #define JREC_STREAMCTL_BEGIN 0x8000 /* start a new logical stream */
205 #define JREC_STREAMCTL_END 0x4000 /* terminate a logical stream */
206 #define JREC_STREAMCTL_ABORTED 0x2000 /* set on the ending record of an aborted stream */
208 #define JREC_STREAMID_MASK 0x1FFF /* low 13 bits: the stream id itself */
209 #define JREC_STREAMID_SYNCPT (JREC_STREAMCTL_BEGIN|JREC_STREAMCTL_END|0x0000) /* id 0x00: no other streams active */
210 #define JREC_STREAMID_PAD (JREC_STREAMCTL_BEGIN|JREC_STREAMCTL_END|0x0001) /* id 0x01: pad record */
211 #define JREC_STREAMID_DISCONT 0x0002 /* discontinuity */
212 #define JREC_STREAMID_ANNOTATE 0x0003 /* annotation */
213 /* 0x0004-0x007F reserved by DragonFly */
214 /* 0x0080-0x00FF for third party use */
215 #define JREC_STREAMID_JMIN 0x0100 /* lowest allowed general id */
216 #define JREC_STREAMID_JMAX 0x2000 /* (one past the highest allowed id) */
218 #define JREC_DEFAULTSIZE 64 /* reasonable initial reservation */
221 * Each logical journaling stream typically represents a transaction...
222 * that is, a VFS operation. The VFS operation is written out using
223 * sub-records and may contain multiple, possibly nested sub-transactions.
224 * Multiple sub-transactions occur when a VFS operation cannot be represented
225 * by a single command. This is typically the case when a journal is
226 * configured to be reversible because UNDO sequences almost always have to
227 * be specified in such cases. For example, if you ftruncate() a file the
228 * journal might have to write out a sequence of WRITE records representing
229 * the lost data, otherwise the journal would not be reversible.
230 * Sub-transactions within a particular stream do not have their own sequence
231 * number field and thus may not be parallelized (the protocol is already
234 * In order to support streaming operation with a limited buffer the recsize
235 * field is allowed to be 0 for subrecords with the JMASK_NESTED bit set.
236 * If this case occurs a scanner can determine that the recursion has ended
237 * by detecting a nested subrecord with the JMASK_LAST bit set. A scanner
238 * may also set the field to the proper value after the fact to make later
239 * operations more efficient.
241 * Note that the JMASK_LAST bit must be properly set even if the recsize
242 * field is non-zero. The recsize must always be properly specified for 'leaf'
243 * subrecords, however in order to allow subsystems to potentially allocate
244 * more data space than they use the protocol allows any 'dead' space to be
245 * filled with JLEAF_PAD records.
247 * The recsize field may indicate data well past the size of the current
248 * raw stream record. That is, the scanner may have to glue together
249 * multiple stream records with the same stream id to fully decode the
250 * embedded subrecords. In particular, a subrecord could very well represent
251 * hundreds of megabytes of data (e.g. if a program were to do a
252 * multi-megabyte write()) and be split up across thousands of raw streaming
253 * records, possibly interlaced with other unrelated streams from other
254 * unrelated processes.
256 * If a large sub-transaction is aborted the logical stream may be
257 * terminated without writing out all the expected data. When this occurs
258 * the stream's ending record must also have the JREC_STREAMCTL_ABORTED bit
259 * set. However, scanners should still be robust enough to detect such
260 * overflows even if the aborted bit is not set and consider them data
263 * Aborts may also occur in the normal course of operations, especially once
264 * the journaling API is integrated into the cache coherency API. A normal
265 * abort is issued by emplacing a JLEAF_ABORT record within the transaction
266 * being aborted. Such records must be the last record in the sub-transaction,
267 * so JMASK_LAST is also usually set. In a transaction with many
268 * sub-transactions only those sub-transactions with an abort record are
269 * aborted, the rest remain valid. Abort records are considered S.O.P. for
270 * two reasons: First, limited memory buffer space may make it impossible
271 * to delete the portion of the stream being aborted (the data may have
272 * already been sent to the target). Second, the journaling code will
273 * eventually be used to support a cache coherency layer which may have to
274 * abort operations as part of the cache coherency protocol. Note that
275 * subrecord aborts are different from stream record aborts. Stream record
276 * aborts are considered to be extraordinary situations while subrecord aborts
280 struct journal_subrecord {
281 int16_t rectype; /* 2 control bits, 14 record type bits */
282 int16_t reserved; /* future use */
283 int32_t recsize; /* record size (mandatory if not NESTED) */
284 /* ADDITIONAL DATA */
287 #define JMASK_NESTED 0x8000 /* data is a nested recursion */
288 #define JMASK_LAST 0x4000 /* marks the end of a nested recursion */
290 #define JLEAF_PAD 0x0000 /* fills dead space in a subrecord */
291 #define JLEAF_ABORT 0x0001 /* aborts the enclosing sub-transaction */
292 #define JTYPE_ASSOCIATE 0x0002 /* usually emplaced in stream 0x02 */
293 #define JTYPE_DISASSOCIATE 0x0003 /* usually emplaced in stream 0x02 */
294 #define JTYPE_UNDO (JMASK_NESTED|0x0004)
295 #define JTYPE_AUDIT (JMASK_NESTED|0x0005)
297 #define JTYPE_SETATTR (JMASK_NESTED|0x0010)
298 #define JTYPE_WRITE (JMASK_NESTED|0x0011)
299 #define JTYPE_PUTPAGES (JMASK_NESTED|0x0012)
300 #define JTYPE_SETACL (JMASK_NESTED|0x0013)
301 #define JTYPE_SETEXTATTR (JMASK_NESTED|0x0014)
302 #define JTYPE_CREATE (JMASK_NESTED|0x0015)
303 #define JTYPE_MKNOD (JMASK_NESTED|0x0016)
304 #define JTYPE_LINK (JMASK_NESTED|0x0017)
305 #define JTYPE_SYMLINK (JMASK_NESTED|0x0018)
306 #define JTYPE_WHITEOUT (JMASK_NESTED|0x0019)
307 #define JTYPE_REMOVE (JMASK_NESTED|0x001A)
308 #define JTYPE_MKDIR (JMASK_NESTED|0x001B)
309 #define JTYPE_RMDIR (JMASK_NESTED|0x001C)
310 #define JTYPE_RENAME (JMASK_NESTED|0x001D)
313 * Low level record types
315 #define JLEAF_FILEDATA 0x0401
316 #define JLEAF_PATH1 0x0402
317 #define JLEAF_PATH2 0x0403
318 #define JLEAF_PATH3 0x0404
319 #define JLEAF_PATH4 0x0405
320 #define JLEAF_UID 0x0406
321 #define JLEAF_GID 0x0407
322 #define JLEAF_MODES 0x0408
323 #define JLEAF_FFLAGS 0x0409
324 #define JLEAF_PID 0x040A
325 #define JLEAF_PPID 0x040B
326 #define JLEAF_COMM 0x040C
327 #define JLEAF_RESERVED_0D 0x040D
328 #define JLEAF_RESERVED_0E 0x040E
329 #define JLEAF_RESERVED_0F 0x040F
330 #define JLEAF_SYMLINKDATA 0x0410
331 #define JLEAF_SEEKPOS 0x0411
332 #define JLEAF_INUM 0x0412
334 #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)
337 * Support structures for the generic journaling structure
339 struct journal_memfifo {
340 int size; /* size (power of two) */
341 int mask; /* index mask (size - 1) */
342 int rindex; /* stream reader index (track fd writes) */
343 int xindex; /* last acked / reader restart */
344 int windex; /* stream writer index */
345 char *membase; /* memory buffer representing the FIFO */
349 * Generic journaling structure attached to a mount point.
352 TAILQ_ENTRY(journal) jentry;
355 int flags; /* journaling flags */
357 struct journal_memfifo fifo;
358 struct thread thread;
362 * The jrecord structure is used to build a journaling transaction. Since
363 * a single journaling transaction might encompass very large buffers it
364 * is possible for multiple transactions to be written out to the FIFO
365 * in parallel and piecemeal.
372 struct journal_rawrecbeg *rawp;
373 struct journal_subrecord *parent;
374 struct journal_subrecord *last;