gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2004-2006 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* $DragonFly: src/sys/kern/vfs_journal.c,v 1.33 2007/05/09 00:53:34 dillon Exp $
	35	*/
	36	/*
	37	* The journaling protocol is intended to evolve into a two-way stream
	38	* whereby transaction IDs can be acknowledged by the journaling target
	39	* when the data has been committed to hard storage. Both implicit and
	40	* explicit acknowledgement schemes will be supported, depending on the
	41	* sophistication of the journaling stream, plus resynchronization and
	42	* restart when a journaling stream is interrupted. This information will
	43	* also be made available to journaling-aware filesystems to allow better
	44	* management of their own physical storage synchronization mechanisms as
	45	* well as to allow such filesystems to take direct advantage of the kernel's
	46	* journaling layer so they don't have to roll their own.
	47	*
	48	* In addition, the worker thread will have access to much larger
	49	* spooling areas than the memory buffer is able to provide by e.g.
	50	* reserving swap space, in order to absorb potentially long interruptions
	51	* of off-site journaling streams, and to prevent 'slow' off-site linkages
	52	* from radically slowing down local filesystem operations.
	53	*
	54	* Because of the non-trivial algorithms the journaling system will be
	55	* required to support, use of a worker thread is mandatory. Efficiencies
	56	* are maintained by utilizing the memory FIFO to batch transactions when
	57	* possible, reducing the number of gratuitous thread switches and taking
	58	* advantage of cpu caches through the use of shorter batched code paths
	59	* rather than trying to do everything in the context of the process
	60	* originating the filesystem op. In the future the memory FIFO can be
	61	* made per-cpu to remove BGL or other locking requirements.
	62	*/
	63	#include <sys/param.h>
	64	#include <sys/systm.h>
	65	#include <sys/buf.h>
	66	#include <sys/conf.h>
	67	#include <sys/kernel.h>
	68	#include <sys/queue.h>
	69	#include <sys/lock.h>
	70	#include <sys/malloc.h>
	71	#include <sys/mount.h>
	72	#include <sys/unistd.h>
	73	#include <sys/vnode.h>
	74	#include <sys/poll.h>
	75	#include <sys/mountctl.h>
	76	#include <sys/journal.h>
	77	#include <sys/file.h>
	78	#include <sys/proc.h>
	79	#include <sys/msfbuf.h>
	80	#include <sys/socket.h>
	81	#include <sys/socketvar.h>
	82
	83	#include <machine/limits.h>
	84
	85	#include <vm/vm.h>
	86	#include <vm/vm_object.h>
	87	#include <vm/vm_page.h>
	88	#include <vm/vm_pager.h>
	89	#include <vm/vnode_pager.h>
	90
	91	#include <sys/file2.h>
	92	#include <sys/thread2.h>
	93	#include <sys/spinlock2.h>
	94
	95	static void journal_wthread(void *info);
	96	static void journal_rthread(void *info);
	97
	98	static void journal_reserve(struct journal jo,
	99	struct journal_rawrecbeg **rawpp,
	100	int16_t streamid, int bytes);
	101	static void journal_extend(struct journal jo,
	102	struct journal_rawrecbeg **rawpp,
	103	int truncbytes, int bytes, int *newstreamrecp);
	104	static void journal_abort(struct journal *jo,
	105	struct journal_rawrecbeg **rawpp);
	106	static void journal_commit(struct journal *jo,
	107	struct journal_rawrecbeg **rawpp,
	108	int bytes, int closeout);
	109
	110
	111	MALLOC_DEFINE(M_JOURNAL, "journal", "Journaling structures");
	112	MALLOC_DEFINE(M_JFIFO, "journal-fifo", "Journal FIFO");
	113
	114	void
	115	journal_create_threads(struct journal *jo)
	116	{
	117	jo->flags &= ~(MC_JOURNAL_STOP_REQ \| MC_JOURNAL_STOP_IMM);
	118	jo->flags \|= MC_JOURNAL_WACTIVE;
	119	lwkt_create(journal_wthread, jo, NULL, &jo->wthread,
	120	TDF_STOPREQ, -1, "journal w:%.*s", JIDMAX, jo->id);
	121	lwkt_setpri(&jo->wthread, TDPRI_KERN_DAEMON);
	122	lwkt_schedule(&jo->wthread);
	123
	124	if (jo->flags & MC_JOURNAL_WANT_FULLDUPLEX) {
	125	jo->flags \|= MC_JOURNAL_RACTIVE;
	126	lwkt_create(journal_rthread, jo, NULL, &jo->rthread,
	127	TDF_STOPREQ, -1, "journal r:%.*s", JIDMAX, jo->id);
	128	lwkt_setpri(&jo->rthread, TDPRI_KERN_DAEMON);
	129	lwkt_schedule(&jo->rthread);
	130	}
	131	}
	132
	133	void
	134	journal_destroy_threads(struct journal *jo, int flags)
	135	{
	136	int wcount;
	137
	138	jo->flags \|= MC_JOURNAL_STOP_REQ \| (flags & MC_JOURNAL_STOP_IMM);
	139	wakeup(&jo->fifo);
	140	wcount = 0;
	141	while (jo->flags & (MC_JOURNAL_WACTIVE \| MC_JOURNAL_RACTIVE)) {
	142	tsleep(jo, 0, "jwait", hz);
	143	if (++wcount % 10 == 0) {
	144	kprintf("Warning: journal %s waiting for descriptors to close\n",
	145	jo->id);
	146	}
	147	}
	148
	149	/*
	150	* XXX SMP - threads should move to cpu requesting the restart or
	151	* termination before finishing up to properly interlock.
	152	*/
	153	tsleep(jo, 0, "jwait", hz);
	154	lwkt_free_thread(&jo->wthread);
	155	if (jo->flags & MC_JOURNAL_WANT_FULLDUPLEX)
	156	lwkt_free_thread(&jo->rthread);
	157	}
	158
	159	/*
	160	* The per-journal worker thread is responsible for writing out the
	161	* journal's FIFO to the target stream.
	162	*/
	163	static void
	164	journal_wthread(void *info)
	165	{
	166	struct journal *jo = info;
	167	struct journal_rawrecbeg *rawp;
	168	int error;
	169	size_t avail;
	170	size_t bytes;
	171	size_t res;
	172
	173	for (;;) {
	174	/*
	175	* Calculate the number of bytes available to write. This buffer
	176	* area may contain reserved records so we can't just write it out
	177	* without further checks.
	178	*/
	179	bytes = jo->fifo.windex - jo->fifo.rindex;
	180
	181	/*
	182	* sleep if no bytes are available or if an incomplete record is
	183	* encountered (it needs to be filled in before we can write it
	184	* out), and skip any pad records that we encounter.
	185	*/
	186	if (bytes == 0) {
	187	if (jo->flags & MC_JOURNAL_STOP_REQ)
	188	break;
	189	tsleep(&jo->fifo, 0, "jfifo", hz);
	190	continue;
	191	}
	192
	193	/*
	194	* Sleep if we can not go any further due to hitting an incomplete
	195	* record. This case should occur rarely but may have to be better
	196	* optimized XXX.
	197	*/
	198	rawp = (void *)(jo->fifo.membase + (jo->fifo.rindex & jo->fifo.mask));
	199	if (rawp->begmagic == JREC_INCOMPLETEMAGIC) {
	200	tsleep(&jo->fifo, 0, "jpad", hz);
	201	continue;
	202	}
	203
	204	/*
	205	* Skip any pad records. We do not write out pad records if we can
	206	* help it.
	207	*/
	208	if (rawp->streamid == JREC_STREAMID_PAD) {
	209	if ((jo->flags & MC_JOURNAL_WANT_FULLDUPLEX) == 0) {
	210	if (jo->fifo.rindex == jo->fifo.xindex) {
	211	jo->fifo.xindex += (rawp->recsize + 15) & ~15;
	212	jo->total_acked += (rawp->recsize + 15) & ~15;
	213	}
	214	}
	215	jo->fifo.rindex += (rawp->recsize + 15) & ~15;
	216	jo->total_acked += bytes;
	217	KKASSERT(jo->fifo.windex - jo->fifo.rindex >= 0);
	218	continue;
	219	}
	220
	221	/*
	222	* 'bytes' is the amount of data that can potentially be written out.
	223	* Calculate 'res', the amount of data that can actually be written
	224	* out. res is bounded either by hitting the end of the physical
	225	* memory buffer or by hitting an incomplete record. Incomplete
	226	* records often occur due to the way the space reservation model
	227	* works.
	228	*/
	229	res = 0;
	230	avail = jo->fifo.size - (jo->fifo.rindex & jo->fifo.mask);
	231	while (res < bytes && rawp->begmagic == JREC_BEGMAGIC) {
	232	res += (rawp->recsize + 15) & ~15;
	233	if (res >= avail) {
	234	KKASSERT(res == avail);
	235	break;
	236	}
	237	rawp = (void )((char )rawp + ((rawp->recsize + 15) & ~15));
	238	}
	239
	240	/*
	241	* Issue the write and deal with any errors or other conditions.
	242	* For now assume blocking I/O. Since we are record-aware the
	243	* code cannot yet handle partial writes.
	244	*
	245	* We bump rindex prior to issuing the write to avoid racing
	246	* the acknowledgement coming back (which could prevent the ack
	247	* from bumping xindex). Restarts are always based on xindex so
	248	* we do not try to undo the rindex if an error occurs.
	249	*
	250	* XXX EWOULDBLOCK/NBIO
	251	* XXX notification on failure
	252	* XXX permanent verses temporary failures
	253	* XXX two-way acknowledgement stream in the return direction / xindex
	254	*/
	255	bytes = res;
	256	jo->fifo.rindex += bytes;
	257	error = fp_write(jo->fp,
	258	jo->fifo.membase +
	259	((jo->fifo.rindex - bytes) & jo->fifo.mask),
	260	bytes, &res, UIO_SYSSPACE);
	261	if (error) {
	262	kprintf("journal_thread(%s) write, error %d\n", jo->id, error);
	263	/* XXX */
	264	} else {
	265	KKASSERT(res == bytes);
	266	}
	267
	268	/*
	269	* Advance rindex. If the journal stream is not full duplex we also
	270	* advance xindex, otherwise the rjournal thread is responsible for
	271	* advancing xindex.
	272	*/
	273	if ((jo->flags & MC_JOURNAL_WANT_FULLDUPLEX) == 0) {
	274	jo->fifo.xindex += bytes;
	275	jo->total_acked += bytes;
	276	}
	277	KKASSERT(jo->fifo.windex - jo->fifo.rindex >= 0);
	278	if ((jo->flags & MC_JOURNAL_WANT_FULLDUPLEX) == 0) {
	279	if (jo->flags & MC_JOURNAL_WWAIT) {
	280	jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */
	281	wakeup(&jo->fifo.windex);
	282	}
	283	}
	284	}
	285	fp_shutdown(jo->fp, SHUT_WR);
	286	jo->flags &= ~MC_JOURNAL_WACTIVE;
	287	wakeup(jo);
	288	wakeup(&jo->fifo.windex);
	289	}
	290
	291	/*
	292	* A second per-journal worker thread is created for two-way journaling
	293	* streams to deal with the return acknowledgement stream.
	294	*/
	295	static void
	296	journal_rthread(void *info)
	297	{
	298	struct journal_rawrecbeg *rawp;
	299	struct journal_ackrecord ack;
	300	struct journal *jo = info;
	301	int64_t transid;
	302	int error;
	303	size_t count;
	304	size_t bytes;
	305
	306	transid = 0;
	307	error = 0;
	308
	309	for (;;) {
	310	/*
	311	* We have been asked to stop
	312	*/
	313	if (jo->flags & MC_JOURNAL_STOP_REQ)
	314	break;
	315
	316	/*
	317	* If we have no active transaction id, get one from the return
	318	* stream.
	319	*/
	320	if (transid == 0) {
	321	error = fp_read(jo->fp, &ack, sizeof(ack), &count,
	322	1, UIO_SYSSPACE);
	323	#if 0
	324	kprintf("fp_read ack error %d count %d\n", error, count);
	325	#endif
	326	if (error \|\| count != sizeof(ack))
	327	break;
	328	if (error) {
	329	kprintf("read error %d on receive stream\n", error);
	330	break;
	331	}
	332	if (ack.rbeg.begmagic != JREC_BEGMAGIC \|\|
	333	ack.rend.endmagic != JREC_ENDMAGIC
	334	) {
	335	kprintf("bad begmagic or endmagic on receive stream\n");
	336	break;
	337	}
	338	transid = ack.rbeg.transid;
	339	}
	340
	341	/*
	342	* Calculate the number of unacknowledged bytes. If there are no
	343	* unacknowledged bytes then unsent data was acknowledged, report,
	344	* sleep a bit, and loop in that case. This should not happen
	345	* normally. The ack record is thrown away.
	346	*/
	347	bytes = jo->fifo.rindex - jo->fifo.xindex;
	348
	349	if (bytes == 0) {
	350	kprintf("warning: unsent data acknowledged transid %08llx\n",
	351	(long long)transid);
	352	tsleep(&jo->fifo.xindex, 0, "jrseq", hz);
	353	transid = 0;
	354	continue;
	355	}
	356
	357	/*
	358	* Since rindex has advanced, the record pointed to by xindex
	359	* must be a valid record.
	360	*/
	361	rawp = (void *)(jo->fifo.membase + (jo->fifo.xindex & jo->fifo.mask));
	362	KKASSERT(rawp->begmagic == JREC_BEGMAGIC);
	363	KKASSERT(rawp->recsize <= bytes);
	364
	365	/*
	366	* The target can acknowledge several records at once.
	367	*/
	368	if (rawp->transid < transid) {
	369	#if 1
	370	kprintf("ackskip %08llx/%08llx\n",
	371	(long long)rawp->transid,
	372	(long long)transid);
	373	#endif
	374	jo->fifo.xindex += (rawp->recsize + 15) & ~15;
	375	jo->total_acked += (rawp->recsize + 15) & ~15;
	376	if (jo->flags & MC_JOURNAL_WWAIT) {
	377	jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */
	378	wakeup(&jo->fifo.windex);
	379	}
	380	continue;
	381	}
	382	if (rawp->transid == transid) {
	383	#if 1
	384	kprintf("ackskip %08llx/%08llx\n",
	385	(long long)rawp->transid,
	386	(long long)transid);
	387	#endif
	388	jo->fifo.xindex += (rawp->recsize + 15) & ~15;
	389	jo->total_acked += (rawp->recsize + 15) & ~15;
	390	if (jo->flags & MC_JOURNAL_WWAIT) {
	391	jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */
	392	wakeup(&jo->fifo.windex);
	393	}
	394	transid = 0;
	395	continue;
	396	}
	397	kprintf("warning: unsent data(2) acknowledged transid %08llx\n",
	398	(long long)transid);
	399	transid = 0;
	400	}
	401	jo->flags &= ~MC_JOURNAL_RACTIVE;
	402	wakeup(jo);
	403	wakeup(&jo->fifo.windex);
	404	}
	405
	406	/*
	407	* This builds a pad record which the journaling thread will skip over. Pad
	408	* records are required when we are unable to reserve sufficient stream space
	409	* due to insufficient space at the end of the physical memory fifo.
	410	*
	411	* Even though the record is not transmitted, a normal transid must be
	412	* assigned to it so link recovery operations after a failure work properly.
	413	*/
	414	static
	415	void
	416	journal_build_pad(struct journal_rawrecbeg *rawp, int recsize, int64_t transid)
	417	{
	418	struct journal_rawrecend *rendp;
	419
	420	KKASSERT((recsize & 15) == 0 && recsize >= 16);
	421
	422	rawp->streamid = JREC_STREAMID_PAD;
	423	rawp->recsize = recsize; /* must be 16-byte aligned */
	424	rawp->transid = transid;
	425	/*
	426	* WARNING, rendp may overlap rawp->transid. This is necessary to
	427	* allow PAD records to fit in 16 bytes. Use cpu_ccfence() to
	428	* hopefully cause the compiler to not make any assumptions.
	429	*/
	430	rendp = (void )((char )rawp + rawp->recsize - sizeof(*rendp));
	431	rendp->endmagic = JREC_ENDMAGIC;
	432	rendp->check = 0;
	433	rendp->recsize = rawp->recsize;
	434
	435	/*
	436	* Set the begin magic last. This is what will allow the journal
	437	* thread to write the record out. Use a store fence to prevent
	438	* compiler and cpu reordering of the writes.
	439	*/
	440	cpu_sfence();
	441	rawp->begmagic = JREC_BEGMAGIC;
	442	}
	443
	444	/*
	445	* Wake up the worker thread if the FIFO is more then half full or if
	446	* someone is waiting for space to be freed up. Otherwise let the
	447	* heartbeat deal with it. Being able to avoid waking up the worker
	448	* is the key to the journal's cpu performance.
	449	*/
	450	static __inline
	451	void
	452	journal_commit_wakeup(struct journal *jo)
	453	{
	454	int avail;
	455
	456	avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex);
	457	KKASSERT(avail >= 0);
	458	if ((avail < (jo->fifo.size >> 1)) \|\| (jo->flags & MC_JOURNAL_WWAIT))
	459	wakeup(&jo->fifo);
	460	}
	461
	462	/*
	463	* Create a new BEGIN stream record with the specified streamid and the
	464	* specified amount of payload space. *rawpp will be set to point to the
	465	* base of the new stream record and a pointer to the base of the payload
	466	* space will be returned. *rawpp does not need to be pre-NULLd prior to
	467	* making this call. The raw record header will be partially initialized.
	468	*
	469	* A stream can be extended, aborted, or committed by other API calls
	470	* below. This may result in a sequence of potentially disconnected
	471	* stream records to be output to the journaling target. The first record
	472	* (the one created by this function) will be marked JREC_STREAMCTL_BEGIN,
	473	* while the last record on commit or abort will be marked JREC_STREAMCTL_END
	474	* (and possibly also JREC_STREAMCTL_ABORTED). The last record could wind
	475	* up being the same as the first, in which case the bits are all set in
	476	* the first record.
	477	*
	478	* The stream record is created in an incomplete state by setting the begin
	479	* magic to JREC_INCOMPLETEMAGIC. This prevents the worker thread from
	480	* flushing the fifo past our record until we have finished populating it.
	481	* Other threads can reserve and operate on their own space without stalling
	482	* but the stream output will stall until we have completed operations. The
	483	* memory FIFO is intended to be large enough to absorb such situations
	484	* without stalling out other threads.
	485	*/
	486	static
	487	void *
	488	journal_reserve(struct journal jo, struct journal_rawrecbeg *rawpp,
	489	int16_t streamid, int bytes)
	490	{
	491	struct journal_rawrecbeg *rawp;
	492	int avail;
	493	int availtoend;
	494	int req;
	495
	496	/*
	497	* Add header and trailer overheads to the passed payload. Note that
	498	* the passed payload size need not be aligned in any way.
	499	*/
	500	bytes += sizeof(struct journal_rawrecbeg);
	501	bytes += sizeof(struct journal_rawrecend);
	502
	503	for (;;) {
	504	/*
	505	* First, check boundary conditions. If the request would wrap around
	506	* we have to skip past the ending block and return to the beginning
	507	* of the FIFO's buffer. Calculate 'req' which is the actual number
	508	* of bytes being reserved, including wrap-around dead space.
	509	*
	510	* Neither 'bytes' or 'req' are aligned.
	511	*
	512	* Note that availtoend is not truncated to avail and so cannot be
	513	* used to determine whether the reservation is possible by itself.
	514	* Also, since all fifo ops are 16-byte aligned, we can check
	515	* the size before calculating the aligned size.
	516	*/
	517	availtoend = jo->fifo.size - (jo->fifo.windex & jo->fifo.mask);
	518	KKASSERT((availtoend & 15) == 0);
	519	if (bytes > availtoend)
	520	req = bytes + availtoend; /* add pad to end */
	521	else
	522	req = bytes;
	523
	524	/*
	525	* Next calculate the total available space and see if it is
	526	* sufficient. We cannot overwrite previously buffered data
	527	* past xindex because otherwise we would not be able to restart
	528	* a broken link at the target's last point of commit.
	529	*/
	530	avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex);
	531	KKASSERT(avail >= 0 && (avail & 15) == 0);
	532
	533	if (avail < req) {
	534	/* XXX MC_JOURNAL_STOP_IMM */
	535	jo->flags \|= MC_JOURNAL_WWAIT;
	536	++jo->fifostalls;
	537	tsleep(&jo->fifo.windex, 0, "jwrite", 0);
	538	continue;
	539	}
	540
	541	/*
	542	* Create a pad record for any dead space and create an incomplete
	543	* record for the live space, then return a pointer to the
	544	* contiguous buffer space that was requested.
	545	*
	546	* NOTE: The worker thread will not flush past an incomplete
	547	* record, so the reserved space can be filled in at-will. The
	548	* journaling code must also be aware the reserved sections occuring
	549	* after this one will also not be written out even if completed
	550	* until this one is completed.
	551	*
	552	* The transaction id must accomodate real and potential pad creation.
	553	*/
	554	rawp = (void *)(jo->fifo.membase + (jo->fifo.windex & jo->fifo.mask));
	555	if (req != bytes) {
	556	journal_build_pad(rawp, availtoend, jo->transid);
	557	++jo->transid;
	558	rawp = (void *)jo->fifo.membase;
	559	}
	560	rawp->begmagic = JREC_INCOMPLETEMAGIC; /* updated by abort/commit */
	561	rawp->recsize = bytes; /* (unaligned size) */
	562	rawp->streamid = streamid \| JREC_STREAMCTL_BEGIN;
	563	rawp->transid = jo->transid;
	564	jo->transid += 2;
	565
	566	/*
	567	* Issue a memory barrier to guarentee that the record data has been
	568	* properly initialized before we advance the write index and return
	569	* a pointer to the reserved record. Otherwise the worker thread
	570	* could accidently run past us.
	571	*
	572	* Note that stream records are always 16-byte aligned.
	573	*/
	574	cpu_sfence();
	575	jo->fifo.windex += (req + 15) & ~15;
	576	*rawpp = rawp;
	577	return(rawp + 1);
	578	}
	579	/* not reached */
	580	*rawpp = NULL;
	581	return(NULL);
	582	}
	583
	584	/*
	585	* Attempt to extend the stream record by <bytes> worth of payload space.
	586	*
	587	* If it is possible to extend the existing stream record no truncation
	588	* occurs and the record is extended as specified. A pointer to the
	589	* truncation offset within the payload space is returned.
	590	*
	591	* If it is not possible to do this the existing stream record is truncated
	592	* and committed, and a new stream record of size <bytes> is created. A
	593	* pointer to the base of the new stream record's payload space is returned.
	594	*
	595	* *rawpp is set to the new reservation in the case of a new record but
	596	* the caller cannot depend on a comparison with the old rawp to determine if
	597	* this case occurs because we could end up using the same memory FIFO
	598	* offset for the new stream record. Use *newstreamrecp instead.
	599	*/
	600	static void *
	601	journal_extend(struct journal jo, struct journal_rawrecbeg *rawpp,
	602	int truncbytes, int bytes, int *newstreamrecp)
	603	{
	604	struct journal_rawrecbeg *rawp;
	605	int16_t streamid;
	606	int availtoend;
	607	int avail;
	608	int osize;
	609	int nsize;
	610	int wbase;
	611	void *rptr;
	612
	613	*newstreamrecp = 0;
	614	rawp = *rawpp;
	615	osize = (rawp->recsize + 15) & ~15;
	616	nsize = (rawp->recsize + bytes + 15) & ~15;
	617	wbase = (char *)rawp - jo->fifo.membase;
	618
	619	/*
	620	* If the aligned record size does not change we can trivially adjust
	621	* the record size.
	622	*/
	623	if (nsize == osize) {
	624	rawp->recsize += bytes;
	625	return((char *)(rawp + 1) + truncbytes);
	626	}
	627
	628	/*
	629	* If the fifo's write index hasn't been modified since we made the
	630	* reservation and we do not hit any boundary conditions, we can
	631	* trivially make the record smaller or larger.
	632	*/
	633	if ((jo->fifo.windex & jo->fifo.mask) == wbase + osize) {
	634	availtoend = jo->fifo.size - wbase;
	635	avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex) + osize;
	636	KKASSERT((availtoend & 15) == 0);
	637	KKASSERT((avail & 15) == 0);
	638	if (nsize <= avail && nsize <= availtoend) {
	639	jo->fifo.windex += nsize - osize;
	640	rawp->recsize += bytes;
	641	return((char *)(rawp + 1) + truncbytes);
	642	}
	643	}
	644
	645	/*
	646	* It was not possible to extend the buffer. Commit the current
	647	* buffer and create a new one. We manually clear the BEGIN mark that
	648	* journal_reserve() creates (because this is a continuing record, not
	649	* the start of a new stream).
	650	*/
	651	streamid = rawp->streamid & JREC_STREAMID_MASK;
	652	journal_commit(jo, rawpp, truncbytes, 0);
	653	rptr = journal_reserve(jo, rawpp, streamid, bytes);
	654	rawp = *rawpp;
	655	rawp->streamid &= ~JREC_STREAMCTL_BEGIN;
	656	*newstreamrecp = 1;
	657	return(rptr);
	658	}
	659
	660	/*
	661	* Abort a journal record. If the transaction record represents a stream
	662	* BEGIN and we can reverse the fifo's write index we can simply reverse
	663	* index the entire record, as if it were never reserved in the first place.
	664	*
	665	* Otherwise we set the JREC_STREAMCTL_ABORTED bit and commit the record
	666	* with the payload truncated to 0 bytes.
	667	*/
	668	static void
	669	journal_abort(struct journal jo, struct journal_rawrecbeg *rawpp)
	670	{
	671	struct journal_rawrecbeg *rawp;
	672	int osize;
	673
	674	rawp = *rawpp;
	675	osize = (rawp->recsize + 15) & ~15;
	676
	677	if ((rawp->streamid & JREC_STREAMCTL_BEGIN) &&
	678	(jo->fifo.windex & jo->fifo.mask) ==
	679	(char *)rawp - jo->fifo.membase + osize)
	680	{
	681	jo->fifo.windex -= osize;
	682	*rawpp = NULL;
	683	} else {
	684	rawp->streamid \|= JREC_STREAMCTL_ABORTED;
	685	journal_commit(jo, rawpp, 0, 1);
	686	}
	687	}
	688
	689	/*
	690	* Commit a journal record and potentially truncate it to the specified
	691	* number of payload bytes. If you do not want to truncate the record,
	692	* simply pass -1 for the bytes parameter. Do not pass rawp->recsize, that
	693	* field includes header and trailer and will not be correct. Note that
	694	* passing 0 will truncate the entire data payload of the record.
	695	*
	696	* The logical stream is terminated by this function.
	697	*
	698	* If truncation occurs, and it is not possible to physically optimize the
	699	* memory FIFO due to other threads having reserved space after ours,
	700	* the remaining reserved space will be covered by a pad record.
	701	*/
	702	static void
	703	journal_commit(struct journal jo, struct journal_rawrecbeg *rawpp,
	704	int bytes, int closeout)
	705	{
	706	struct journal_rawrecbeg *rawp;
	707	struct journal_rawrecend *rendp;
	708	int osize;
	709	int nsize;
	710
	711	rawp = *rawpp;
	712	*rawpp = NULL;
	713
	714	KKASSERT((char *)rawp >= jo->fifo.membase &&
	715	(char *)rawp + rawp->recsize <= jo->fifo.membase + jo->fifo.size);
	716	KKASSERT(((intptr_t)rawp & 15) == 0);
	717
	718	/*
	719	* Truncate the record if necessary. If the FIFO write index as still
	720	* at the end of our record we can optimally backindex it. Otherwise
	721	* we have to insert a pad record to cover the dead space.
	722	*
	723	* We calculate osize which is the 16-byte-aligned original recsize.
	724	* We calculate nsize which is the 16-byte-aligned new recsize.
	725	*
	726	* Due to alignment issues or in case the passed truncation bytes is
	727	* the same as the original payload, nsize may be equal to osize even
	728	* if the committed bytes is less then the originally reserved bytes.
	729	*/
	730	if (bytes >= 0) {
	731	KKASSERT(bytes >= 0 && bytes <= rawp->recsize - sizeof(struct journal_rawrecbeg) - sizeof(struct journal_rawrecend));
	732	osize = (rawp->recsize + 15) & ~15;
	733	rawp->recsize = bytes + sizeof(struct journal_rawrecbeg) +
	734	sizeof(struct journal_rawrecend);
	735	nsize = (rawp->recsize + 15) & ~15;
	736	KKASSERT(nsize <= osize);
	737	if (osize == nsize) {
	738	/* do nothing */
	739	} else if ((jo->fifo.windex & jo->fifo.mask) == (char *)rawp - jo->fifo.membase + osize) {
	740	/* we are able to backindex the fifo */
	741	jo->fifo.windex -= osize - nsize;
	742	} else {
	743	/* we cannot backindex the fifo, emplace a pad in the dead space */
	744	journal_build_pad((void )((char )rawp + nsize), osize - nsize,
	745	rawp->transid + 1);
	746	}
	747	}
	748
	749	/*
	750	* Fill in the trailer. Note that unlike pad records, the trailer will
	751	* never overlap the header.
	752	*/
	753	rendp = (void )((char )rawp +
	754	((rawp->recsize + 15) & ~15) - sizeof(*rendp));
	755	rendp->endmagic = JREC_ENDMAGIC;
	756	rendp->recsize = rawp->recsize;
	757	rendp->check = 0; /* XXX check word, disabled for now */
	758
	759	/*
	760	* Fill in begmagic last. This will allow the worker thread to proceed.
	761	* Use a memory barrier to guarentee write ordering. Mark the stream
	762	* as terminated if closeout is set. This is the typical case.
	763	*/
	764	if (closeout)
	765	rawp->streamid \|= JREC_STREAMCTL_END;
	766	cpu_sfence(); /* memory and compiler barrier */
	767	rawp->begmagic = JREC_BEGMAGIC;
	768
	769	journal_commit_wakeup(jo);
	770	}
	771
	772	/************************************************************************
	773	* TRANSACTION SUPPORT ROUTINES *
	774	************************************************************************
	775	*
	776	* JRECORD_*() - routines to create subrecord transactions and embed them
	777	* in the logical streams managed by the journal_*() routines.
	778	*/
	779
	780	/*
	781	* Initialize the passed jrecord structure and start a new stream transaction
	782	* by reserving an initial build space in the journal's memory FIFO.
	783	*/
	784	void
	785	jrecord_init(struct journal jo, struct jrecord jrec, int16_t streamid)
	786	{
	787	bzero(jrec, sizeof(*jrec));
	788	jrec->jo = jo;
	789	jrec->streamid = streamid;
	790	jrec->stream_residual = JREC_DEFAULTSIZE;
	791	jrec->stream_reserved = jrec->stream_residual;
	792	jrec->stream_ptr =
	793	journal_reserve(jo, &jrec->rawp, streamid, jrec->stream_reserved);
	794	}
	795
	796	/*
	797	* Push a recursive record type. All pushes should have matching pops.
	798	* The old parent is returned and the newly pushed record becomes the
	799	* new parent. Note that the old parent's pointer may already be invalid
	800	* or may become invalid if jrecord_write() had to build a new stream
	801	* record, so the caller should not mess with the returned pointer in
	802	* any way other then to save it.
	803	*/
	804	struct journal_subrecord *
	805	jrecord_push(struct jrecord *jrec, int16_t rectype)
	806	{
	807	struct journal_subrecord *save;
	808
	809	save = jrec->parent;
	810	jrec->parent = jrecord_write(jrec, rectype\|JMASK_NESTED, 0);
	811	jrec->last = NULL;
	812	KKASSERT(jrec->parent != NULL);
	813	++jrec->pushcount;
	814	++jrec->pushptrgood; /* cleared on flush */
	815	return(save);
	816	}
	817
	818	/*
	819	* Pop a previously pushed sub-transaction. We must set JMASK_LAST
	820	* on the last record written within the subtransaction. If the last
	821	* record written is not accessible or if the subtransaction is empty,
	822	* we must write out a pad record with JMASK_LAST set before popping.
	823	*
	824	* When popping a subtransaction the parent record's recsize field
	825	* will be properly set. If the parent pointer is no longer valid
	826	* (which can occur if the data has already been flushed out to the
	827	* stream), the protocol spec allows us to leave it 0.
	828	*
	829	* The saved parent pointer which we restore may or may not be valid,
	830	* and if not valid may or may not be NULL, depending on the value
	831	* of pushptrgood.
	832	*/
	833	void
	834	jrecord_pop(struct jrecord jrec, struct journal_subrecord save)
	835	{
	836	struct journal_subrecord *last;
	837
	838	KKASSERT(jrec->pushcount > 0);
	839	KKASSERT(jrec->residual == 0);
	840
	841	/*
	842	* Set JMASK_LAST on the last record we wrote at the current
	843	* level. If last is NULL we either no longer have access to the
	844	* record or the subtransaction was empty and we must write out a pad
	845	* record.
	846	*/
	847	if ((last = jrec->last) == NULL) {
	848	jrecord_write(jrec, JLEAF_PAD\|JMASK_LAST, 0);
	849	last = jrec->last; /* reload after possible flush */
	850	} else {
	851	last->rectype \|= JMASK_LAST;
	852	}
	853
	854	/*
	855	* pushptrgood tells us how many levels of parent record pointers
	856	* are valid. The jrec only stores the current parent record pointer
	857	* (and it is only valid if pushptrgood != 0). The higher level parent
	858	* record pointers are saved by the routines calling jrecord_push() and
	859	* jrecord_pop(). These pointers may become stale and we determine
	860	* that fact by tracking the count of valid parent pointers with
	861	* pushptrgood. Pointers become invalid when their related stream
	862	* record gets pushed out.
	863	*
	864	* If no pointer is available (the data has already been pushed out),
	865	* then no fixup of e.g. the length field is possible for non-leaf
	866	* nodes. The protocol allows for this situation by placing a larger
	867	* burden on the program scanning the stream on the other end.
	868	*
	869	* [parentA]
	870	* [node X]
	871	* [parentB]
	872	* [node Y]
	873	* [node Z]
	874	* (pop B) see NOTE B
	875	* (pop A) see NOTE A
	876	*
	877	* NOTE B: This pop sets LAST in node Z if the node is still accessible,
	878	* else a PAD record is appended and LAST is set in that.
	879	*
	880	* This pop sets the record size in parentB if parentB is still
	881	* accessible, else the record size is left 0 (the scanner must
	882	* deal with that).
	883	*
	884	* This pop sets the new 'last' record to parentB, the pointer
	885	* to which may or may not still be accessible.
	886	*
	887	* NOTE A: This pop sets LAST in parentB if the node is still accessible,
	888	* else a PAD record is appended and LAST is set in that.
	889	*
	890	* This pop sets the record size in parentA if parentA is still
	891	* accessible, else the record size is left 0 (the scanner must
	892	* deal with that).
	893	*
	894	* This pop sets the new 'last' record to parentA, the pointer
	895	* to which may or may not still be accessible.
	896	*
	897	* Also note that the last record in the stream transaction, which in
	898	* the above example is parentA, does not currently have the LAST bit
	899	* set.
	900	*
	901	* The current parent becomes the last record relative to the
	902	* saved parent passed into us. It's validity is based on
	903	* whether pushptrgood is non-zero prior to decrementing. The saved
	904	* parent becomes the new parent, and its validity is based on whether
	905	* pushptrgood is non-zero after decrementing.
	906	*
	907	* The old jrec->parent may be NULL if it is no longer accessible.
	908	* If pushptrgood is non-zero, however, it is guarenteed to not
	909	* be NULL (since no flush occured).
	910	*/
	911	jrec->last = jrec->parent;
	912	--jrec->pushcount;
	913	if (jrec->pushptrgood) {
	914	KKASSERT(jrec->last != NULL && last != NULL);
	915	if (--jrec->pushptrgood == 0) {
	916	jrec->parent = NULL; /* 'save' contains garbage or NULL */
	917	} else {
	918	KKASSERT(save != NULL);
	919	jrec->parent = save; /* 'save' must not be NULL */
	920	}
	921
	922	/*
	923	* Set the record size in the old parent. 'last' still points to
	924	* the original last record in the subtransaction being popped,
	925	* jrec->last points to the old parent (which became the last
	926	* record relative to the new parent being popped into).
	927	*/
	928	jrec->last->recsize = (char )last + last->recsize - (char )jrec->last;
	929	} else {
	930	jrec->parent = NULL;
	931	KKASSERT(jrec->last == NULL);
	932	}
	933	}
	934
	/*
	 * Write out a leaf record, including associated data.
	 *
	 * This is a convenience wrapper: it emits the subrecord header with
	 * jrecord_write() and then posts exactly 'bytes' bytes from 'ptr' with
	 * a single jrecord_data() call.
	 */
	void
	jrecord_leaf(struct jrecord *jrec, int16_t rectype, void *ptr, int bytes)
	{
	    jrecord_write(jrec, rectype, bytes);
	    jrecord_data(jrec, ptr, bytes);
	}
	944
	945	/*
	946	* Write a leaf record out and return a pointer to its base. The leaf
	947	* record may contain potentially megabytes of data which is supplied
	948	* in jrecord_data() calls. The exact amount must be specified in this
	949	* call.
	950	*
	951	* THE RETURNED SUBRECORD POINTER IS ONLY VALID IMMEDIATELY AFTER THE
	952	* CALL AND MAY BECOME INVALID AT ANY TIME. ONLY THE PUSH/POP CODE SHOULD
	953	* USE THE RETURN VALUE.
	954	*/
	955	struct journal_subrecord *
	956	jrecord_write(struct jrecord *jrec, int16_t rectype, int bytes)
	957	{
	958	struct journal_subrecord *last;
	959	int pusheditout;
	960
	961	/*
	962	* Try to catch some obvious errors. Nesting records must specify a
	963	* size of 0, and there should be no left-overs from previous operations
	964	* (such as incomplete data writeouts).
	965	*/
	966	KKASSERT(bytes == 0 \|\| (rectype & JMASK_NESTED) == 0);
	967	KKASSERT(jrec->residual == 0);
	968
	969	/*
	970	* Check to see if the current stream record has enough room for
	971	* the new subrecord header. If it doesn't we extend the current
	972	* stream record.
	973	*
	974	* This may have the side effect of pushing out the current stream record
	975	* and creating a new one. We must adjust our stream tracking fields
	976	* accordingly.
	977	*/
	978	if (jrec->stream_residual < sizeof(struct journal_subrecord)) {
	979	jrec->stream_ptr = journal_extend(jrec->jo, &jrec->rawp,
	980	jrec->stream_reserved - jrec->stream_residual,
	981	JREC_DEFAULTSIZE, &pusheditout);
	982	if (pusheditout) {
	983	/*
	984	* If a pushout occured, the pushed out stream record was
	985	* truncated as specified and the new record is exactly the
	986	* extension size specified.
	987	*/
	988	jrec->stream_reserved = JREC_DEFAULTSIZE;
	989	jrec->stream_residual = JREC_DEFAULTSIZE;
	990	jrec->parent = NULL; /* no longer accessible */
	991	jrec->pushptrgood = 0; /* restored parents in pops no good */
	992	} else {
	993	/*
	994	* If no pushout occured the stream record is NOT truncated and
	995	* IS extended.
	996	*/
	997	jrec->stream_reserved += JREC_DEFAULTSIZE;
	998	jrec->stream_residual += JREC_DEFAULTSIZE;
	999	}
	1000	}
	1001	last = (void *)jrec->stream_ptr;
	1002	last->rectype = rectype;
	1003	last->reserved = 0;
	1004
	1005	/*
	1006	* We may not know the record size for recursive records and the
	1007	* header may become unavailable due to limited FIFO space. Write
	1008	* -1 to indicate this special case.
	1009	*/
	1010	if ((rectype & JMASK_NESTED) && bytes == 0)
	1011	last->recsize = -1;
	1012	else
	1013	last->recsize = sizeof(struct journal_subrecord) + bytes;
	1014	jrec->last = last;
	1015	jrec->residual = bytes; /* remaining data to be posted */
	1016	jrec->residual_align = -bytes & 7; /* post-data alignment required */
	1017	jrec->stream_ptr += sizeof(last); / current write pointer */
	1018	jrec->stream_residual -= sizeof(last); / space remaining in stream */
	1019	return(last);
	1020	}
	1021
	1022	/*
	1023	* Write out the data associated with a leaf record. Any number of calls
	1024	* to this routine may be made as long as the byte count adds up to the
	1025	* amount originally specified in jrecord_write().
	1026	*
	1027	* The act of writing out the leaf data may result in numerous stream records
	1028	* being pushed out. Callers should be aware that even the associated
	1029	* subrecord header may become inaccessible due to stream record pushouts.
	1030	*/
	1031	void
	1032	jrecord_data(struct jrecord jrec, const void buf, int bytes)
	1033	{
	1034	int pusheditout;
	1035	int extsize;
	1036
	1037	KKASSERT(bytes >= 0 && bytes <= jrec->residual);
	1038
	1039	/*
	1040	* Push out stream records as long as there is insufficient room to hold
	1041	* the remaining data.
	1042	*/
	1043	while (jrec->stream_residual < bytes) {
	1044	/*
	1045	* Fill in any remaining space in the current stream record.
	1046	*/
	1047	bcopy(buf, jrec->stream_ptr, jrec->stream_residual);
	1048	buf = (const char *)buf + jrec->stream_residual;
	1049	bytes -= jrec->stream_residual;
	1050	/jrec->stream_ptr += jrec->stream_residual;/
	1051	jrec->residual -= jrec->stream_residual;
	1052	jrec->stream_residual = 0;
	1053
	1054	/*
	1055	* Try to extend the current stream record, but no more then 1/4
	1056	* the size of the FIFO.
	1057	*/
	1058	extsize = jrec->jo->fifo.size >> 2;
	1059	if (extsize > bytes)
	1060	extsize = (bytes + 15) & ~15;
	1061
	1062	jrec->stream_ptr = journal_extend(jrec->jo, &jrec->rawp,
	1063	jrec->stream_reserved - jrec->stream_residual,
	1064	extsize, &pusheditout);
	1065	if (pusheditout) {
	1066	jrec->stream_reserved = extsize;
	1067	jrec->stream_residual = extsize;
	1068	jrec->parent = NULL; /* no longer accessible */
	1069	jrec->last = NULL; /* no longer accessible */
	1070	jrec->pushptrgood = 0; /* restored parents in pops no good */
	1071	} else {
	1072	jrec->stream_reserved += extsize;
	1073	jrec->stream_residual += extsize;
	1074	}
	1075	}
	1076
	1077	/*
	1078	* Push out any remaining bytes into the current stream record.
	1079	*/
	1080	if (bytes) {
	1081	bcopy(buf, jrec->stream_ptr, bytes);
	1082	jrec->stream_ptr += bytes;
	1083	jrec->stream_residual -= bytes;
	1084	jrec->residual -= bytes;
	1085	}
	1086
	1087	/*
	1088	* Handle data alignment requirements for the subrecord. Because the
	1089	* stream record's data space is more strictly aligned, it must already
	1090	* have sufficient space to hold any subrecord alignment slop.
	1091	*/
	1092	if (jrec->residual == 0 && jrec->residual_align) {
	1093	KKASSERT(jrec->residual_align <= jrec->stream_residual);
	1094	bzero(jrec->stream_ptr, jrec->residual_align);
	1095	jrec->stream_ptr += jrec->residual_align;
	1096	jrec->stream_residual -= jrec->residual_align;
	1097	jrec->residual_align = 0;
	1098	}
	1099	}
	1100
	1101	/*
	1102	* We are finished with the transaction. This closes the transaction created
	1103	* by jrecord_init().
	1104	*
	1105	* NOTE: If abortit is not set then we must be at the top level with no
	1106	* residual subrecord data left to output.
	1107	*
	1108	* If abortit is set then we can be in any state, all pushes will be
	1109	* popped and it is ok for there to be residual data. This works
	1110	* because the virtual stream itself is truncated. Scanners must deal
	1111	* with this situation.
	1112	*
	1113	* The stream record will be committed or aborted as specified and jrecord
	1114	* resources will be cleaned up.
	1115	*/
	1116	void
	1117	jrecord_done(struct jrecord *jrec, int abortit)
	1118	{
	1119	KKASSERT(jrec->rawp != NULL);
	1120
	1121	if (abortit) {
	1122	journal_abort(jrec->jo, &jrec->rawp);
	1123	} else {
	1124	KKASSERT(jrec->pushcount == 0 && jrec->residual == 0);
	1125	journal_commit(jrec->jo, &jrec->rawp,
	1126	jrec->stream_reserved - jrec->stream_residual, 1);
	1127	}
	1128
	1129	/*
	1130	* jrec should not be used beyond this point without another init,
	1131	* but clean up some fields to ensure that we panic if it is.
	1132	*
	1133	* Note that jrec->rawp is NULLd out by journal_abort/journal_commit.
	1134	*/
	1135	jrec->jo = NULL;
	1136	jrec->stream_ptr = NULL;
	1137	}
	1138
	1139	/************************************************************************
	1140	* LOW LEVEL RECORD SUPPORT ROUTINES *
	1141	************************************************************************
	1142	*
	1143	* These routines create low level recursive and leaf subrecords representing
	1144	* common filesystem structures.
	1145	*/
	1146
	1147	/*
	1148	* Write out a filename path relative to the base of the mount point.
	1149	* rectype is typically JLEAF_PATH{1,2,3,4}.
	1150	*/
	1151	void
	1152	jrecord_write_path(struct jrecord jrec, int16_t rectype, struct namecache ncp)
	1153	{
	1154	char buf[64]; /* local buffer if it fits, else malloced */
	1155	char *base;
	1156	int pathlen;
	1157	int index;
	1158	struct namecache *scan;
	1159
	1160	/*
	1161	* Pass 1 - figure out the number of bytes required. Include terminating
	1162	* \0 on last element and '/' separator on other elements.
	1163	*
	1164	* The namecache topology terminates at the root of the filesystem
	1165	* (the normal lookup code would then continue by using the mount
	1166	* structure to figure out what it was mounted on).
	1167	*/
	1168	again:
	1169	pathlen = 0;
	1170	for (scan = ncp; scan; scan = scan->nc_parent) {
	1171	if (scan->nc_nlen > 0)
	1172	pathlen += scan->nc_nlen + 1;
	1173	}
	1174
	1175	if (pathlen <= sizeof(buf))
	1176	base = buf;
	1177	else
	1178	base = kmalloc(pathlen, M_TEMP, M_INTWAIT);
	1179
	1180	/*
	1181	* Pass 2 - generate the path buffer
	1182	*/
	1183	index = pathlen;
	1184	for (scan = ncp; scan; scan = scan->nc_parent) {
	1185	if (scan->nc_nlen == 0)
	1186	continue;
	1187	if (scan->nc_nlen >= index) {
	1188	if (base != buf)
	1189	kfree(base, M_TEMP);
	1190	goto again;
	1191	}
	1192	if (index == pathlen)
	1193	base[--index] = 0;
	1194	else
	1195	base[--index] = '/';
	1196	index -= scan->nc_nlen;
	1197	bcopy(scan->nc_name, base + index, scan->nc_nlen);
	1198	}
	1199	jrecord_leaf(jrec, rectype, base + index, pathlen - index);
	1200	if (base != buf)
	1201	kfree(base, M_TEMP);
	1202	}
	1203
	1204	/*
	1205	* Write out a file attribute structure. While somewhat inefficient, using
	1206	* a recursive data structure is the most portable and extensible way.
	1207	*/
	1208	void
	1209	jrecord_write_vattr(struct jrecord jrec, struct vattr vat)
	1210	{
	1211	void *save;
	1212
	1213	save = jrecord_push(jrec, JTYPE_VATTR);
	1214	if (vat->va_type != VNON)
	1215	jrecord_leaf(jrec, JLEAF_VTYPE, &vat->va_type, sizeof(vat->va_type));
	1216	if (vat->va_mode != (mode_t)VNOVAL)
	1217	jrecord_leaf(jrec, JLEAF_MODES, &vat->va_mode, sizeof(vat->va_mode));
	1218	if (vat->va_nlink != VNOVAL)
	1219	jrecord_leaf(jrec, JLEAF_NLINK, &vat->va_nlink, sizeof(vat->va_nlink));
	1220	if (vat->va_uid != VNOVAL)
	1221	jrecord_leaf(jrec, JLEAF_UID, &vat->va_uid, sizeof(vat->va_uid));
	1222	if (vat->va_gid != VNOVAL)
	1223	jrecord_leaf(jrec, JLEAF_GID, &vat->va_gid, sizeof(vat->va_gid));
	1224	if (vat->va_fsid != VNOVAL)
	1225	jrecord_leaf(jrec, JLEAF_FSID, &vat->va_fsid, sizeof(vat->va_fsid));
	1226	if (vat->va_fileid != VNOVAL)
	1227	jrecord_leaf(jrec, JLEAF_INUM, &vat->va_fileid, sizeof(vat->va_fileid));
	1228	if (vat->va_size != VNOVAL)
	1229	jrecord_leaf(jrec, JLEAF_SIZE, &vat->va_size, sizeof(vat->va_size));
	1230	if (vat->va_atime.tv_sec != VNOVAL)
	1231	jrecord_leaf(jrec, JLEAF_ATIME, &vat->va_atime, sizeof(vat->va_atime));
	1232	if (vat->va_mtime.tv_sec != VNOVAL)
	1233	jrecord_leaf(jrec, JLEAF_MTIME, &vat->va_mtime, sizeof(vat->va_mtime));
	1234	if (vat->va_ctime.tv_sec != VNOVAL)
	1235	jrecord_leaf(jrec, JLEAF_CTIME, &vat->va_ctime, sizeof(vat->va_ctime));
	1236	if (vat->va_gen != VNOVAL)
	1237	jrecord_leaf(jrec, JLEAF_GEN, &vat->va_gen, sizeof(vat->va_gen));
	1238	if (vat->va_flags != VNOVAL)
	1239	jrecord_leaf(jrec, JLEAF_FLAGS, &vat->va_flags, sizeof(vat->va_flags));
	1240	if (vat->va_rmajor != VNOVAL) {
	1241	udev_t rdev = makeudev(vat->va_rmajor, vat->va_rminor);
	1242	jrecord_leaf(jrec, JLEAF_UDEV, &rdev, sizeof(rdev));
	1243	jrecord_leaf(jrec, JLEAF_UMAJOR, &vat->va_rmajor, sizeof(vat->va_rmajor));
	1244	jrecord_leaf(jrec, JLEAF_UMINOR, &vat->va_rminor, sizeof(vat->va_rminor));
	1245	}
	1246	#if 0
	1247	if (vat->va_filerev != VNOVAL)
	1248	jrecord_leaf(jrec, JLEAF_FILEREV, &vat->va_filerev, sizeof(vat->va_filerev));
	1249	#endif
	1250	jrecord_pop(jrec, save);
	1251	}
	1252
	1253	/*
	1254	* Write out the creds used to issue a file operation. If a process is
	1255	* available write out additional tracking information related to the
	1256	* process.
	1257	*
	1258	* XXX additional tracking info
	1259	* XXX tty line info
	1260	*/
	1261	void
	1262	jrecord_write_cred(struct jrecord jrec, struct thread td, struct ucred *cred)
	1263	{
	1264	void *save;
	1265	struct proc *p;
	1266
	1267	save = jrecord_push(jrec, JTYPE_CRED);
	1268	jrecord_leaf(jrec, JLEAF_UID, &cred->cr_uid, sizeof(cred->cr_uid));
	1269	jrecord_leaf(jrec, JLEAF_GID, &cred->cr_gid, sizeof(cred->cr_gid));
	1270	if (td && (p = td->td_proc) != NULL) {
	1271	jrecord_leaf(jrec, JLEAF_PID, &p->p_pid, sizeof(p->p_pid));
	1272	jrecord_leaf(jrec, JLEAF_COMM, p->p_comm, sizeof(p->p_comm));
	1273	}
	1274	jrecord_pop(jrec, save);
	1275	}
	1276
	1277	/*
	1278	* Write out information required to identify a vnode
	1279	*
	1280	* XXX this needs work. We should write out the inode number as well,
	1281	* and in fact avoid writing out the file path for seqential writes
	1282	* occuring within e.g. a certain period of time.
	1283	*/
	1284	void
	1285	jrecord_write_vnode_ref(struct jrecord jrec, struct vnode vp)
	1286	{
	1287	struct nchandle nch;
	1288
	1289	nch.mount = vp->v_mount;
	1290	spin_lock_wr(&vp->v_spinlock);
	1291	TAILQ_FOREACH(nch.ncp, &vp->v_namecache, nc_vnode) {
	1292	if ((nch.ncp->nc_flag & (NCF_UNRESOLVED\|NCF_DESTROYED)) == 0)
	1293	break;
	1294	}
	1295	if (nch.ncp) {
	1296	cache_hold(&nch);
	1297	spin_unlock_wr(&vp->v_spinlock);
	1298	jrecord_write_path(jrec, JLEAF_PATH_REF, nch.ncp);
	1299	cache_drop(&nch);
	1300	} else {
	1301	spin_unlock_wr(&vp->v_spinlock);
	1302	}
	1303	}
	1304
	1305	void
	1306	jrecord_write_vnode_link(struct jrecord jrec, struct vnode vp,
	1307	struct namecache *notncp)
	1308	{
	1309	struct nchandle nch;
	1310
	1311	nch.mount = vp->v_mount;
	1312	spin_lock_wr(&vp->v_spinlock);
	1313	TAILQ_FOREACH(nch.ncp, &vp->v_namecache, nc_vnode) {
	1314	if (nch.ncp == notncp)
	1315	continue;
	1316	if ((nch.ncp->nc_flag & (NCF_UNRESOLVED\|NCF_DESTROYED)) == 0)
	1317	break;
	1318	}
	1319	if (nch.ncp) {
	1320	cache_hold(&nch);
	1321	spin_unlock_wr(&vp->v_spinlock);
	1322	jrecord_write_path(jrec, JLEAF_PATH_REF, nch.ncp);
	1323	cache_drop(&nch);
	1324	} else {
	1325	spin_unlock_wr(&vp->v_spinlock);
	1326	}
	1327	}
	1328
	1329	/*
	1330	* Write out the data represented by a pagelist
	1331	*/
	1332	void
	1333	jrecord_write_pagelist(struct jrecord *jrec, int16_t rectype,
	1334	struct vm_page *pglist, int rtvals, int pgcount,
	1335	off_t offset)
	1336	{
	1337	struct msf_buf *msf;
	1338	int error;
	1339	int b;
	1340	int i;
	1341
	1342	i = 0;
	1343	while (i < pgcount) {
	1344	/*
	1345	* Find the next valid section. Skip any invalid elements
	1346	*/
	1347	if (rtvals[i] != VM_PAGER_OK) {
	1348	++i;
	1349	offset += PAGE_SIZE;
	1350	continue;
	1351	}
	1352
	1353	/*
	1354	* Figure out how big the valid section is, capping I/O at what the
	1355	* MSFBUF can represent.
	1356	*/
	1357	b = i;
	1358	while (i < pgcount && i - b != XIO_INTERNAL_PAGES &&
	1359	rtvals[i] == VM_PAGER_OK
	1360	) {
	1361	++i;
	1362	}
	1363
	1364	/*
	1365	* And write it out.
	1366	*/
	1367	if (i - b) {
	1368	error = msf_map_pagelist(&msf, pglist + b, i - b, 0);
	1369	if (error == 0) {
	1370	kprintf("RECORD PUTPAGES %d\n", msf_buf_bytes(msf));
	1371	jrecord_leaf(jrec, JLEAF_SEEKPOS, &offset, sizeof(offset));
	1372	jrecord_leaf(jrec, rectype,
	1373	msf_buf_kva(msf), msf_buf_bytes(msf));
	1374	msf_buf_free(msf);
	1375	} else {
	1376	kprintf("jrecord_write_pagelist: mapping failure\n");
	1377	}
	1378	offset += (off_t)(i - b) << PAGE_SHIFT;
	1379	}
	1380	}
	1381	}
	1382
	/*
	 * Write out the data represented by a UIO.
	 *
	 * struct jwuio_info carries the journal record and the record type
	 * through the opaque argument of the msf_uio_iterate() callback.
	 */
	struct jwuio_info {
	    struct jrecord *jrec;	/* record being written to */
	    int16_t rectype;		/* leaf type used for each data chunk */
	};

	static int jrecord_write_uio_callback(void *info, char *buf, int bytes);
	1392
	1393	void
	1394	jrecord_write_uio(struct jrecord jrec, int16_t rectype, struct uio uio)
	1395	{
	1396	struct jwuio_info info = { jrec, rectype };
	1397	int error;
	1398
	1399	if (uio->uio_segflg != UIO_NOCOPY) {
	1400	jrecord_leaf(jrec, JLEAF_SEEKPOS, &uio->uio_offset,
	1401	sizeof(uio->uio_offset));
	1402	error = msf_uio_iterate(uio, jrecord_write_uio_callback, &info);
	1403	if (error)
	1404	kprintf("XXX warning uio iterate failed %d\n", error);
	1405	}
	1406	}
	1407
	1408	static int
	1409	jrecord_write_uio_callback(void info_arg, char buf, int bytes)
	1410	{
	1411	struct jwuio_info *info = info_arg;
	1412
	1413	jrecord_leaf(info->jrec, info->rectype, buf, bytes);
	1414	return(0);
	1415	}
	1416
	1417	void
	1418	jrecord_file_data(struct jrecord jrec, struct vnode vp,
	1419	off_t off, off_t bytes)
	1420	{
	1421	const int bufsize = 8192;
	1422	char *buf;
	1423	int error;
	1424	int n;
	1425
	1426	buf = kmalloc(bufsize, M_JOURNAL, M_WAITOK);
	1427	jrecord_leaf(jrec, JLEAF_SEEKPOS, &off, sizeof(off));
	1428	while (bytes) {
	1429	n = (bytes > bufsize) ? bufsize : (int)bytes;
	1430	error = vn_rdwr(UIO_READ, vp, buf, n, off, UIO_SYSSPACE, IO_NODELOCKED,
	1431	proc0.p_ucred, NULL);
	1432	if (error) {
	1433	jrecord_leaf(jrec, JLEAF_ERROR, &error, sizeof(error));
	1434	break;
	1435	}
	1436	jrecord_leaf(jrec, JLEAF_FILEDATA, buf, n);
	1437	bytes -= n;
	1438	off += n;
	1439	}
	1440	kfree(buf, M_JOURNAL);
	1441	}
	1442