gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2004 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* $DragonFly: src/sys/kern/vfs_journal.c,v 1.11 2005/03/05 05:08:27 dillon Exp $
	35	*/
	36	/*
	37	* Each mount point may have zero or more independently configured journals
	38	* attached to it. Each journal is represented by a memory FIFO and worker
	39	* thread. Journal events are streamed through the FIFO to the thread,
	40	* batched up (typically on one-second intervals), and written out by the
	41	* thread.
	42	*
	43	* Journal vnode ops are executed instead of mnt_vn_norm_ops when one or
	44	* more journals have been installed on a mount point. It becomes the
	45	* responsibility of the journal op to call the underlying normal op as
	46	* appropriate.
	47	*
	48	* The journaling protocol is intended to evolve into a two-way stream
	49	* whereby transaction IDs can be acknowledged by the journaling target
	50	* when the data has been committed to hard storage. Both implicit and
	51	* explicit acknowledgement schemes will be supported, depending on the
	52	* sophistication of the journaling stream, plus resynchronization and
	53	* restart when a journaling stream is interrupted. This information will
	54	* also be made available to journaling-aware filesystems to allow better
	55	* management of their own physical storage synchronization mechanisms as
	56	* well as to allow such filesystems to take direct advantage of the kernel's
	57	* journaling layer so they don't have to roll their own.
	58	*
	59	* In addition, the worker thread will have access to much larger
	60	* spooling areas than the memory buffer is able to provide by e.g.
	61	* reserving swap space, in order to absorb potentially long interruptions
	62	* of off-site journaling streams, and to prevent 'slow' off-site linkages
	63	* from radically slowing down local filesystem operations.
	64	*
	65	* Because of the non-trivial algorithms the journaling system will be
	66	* required to support, use of a worker thread is mandatory. Efficiencies
	67	* are maintained by utilizing the memory FIFO to batch transactions when
	68	* possible, reducing the number of gratuitous thread switches and taking
	69	* advantage of cpu caches through the use of shorter batched code paths
	70	* rather than trying to do everything in the context of the process
	71	* originating the filesystem op. In the future the memory FIFO can be
	72	* made per-cpu to remove BGL or other locking requirements.
	73	*/
	74	#include <sys/param.h>
	75	#include <sys/systm.h>
	76	#include <sys/buf.h>
	77	#include <sys/conf.h>
	78	#include <sys/kernel.h>
	79	#include <sys/queue.h>
	80	#include <sys/lock.h>
	81	#include <sys/malloc.h>
	82	#include <sys/mount.h>
	83	#include <sys/unistd.h>
	84	#include <sys/vnode.h>
	85	#include <sys/poll.h>
	86	#include <sys/mountctl.h>
	87	#include <sys/journal.h>
	88	#include <sys/file.h>
	89	#include <sys/proc.h>
	90	#include <sys/msfbuf.h>
	91
	92	#include <machine/limits.h>
	93
	94	#include <vm/vm.h>
	95	#include <vm/vm_object.h>
	96	#include <vm/vm_page.h>
	97	#include <vm/vm_pager.h>
	98	#include <vm/vnode_pager.h>
	99
	100	#include <sys/file2.h>
	101	#include <sys/thread2.h>
	102
	103	static int journal_attach(struct mount *mp);
	104	static void journal_detach(struct mount *mp);
	105	static int journal_install_vfs_journal(struct mount mp, struct file fp,
	106	const struct mountctl_install_journal *info);
	107	static int journal_remove_vfs_journal(struct mount *mp,
	108	const struct mountctl_remove_journal *info);
	109	static int journal_resync_vfs_journal(struct mount mp, const void ctl);
	110	static int journal_status_vfs_journal(struct mount *mp,
	111	const struct mountctl_status_journal *info,
	112	struct mountctl_journal_ret_status *rstat,
	113	int buflen, int *res);
	114	static void journal_thread(void *info);
	115
	116	static void journal_reserve(struct journal jo,
	117	struct journal_rawrecbeg **rawpp,
	118	int16_t streamid, int bytes);
	119	static void journal_extend(struct journal jo,
	120	struct journal_rawrecbeg **rawpp,
	121	int truncbytes, int bytes, int *newstreamrecp);
	122	static void journal_abort(struct journal *jo,
	123	struct journal_rawrecbeg **rawpp);
	124	static void journal_commit(struct journal *jo,
	125	struct journal_rawrecbeg **rawpp,
	126	int bytes, int closeout);
	127
	128	static void jrecord_init(struct journal *jo,
	129	struct jrecord *jrec, int16_t streamid);
	130	static struct journal_subrecord *jrecord_push(
	131	struct jrecord *jrec, int16_t rectype);
	132	static void jrecord_pop(struct jrecord jrec, struct journal_subrecord parent);
	133	static struct journal_subrecord jrecord_write(struct jrecord jrec,
	134	int16_t rectype, int bytes);
	135	static void jrecord_data(struct jrecord jrec, const void buf, int bytes);
	136	static void jrecord_done(struct jrecord *jrec, int abortit);
	137
	138	static int journal_setattr(struct vop_setattr_args *ap);
	139	static int journal_write(struct vop_write_args *ap);
	140	static int journal_fsync(struct vop_fsync_args *ap);
	141	static int journal_putpages(struct vop_putpages_args *ap);
	142	static int journal_setacl(struct vop_setacl_args *ap);
	143	static int journal_setextattr(struct vop_setextattr_args *ap);
	144	static int journal_ncreate(struct vop_ncreate_args *ap);
	145	static int journal_nmknod(struct vop_nmknod_args *ap);
	146	static int journal_nlink(struct vop_nlink_args *ap);
	147	static int journal_nsymlink(struct vop_nsymlink_args *ap);
	148	static int journal_nwhiteout(struct vop_nwhiteout_args *ap);
	149	static int journal_nremove(struct vop_nremove_args *ap);
	150	static int journal_nmkdir(struct vop_nmkdir_args *ap);
	151	static int journal_nrmdir(struct vop_nrmdir_args *ap);
	152	static int journal_nrename(struct vop_nrename_args *ap);
	153
	154	static struct vnodeopv_entry_desc journal_vnodeop_entries[] = {
	155	{ &vop_default_desc, vop_journal_operate_ap },
	156	{ &vop_mountctl_desc, (void *)journal_mountctl },
	157	{ &vop_setattr_desc, (void *)journal_setattr },
	158	{ &vop_write_desc, (void *)journal_write },
	159	{ &vop_fsync_desc, (void *)journal_fsync },
	160	{ &vop_putpages_desc, (void *)journal_putpages },
	161	{ &vop_setacl_desc, (void *)journal_setacl },
	162	{ &vop_setextattr_desc, (void *)journal_setextattr },
	163	{ &vop_ncreate_desc, (void *)journal_ncreate },
	164	{ &vop_nmknod_desc, (void *)journal_nmknod },
	165	{ &vop_nlink_desc, (void *)journal_nlink },
	166	{ &vop_nsymlink_desc, (void *)journal_nsymlink },
	167	{ &vop_nwhiteout_desc, (void *)journal_nwhiteout },
	168	{ &vop_nremove_desc, (void *)journal_nremove },
	169	{ &vop_nmkdir_desc, (void *)journal_nmkdir },
	170	{ &vop_nrmdir_desc, (void *)journal_nrmdir },
	171	{ &vop_nrename_desc, (void *)journal_nrename },
	172	{ NULL, NULL }
	173	};
	174
	175	static MALLOC_DEFINE(M_JOURNAL, "journal", "Journaling structures");
	176	static MALLOC_DEFINE(M_JFIFO, "journal-fifo", "Journal FIFO");
	177
	178	int
	179	journal_mountctl(struct vop_mountctl_args *ap)
	180	{
	181	struct mount *mp;
	182	int error = 0;
	183
	184	mp = ap->a_head.a_ops->vv_mount;
	185	KKASSERT(mp);
	186
	187	if (mp->mnt_vn_journal_ops == NULL) {
	188	switch(ap->a_op) {
	189	case MOUNTCTL_INSTALL_VFS_JOURNAL:
	190	error = journal_attach(mp);
	191	if (error == 0 && ap->a_ctllen != sizeof(struct mountctl_install_journal))
	192	error = EINVAL;
	193	if (error == 0 && ap->a_fp == NULL)
	194	error = EBADF;
	195	if (error == 0)
	196	error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl);
	197	if (TAILQ_EMPTY(&mp->mnt_jlist))
	198	journal_detach(mp);
	199	break;
	200	case MOUNTCTL_REMOVE_VFS_JOURNAL:
	201	case MOUNTCTL_RESYNC_VFS_JOURNAL:
	202	case MOUNTCTL_STATUS_VFS_JOURNAL:
	203	error = ENOENT;
	204	break;
	205	default:
	206	error = EOPNOTSUPP;
	207	break;
	208	}
	209	} else {
	210	switch(ap->a_op) {
	211	case MOUNTCTL_INSTALL_VFS_JOURNAL:
	212	if (ap->a_ctllen != sizeof(struct mountctl_install_journal))
	213	error = EINVAL;
	214	if (error == 0 && ap->a_fp == NULL)
	215	error = EBADF;
	216	if (error == 0)
	217	error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl);
	218	break;
	219	case MOUNTCTL_REMOVE_VFS_JOURNAL:
	220	if (ap->a_ctllen != sizeof(struct mountctl_remove_journal))
	221	error = EINVAL;
	222	if (error == 0)
	223	error = journal_remove_vfs_journal(mp, ap->a_ctl);
	224	if (TAILQ_EMPTY(&mp->mnt_jlist))
	225	journal_detach(mp);
	226	break;
	227	case MOUNTCTL_RESYNC_VFS_JOURNAL:
	228	if (ap->a_ctllen != 0)
	229	error = EINVAL;
	230	error = journal_resync_vfs_journal(mp, ap->a_ctl);
	231	break;
	232	case MOUNTCTL_STATUS_VFS_JOURNAL:
	233	if (ap->a_ctllen != sizeof(struct mountctl_status_journal))
	234	error = EINVAL;
	235	if (error == 0) {
	236	error = journal_status_vfs_journal(mp, ap->a_ctl,
	237	ap->a_buf, ap->a_buflen, ap->a_res);
	238	}
	239	break;
	240	default:
	241	error = EOPNOTSUPP;
	242	break;
	243	}
	244	}
	245	return (error);
	246	}
	247
	248	/*
	249	* High level mount point setup. When a
	250	*/
	251	static int
	252	journal_attach(struct mount *mp)
	253	{
	254	vfs_add_vnodeops(mp, &mp->mnt_vn_journal_ops, journal_vnodeop_entries);
	255	return(0);
	256	}
	257
	258	static void
	259	journal_detach(struct mount *mp)
	260	{
	261	if (mp->mnt_vn_journal_ops)
	262	vfs_rm_vnodeops(&mp->mnt_vn_journal_ops);
	263	}
	264
	265	/*
	266	* Install a journal on a mount point. Each journal has an associated worker
	267	* thread which is responsible for buffering and spooling the data to the
	268	* target. A mount point may have multiple journals attached to it. An
	269	* initial start record is generated when the journal is associated.
	270	*/
	271	static int
	272	journal_install_vfs_journal(struct mount mp, struct file fp,
	273	const struct mountctl_install_journal *info)
	274	{
	275	struct journal *jo;
	276	struct jrecord jrec;
	277	int error = 0;
	278	int size;
	279
	280	jo = malloc(sizeof(struct journal), M_JOURNAL, M_WAITOK\|M_ZERO);
	281	bcopy(info->id, jo->id, sizeof(jo->id));
	282	jo->flags = info->flags & ~(MC_JOURNAL_ACTIVE \| MC_JOURNAL_STOP_REQ);
	283
	284	/*
	285	* Memory FIFO size, round to nearest power of 2
	286	*/
	287	if (info->membufsize) {
	288	if (info->membufsize < 65536)
	289	size = 65536;
	290	else if (info->membufsize > 128 * 1024 * 1024)
	291	size = 128 * 1024 * 1024;
	292	else
	293	size = (int)info->membufsize;
	294	} else {
	295	size = 1024 * 1024;
	296	}
	297	jo->fifo.size = 1;
	298	while (jo->fifo.size < size)
	299	jo->fifo.size <<= 1;
	300
	301	/*
	302	* Other parameters. If not specified the starting transaction id
	303	* will be the current date.
	304	*/
	305	if (info->transid) {
	306	jo->transid = info->transid;
	307	} else {
	308	struct timespec ts;
	309	getnanotime(&ts);
	310	jo->transid = ((int64_t)ts.tv_sec << 30) \| ts.tv_nsec;
	311	}
	312
	313	jo->fp = fp;
	314
	315	/*
	316	* Allocate the memory FIFO
	317	*/
	318	jo->fifo.mask = jo->fifo.size - 1;
	319	jo->fifo.membase = malloc(jo->fifo.size, M_JFIFO, M_WAITOK\|M_ZERO\|M_NULLOK);
	320	if (jo->fifo.membase == NULL)
	321	error = ENOMEM;
	322
	323	/*
	324	* Create the worker thread and generate the association record.
	325	*/
	326	if (error) {
	327	free(jo, M_JOURNAL);
	328	} else {
	329	fhold(fp);
	330	jo->flags \|= MC_JOURNAL_ACTIVE;
	331	lwkt_create(journal_thread, jo, NULL, &jo->thread,
	332	TDF_STOPREQ, -1, "journal %.*s", JIDMAX, jo->id);
	333	lwkt_setpri(&jo->thread, TDPRI_KERN_DAEMON);
	334	lwkt_schedule(&jo->thread);
	335
	336	jrecord_init(jo, &jrec, JREC_STREAMID_DISCONT);
	337	jrecord_write(&jrec, JTYPE_ASSOCIATE, 0);
	338	jrecord_done(&jrec, 0);
	339	TAILQ_INSERT_TAIL(&mp->mnt_jlist, jo, jentry);
	340	}
	341	return(error);
	342	}
	343
	344	/*
	345	* Disassociate a journal from a mount point and terminate its worker thread.
	346	* A final termination record is written out before the file pointer is
	347	* dropped.
	348	*/
	349	static int
	350	journal_remove_vfs_journal(struct mount *mp,
	351	const struct mountctl_remove_journal *info)
	352	{
	353	struct journal *jo;
	354	struct jrecord jrec;
	355	int error;
	356
	357	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	358	if (bcmp(jo->id, info->id, sizeof(jo->id)) == 0)
	359	break;
	360	}
	361	if (jo) {
	362	error = 0;
	363	TAILQ_REMOVE(&mp->mnt_jlist, jo, jentry);
	364
	365	jrecord_init(jo, &jrec, JREC_STREAMID_DISCONT);
	366	jrecord_write(&jrec, JTYPE_DISASSOCIATE, 0);
	367	jrecord_done(&jrec, 0);
	368
	369	jo->flags \|= MC_JOURNAL_STOP_REQ \| (info->flags & MC_JOURNAL_STOP_IMM);
	370	wakeup(&jo->fifo);
	371	while (jo->flags & MC_JOURNAL_ACTIVE) {
	372	tsleep(jo, 0, "jwait", 0);
	373	}
	374	lwkt_free_thread(&jo->thread); /* XXX SMP */
	375	if (jo->fp)
	376	fdrop(jo->fp, curthread);
	377	if (jo->fifo.membase)
	378	free(jo->fifo.membase, M_JFIFO);
	379	free(jo, M_JOURNAL);
	380	} else {
	381	error = EINVAL;
	382	}
	383	return (error);
	384	}
	385
	386	static int
	387	journal_resync_vfs_journal(struct mount mp, const void ctl)
	388	{
	389	return(EINVAL);
	390	}
	391
	392	static int
	393	journal_status_vfs_journal(struct mount *mp,
	394	const struct mountctl_status_journal *info,
	395	struct mountctl_journal_ret_status *rstat,
	396	int buflen, int *res)
	397	{
	398	struct journal *jo;
	399	int error = 0;
	400	int index;
	401
	402	index = 0;
	403	*res = 0;
	404	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	405	if (info->index == MC_JOURNAL_INDEX_ID) {
	406	if (bcmp(jo->id, info->id, sizeof(jo->id)) != 0)
	407	continue;
	408	} else if (info->index >= 0) {
	409	if (info->index < index)
	410	continue;
	411	} else if (info->index != MC_JOURNAL_INDEX_ALL) {
	412	continue;
	413	}
	414	if (buflen < sizeof(*rstat)) {
	415	if (*res)
	416	rstat[-1].flags \|= MC_JOURNAL_STATUS_MORETOCOME;
	417	else
	418	error = EINVAL;
	419	break;
	420	}
	421	bzero(rstat, sizeof(*rstat));
	422	rstat->recsize = sizeof(*rstat);
	423	bcopy(jo->id, rstat->id, sizeof(jo->id));
	424	rstat->index = index;
	425	rstat->membufsize = jo->fifo.size;
	426	rstat->membufused = jo->fifo.xindex - jo->fifo.rindex;
	427	rstat->membufiopend = jo->fifo.windex - jo->fifo.rindex;
	428	rstat->bytessent = jo->total_acked;
	429	++rstat;
	430	++index;
	431	res += sizeof(rstat);
	432	buflen -= sizeof(*rstat);
	433	}
	434	return(error);
	435	}
	436	/*
	437	* The per-journal worker thread is responsible for writing out the
	438	* journal's FIFO to the target stream.
	439	*/
	440	static void
	441	journal_thread(void *info)
	442	{
	443	struct journal *jo = info;
	444	struct journal_rawrecbeg *rawp;
	445	int bytes;
	446	int error;
	447	int avail;
	448	int res;
	449
	450	for (;;) {
	451	/*
	452	* Calculate the number of bytes available to write. This buffer
	453	* area may contain reserved records so we can't just write it out
	454	* without further checks.
	455	*/
	456	bytes = jo->fifo.windex - jo->fifo.rindex;
	457
	458	/*
	459	* sleep if no bytes are available or if an incomplete record is
	460	* encountered (it needs to be filled in before we can write it
	461	* out), and skip any pad records that we encounter.
	462	*/
	463	if (bytes == 0) {
	464	if (jo->flags & MC_JOURNAL_STOP_REQ)
	465	break;
	466	tsleep(&jo->fifo, 0, "jfifo", hz);
	467	continue;
	468	}
	469
	470	/*
	471	* Sleep if we can not go any further due to hitting an incomplete
	472	* record. This case should occur rarely but may have to be better
	473	* optimized XXX.
	474	*/
	475	rawp = (void *)(jo->fifo.membase + (jo->fifo.rindex & jo->fifo.mask));
	476	if (rawp->begmagic == JREC_INCOMPLETEMAGIC) {
	477	tsleep(&jo->fifo, 0, "jpad", hz);
	478	continue;
	479	}
	480
	481	/*
	482	* Skip any pad records. We do not write out pad records if we can
	483	* help it.
	484	*
	485	* If xindex is caught up to rindex it gets incremented along with
	486	* rindex. XXX
	487	*/
	488	if (rawp->streamid == JREC_STREAMID_PAD) {
	489	if (jo->fifo.rindex == jo->fifo.xindex)
	490	jo->fifo.xindex += (rawp->recsize + 15) & ~15;
	491	jo->fifo.rindex += (rawp->recsize + 15) & ~15;
	492	jo->total_acked += bytes;
	493	KKASSERT(jo->fifo.windex - jo->fifo.rindex >= 0);
	494	continue;
	495	}
	496
	497	/*
	498	* 'bytes' is the amount of data that can potentially be written out.
	499	* Calculate 'res', the amount of data that can actually be written
	500	* out. res is bounded either by hitting the end of the physical
	501	* memory buffer or by hitting an incomplete record. Incomplete
	502	* records often occur due to the way the space reservation model
	503	* works.
	504	*/
	505	res = 0;
	506	avail = jo->fifo.size - (jo->fifo.rindex & jo->fifo.mask);
	507	while (res < bytes && rawp->begmagic == JREC_BEGMAGIC) {
	508	res += (rawp->recsize + 15) & ~15;
	509	if (res >= avail) {
	510	KKASSERT(res == avail);
	511	break;
	512	}
	513	rawp = (void )((char )rawp + ((rawp->recsize + 15) & ~15));
	514	}
	515
	516	/*
	517	* Issue the write and deal with any errors or other conditions.
	518	* For now assume blocking I/O. Since we are record-aware the
	519	* code cannot yet handle partial writes.
	520	*
	521	* XXX EWOULDBLOCK/NBIO
	522	* XXX notification on failure
	523	* XXX permanent verses temporary failures
	524	* XXX two-way acknowledgement stream in the return direction / xindex
	525	*/
	526	bytes = res;
	527	error = fp_write(jo->fp,
	528	jo->fifo.membase + (jo->fifo.rindex & jo->fifo.mask),
	529	bytes, &res);
	530	if (error) {
	531	printf("journal_thread(%s) write, error %d\n", jo->id, error);
	532	/* XXX */
	533	} else {
	534	KKASSERT(res == bytes);
	535	}
	536
	537	/*
	538	* Advance rindex. XXX for now also advance xindex, which will
	539	* eventually be advanced only when the target acknowledges the
	540	* sequence space.
	541	*/
	542	jo->fifo.rindex += bytes;
	543	jo->fifo.xindex += bytes;
	544	jo->total_acked += bytes;
	545	KKASSERT(jo->fifo.windex - jo->fifo.rindex >= 0);
	546	if (jo->flags & MC_JOURNAL_WWAIT) {
	547	jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */
	548	wakeup(&jo->fifo.windex);
	549	}
	550	}
	551	jo->flags &= ~MC_JOURNAL_ACTIVE;
	552	wakeup(jo);
	553	wakeup(&jo->fifo.windex);
	554	}
	555
	556	/*
	557	* This builds a pad record which the journaling thread will skip over. Pad
	558	* records are required when we are unable to reserve sufficient stream space
	559	* due to insufficient space at the end of the physical memory fifo.
	560	*/
	561	static
	562	void
	563	journal_build_pad(struct journal_rawrecbeg *rawp, int recsize)
	564	{
	565	struct journal_rawrecend *rendp;
	566
	567	KKASSERT((recsize & 15) == 0 && recsize >= 16);
	568
	569	rawp->streamid = JREC_STREAMID_PAD;
	570	rawp->recsize = recsize; /* must be 16-byte aligned */
	571	rawp->seqno = 0;
	572	/*
	573	* WARNING, rendp may overlap rawp->seqno. This is necessary to
	574	* allow PAD records to fit in 16 bytes. Use cpu_mb1() to
	575	* hopefully cause the compiler to not make any assumptions.
	576	*/
	577	rendp = (void )((char )rawp + rawp->recsize - sizeof(*rendp));
	578	rendp->endmagic = JREC_ENDMAGIC;
	579	rendp->check = 0;
	580	rendp->recsize = rawp->recsize;
	581
	582	/*
	583	* Set the begin magic last. This is what will allow the journal
	584	* thread to write the record out.
	585	*/
	586	cpu_mb1();
	587	rawp->begmagic = JREC_BEGMAGIC;
	588	}
	589
	590	/*
	591	* Wake up the worker thread if the FIFO is more then half full or if
	592	* someone is waiting for space to be freed up. Otherwise let the
	593	* heartbeat deal with it. Being able to avoid waking up the worker
	594	* is the key to the journal's cpu performance.
	595	*/
	596	static __inline
	597	void
	598	journal_commit_wakeup(struct journal *jo)
	599	{
	600	int avail;
	601
	602	avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex);
	603	KKASSERT(avail >= 0);
	604	if ((avail < (jo->fifo.size >> 1)) \|\| (jo->flags & MC_JOURNAL_WWAIT))
	605	wakeup(&jo->fifo);
	606	}
	607
	608	/*
	609	* Create a new BEGIN stream record with the specified streamid and the
	610	* specified amount of payload space. *rawpp will be set to point to the
	611	* base of the new stream record and a pointer to the base of the payload
	612	* space will be returned. *rawpp does not need to be pre-NULLd prior to
	613	* making this call.
	614	*
	615	* A stream can be extended, aborted, or committed by other API calls
	616	* below. This may result in a sequence of potentially disconnected
	617	* stream records to be output to the journaling target. The first record
	618	* (the one created by this function) will be marked JREC_STREAMCTL_BEGIN,
	619	* while the last record on commit or abort will be marked JREC_STREAMCTL_END
	620	* (and possibly also JREC_STREAMCTL_ABORTED). The last record could wind
	621	* up being the same as the first, in which case the bits are all set in
	622	* the first record.
	623	*
	624	* The stream record is created in an incomplete state by setting the begin
	625	* magic to JREC_INCOMPLETEMAGIC. This prevents the worker thread from
	626	* flushing the fifo past our record until we have finished populating it.
	627	* Other threads can reserve and operate on their own space without stalling
	628	* but the stream output will stall until we have completed operations. The
	629	* memory FIFO is intended to be large enough to absorb such situations
	630	* without stalling out other threads.
	631	*/
	632	static
	633	void *
	634	journal_reserve(struct journal jo, struct journal_rawrecbeg *rawpp,
	635	int16_t streamid, int bytes)
	636	{
	637	struct journal_rawrecbeg *rawp;
	638	int avail;
	639	int availtoend;
	640	int req;
	641
	642	/*
	643	* Add header and trailer overheads to the passed payload. Note that
	644	* the passed payload size need not be aligned in any way.
	645	*/
	646	bytes += sizeof(struct journal_rawrecbeg);
	647	bytes += sizeof(struct journal_rawrecend);
	648
	649	for (;;) {
	650	/*
	651	* First, check boundary conditions. If the request would wrap around
	652	* we have to skip past the ending block and return to the beginning
	653	* of the FIFO's buffer. Calculate 'req' which is the actual number
	654	* of bytes being reserved, including wrap-around dead space.
	655	*
	656	* Neither 'bytes' or 'req' are aligned.
	657	*
	658	* Note that availtoend is not truncated to avail and so cannot be
	659	* used to determine whether the reservation is possible by itself.
	660	* Also, since all fifo ops are 16-byte aligned, we can check
	661	* the size before calculating the aligned size.
	662	*/
	663	availtoend = jo->fifo.size - (jo->fifo.windex & jo->fifo.mask);
	664	KKASSERT((availtoend & 15) == 0);
	665	if (bytes > availtoend)
	666	req = bytes + availtoend; /* add pad to end */
	667	else
	668	req = bytes;
	669
	670	/*
	671	* Next calculate the total available space and see if it is
	672	* sufficient. We cannot overwrite previously buffered data
	673	* past xindex because otherwise we would not be able to restart
	674	* a broken link at the target's last point of commit.
	675	*/
	676	avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex);
	677	KKASSERT(avail >= 0 && (avail & 15) == 0);
	678
	679	if (avail < req) {
	680	/* XXX MC_JOURNAL_STOP_IMM */
	681	jo->flags \|= MC_JOURNAL_WWAIT;
	682	tsleep(&jo->fifo.windex, 0, "jwrite", 0);
	683	continue;
	684	}
	685
	686	/*
	687	* Create a pad record for any dead space and create an incomplete
	688	* record for the live space, then return a pointer to the
	689	* contiguous buffer space that was requested.
	690	*
	691	* NOTE: The worker thread will not flush past an incomplete
	692	* record, so the reserved space can be filled in at-will. The
	693	* journaling code must also be aware the reserved sections occuring
	694	* after this one will also not be written out even if completed
	695	* until this one is completed.
	696	*/
	697	rawp = (void *)(jo->fifo.membase + (jo->fifo.windex & jo->fifo.mask));
	698	if (req != bytes) {
	699	journal_build_pad(rawp, availtoend);
	700	rawp = (void *)jo->fifo.membase;
	701	}
	702	rawp->begmagic = JREC_INCOMPLETEMAGIC; /* updated by abort/commit */
	703	rawp->recsize = bytes; /* (unaligned size) */
	704	rawp->streamid = streamid \| JREC_STREAMCTL_BEGIN;
	705	rawp->seqno = 0; /* set by caller */
	706
	707	/*
	708	* Issue a memory barrier to guarentee that the record data has been
	709	* properly initialized before we advance the write index and return
	710	* a pointer to the reserved record. Otherwise the worker thread
	711	* could accidently run past us.
	712	*
	713	* Note that stream records are always 16-byte aligned.
	714	*/
	715	cpu_mb1();
	716	jo->fifo.windex += (req + 15) & ~15;
	717	*rawpp = rawp;
	718	return(rawp + 1);
	719	}
	720	/* not reached */
	721	*rawpp = NULL;
	722	return(NULL);
	723	}
	724
	725	/*
	726	* Attempt to extend the stream record by <bytes> worth of payload space.
	727	*
	728	* If it is possible to extend the existing stream record no truncation
	729	* occurs and the record is extended as specified. A pointer to the
	730	* truncation offset within the payload space is returned.
	731	*
	732	* If it is not possible to do this the existing stream record is truncated
	733	* and committed, and a new stream record of size <bytes> is created. A
	734	* pointer to the base of the new stream record's payload space is returned.
	735	*
	736	* *rawpp is set to the new reservation in the case of a new record but
	737	* the caller cannot depend on a comparison with the old rawp to determine if
	738	* this case occurs because we could end up using the same memory FIFO
	739	* offset for the new stream record. Use *newstreamrecp instead.
	740	*/
	741	static void *
	742	journal_extend(struct journal jo, struct journal_rawrecbeg *rawpp,
	743	int truncbytes, int bytes, int *newstreamrecp)
	744	{
	745	struct journal_rawrecbeg *rawp;
	746	int16_t streamid;
	747	int availtoend;
	748	int avail;
	749	int osize;
	750	int nsize;
	751	int wbase;
	752	void *rptr;
	753
	754	*newstreamrecp = 0;
	755	rawp = *rawpp;
	756	osize = (rawp->recsize + 15) & ~15;
	757	nsize = (rawp->recsize + bytes + 15) & ~15;
	758	wbase = (char *)rawp - jo->fifo.membase;
	759
	760	/*
	761	* If the aligned record size does not change we can trivially adjust
	762	* the record size.
	763	*/
	764	if (nsize == osize) {
	765	rawp->recsize += bytes;
	766	return((char *)(rawp + 1) + truncbytes);
	767	}
	768
	769	/*
	770	* If the fifo's write index hasn't been modified since we made the
	771	* reservation and we do not hit any boundary conditions, we can
	772	* trivially make the record smaller or larger.
	773	*/
	774	if ((jo->fifo.windex & jo->fifo.mask) == wbase + osize) {
	775	availtoend = jo->fifo.size - wbase;
	776	avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex) + osize;
	777	KKASSERT((availtoend & 15) == 0);
	778	KKASSERT((avail & 15) == 0);
	779	if (nsize <= avail && nsize <= availtoend) {
	780	jo->fifo.windex += nsize - osize;
	781	rawp->recsize += bytes;
	782	return((char *)(rawp + 1) + truncbytes);
	783	}
	784	}
	785
	786	/*
	787	* It was not possible to extend the buffer. Commit the current
	788	* buffer and create a new one. We manually clear the BEGIN mark that
	789	* journal_reserve() creates (because this is a continuing record, not
	790	* the start of a new stream).
	791	*/
	792	streamid = rawp->streamid & JREC_STREAMID_MASK;
	793	journal_commit(jo, rawpp, truncbytes, 0);
	794	rptr = journal_reserve(jo, rawpp, streamid, bytes);
	795	rawp = *rawpp;
	796	rawp->streamid &= ~JREC_STREAMCTL_BEGIN;
	797	*newstreamrecp = 1;
	798	return(rptr);
	799	}
	800
	801	/*
	802	* Abort a journal record. If the transaction record represents a stream
	803	* BEGIN and we can reverse the fifo's write index we can simply reverse
	804	* index the entire record, as if it were never reserved in the first place.
	805	*
	806	* Otherwise we set the JREC_STREAMCTL_ABORTED bit and commit the record
	807	* with the payload truncated to 0 bytes.
	808	*/
	809	static void
	810	journal_abort(struct journal jo, struct journal_rawrecbeg *rawpp)
	811	{
	812	struct journal_rawrecbeg *rawp;
	813	int osize;
	814
	815	rawp = *rawpp;
	816	osize = (rawp->recsize + 15) & ~15;
	817
	818	if ((rawp->streamid & JREC_STREAMCTL_BEGIN) &&
	819	(jo->fifo.windex & jo->fifo.mask) ==
	820	(char *)rawp - jo->fifo.membase + osize)
	821	{
	822	jo->fifo.windex -= osize;
	823	*rawpp = NULL;
	824	} else {
	825	rawp->streamid \|= JREC_STREAMCTL_ABORTED;
	826	journal_commit(jo, rawpp, 0, 1);
	827	}
	828	}
	829
	830	/*
	831	* Commit a journal record and potentially truncate it to the specified
	832	* number of payload bytes. If you do not want to truncate the record,
	833	* simply pass -1 for the bytes parameter. Do not pass rawp->recsize, that
	834	* field includes header and trailer and will not be correct. Note that
	835	* passing 0 will truncate the entire data payload of the record.
	836	*
	837	* The logical stream is terminated by this function.
	838	*
	839	* If truncation occurs, and it is not possible to physically optimize the
	840	* memory FIFO due to other threads having reserved space after ours,
	841	* the remaining reserved space will be covered by a pad record.
	842	*/
	843	static void
	844	journal_commit(struct journal jo, struct journal_rawrecbeg *rawpp,
	845	int bytes, int closeout)
	846	{
	847	struct journal_rawrecbeg *rawp;
	848	struct journal_rawrecend *rendp;
	849	int osize;
	850	int nsize;
	851
	852	rawp = *rawpp;
	853	*rawpp = NULL;
	854
	855	KKASSERT((char *)rawp >= jo->fifo.membase &&
	856	(char *)rawp + rawp->recsize <= jo->fifo.membase + jo->fifo.size);
	857	KKASSERT(((intptr_t)rawp & 15) == 0);
	858
	859	/*
	860	* Truncate the record if necessary. If the FIFO write index as still
	861	* at the end of our record we can optimally backindex it. Otherwise
	862	* we have to insert a pad record to cover the dead space.
	863	*
	864	* We calculate osize which is the 16-byte-aligned original recsize.
	865	* We calculate nsize which is the 16-byte-aligned new recsize.
	866	*
	867	* Due to alignment issues or in case the passed truncation bytes is
	868	* the same as the original payload, nsize may be equal to osize even
	869	* if the committed bytes is less then the originally reserved bytes.
	870	*/
	871	if (bytes >= 0) {
	872	KKASSERT(bytes >= 0 && bytes <= rawp->recsize - sizeof(struct journal_rawrecbeg) - sizeof(struct journal_rawrecend));
	873	osize = (rawp->recsize + 15) & ~15;
	874	rawp->recsize = bytes + sizeof(struct journal_rawrecbeg) +
	875	sizeof(struct journal_rawrecend);
	876	nsize = (rawp->recsize + 15) & ~15;
	877	KKASSERT(nsize <= osize);
	878	if (osize == nsize) {
	879	/* do nothing */
	880	} else if ((jo->fifo.windex & jo->fifo.mask) == (char *)rawp - jo->fifo.membase + osize) {
	881	/* we are able to backindex the fifo */
	882	jo->fifo.windex -= osize - nsize;
	883	} else {
	884	/* we cannot backindex the fifo, emplace a pad in the dead space */
	885	journal_build_pad((void )((char )rawp + nsize), osize - nsize);
	886	}
	887	}
	888
	889	/*
	890	* Fill in the trailer. Note that unlike pad records, the trailer will
	891	* never overlap the header.
	892	*/
	893	rendp = (void )((char )rawp +
	894	((rawp->recsize + 15) & ~15) - sizeof(*rendp));
	895	rendp->endmagic = JREC_ENDMAGIC;
	896	rendp->recsize = rawp->recsize;
	897	rendp->check = 0; /* XXX check word, disabled for now */
	898
	899	/*
	900	* Fill in begmagic last. This will allow the worker thread to proceed.
	901	* Use a memory barrier to guarentee write ordering. Mark the stream
	902	* as terminated if closeout is set. This is the typical case.
	903	*/
	904	if (closeout)
	905	rawp->streamid \|= JREC_STREAMCTL_END;
	906	cpu_mb1(); /* memory barrier */
	907	rawp->begmagic = JREC_BEGMAGIC;
	908
	909	journal_commit_wakeup(jo);
	910	}
	911
	912	/************************************************************************
	913	* TRANSACTION SUPPORT ROUTINES *
	914	************************************************************************
	915	*
	916	* JRECORD_*() - routines to create subrecord transactions and embed them
	917	* in the logical streams managed by the journal_*() routines.
	918	*/
	919
	920	static int16_t sid = JREC_STREAMID_JMIN;
	921
	922	/*
	923	* Initialize the passed jrecord structure and start a new stream transaction
	924	* by reserving an initial build space in the journal's memory FIFO.
	925	*/
	926	static void
	927	jrecord_init(struct journal jo, struct jrecord jrec, int16_t streamid)
	928	{
	929	bzero(jrec, sizeof(*jrec));
	930	jrec->jo = jo;
	931	if (streamid < 0) {
	932	streamid = sid++; /* XXX need to track stream ids! */
	933	if (sid == JREC_STREAMID_JMAX)
	934	sid = JREC_STREAMID_JMIN;
	935	}
	936	jrec->streamid = streamid;
	937	jrec->stream_residual = JREC_DEFAULTSIZE;
	938	jrec->stream_reserved = jrec->stream_residual;
	939	jrec->stream_ptr =
	940	journal_reserve(jo, &jrec->rawp, streamid, jrec->stream_reserved);
	941	}
	942
	943	/*
	944	* Push a recursive record type. All pushes should have matching pops.
	945	* The old parent is returned and the newly pushed record becomes the
	946	* new parent. Note that the old parent's pointer may already be invalid
	947	* or may become invalid if jrecord_write() had to build a new stream
	948	* record, so the caller should not mess with the returned pointer in
	949	* any way other then to save it.
	950	*/
	951	static
	952	struct journal_subrecord *
	953	jrecord_push(struct jrecord *jrec, int16_t rectype)
	954	{
	955	struct journal_subrecord *save;
	956
	957	save = jrec->parent;
	958	jrec->parent = jrecord_write(jrec, rectype\|JMASK_NESTED, 0);
	959	jrec->last = NULL;
	960	KKASSERT(jrec->parent != NULL);
	961	++jrec->pushcount;
	962	++jrec->pushptrgood; /* cleared on flush */
	963	return(save);
	964	}
	965
	966	/*
	967	* Pop a previously pushed sub-transaction. We must set JMASK_LAST
	968	* on the last record written within the subtransaction. If the last
	969	* record written is not accessible or if the subtransaction is empty,
	970	* we must write out a pad record with JMASK_LAST set before popping.
	971	*
	972	* When popping a subtransaction the parent record's recsize field
	973	* will be properly set. If the parent pointer is no longer valid
	974	* (which can occur if the data has already been flushed out to the
	975	* stream), the protocol spec allows us to leave it 0.
	976	*
	977	* The saved parent pointer which we restore may or may not be valid,
	978	* and if not valid may or may not be NULL, depending on the value
	979	* of pushptrgood.
	980	*/
	981	static void
	982	jrecord_pop(struct jrecord jrec, struct journal_subrecord save)
	983	{
	984	struct journal_subrecord *last;
	985
	986	KKASSERT(jrec->pushcount > 0);
	987	KKASSERT(jrec->residual == 0);
	988
	989	/*
	990	* Set JMASK_LAST on the last record we wrote at the current
	991	* level. If last is NULL we either no longer have access to the
	992	* record or the subtransaction was empty and we must write out a pad
	993	* record.
	994	*/
	995	if ((last = jrec->last) == NULL) {
	996	jrecord_write(jrec, JLEAF_PAD\|JMASK_LAST, 0);
	997	last = jrec->last; /* reload after possible flush */
	998	} else {
	999	last->rectype \|= JMASK_LAST;
	1000	}
	1001
	1002	/*
	1003	* pushptrgood tells us how many levels of parent record pointers
	1004	* are valid. The jrec only stores the current parent record pointer
	1005	* (and it is only valid if pushptrgood != 0). The higher level parent
	1006	* record pointers are saved by the routines calling jrecord_push() and
	1007	* jrecord_pop(). These pointers may become stale and we determine
	1008	* that fact by tracking the count of valid parent pointers with
	1009	* pushptrgood. Pointers become invalid when their related stream
	1010	* record gets pushed out.
	1011	*
	1012	* If no pointer is available (the data has already been pushed out),
	1013	* then no fixup of e.g. the length field is possible for non-leaf
	1014	* nodes. The protocol allows for this situation by placing a larger
	1015	* burden on the program scanning the stream on the other end.
	1016	*
	1017	* [parentA]
	1018	* [node X]
	1019	* [parentB]
	1020	* [node Y]
	1021	* [node Z]
	1022	* (pop B) see NOTE B
	1023	* (pop A) see NOTE A
	1024	*
	1025	* NOTE B: This pop sets LAST in node Z if the node is still accessible,
	1026	* else a PAD record is appended and LAST is set in that.
	1027	*
	1028	* This pop sets the record size in parentB if parentB is still
	1029	* accessible, else the record size is left 0 (the scanner must
	1030	* deal with that).
	1031	*
	1032	* This pop sets the new 'last' record to parentB, the pointer
	1033	* to which may or may not still be accessible.
	1034	*
	1035	* NOTE A: This pop sets LAST in parentB if the node is still accessible,
	1036	* else a PAD record is appended and LAST is set in that.
	1037	*
	1038	* This pop sets the record size in parentA if parentA is still
	1039	* accessible, else the record size is left 0 (the scanner must
	1040	* deal with that).
	1041	*
	1042	* This pop sets the new 'last' record to parentA, the pointer
	1043	* to which may or may not still be accessible.
	1044	*
	1045	* Also note that the last record in the stream transaction, which in
	1046	* the above example is parentA, does not currently have the LAST bit
	1047	* set.
	1048	*
	1049	* The current parent becomes the last record relative to the
	1050	* saved parent passed into us. It's validity is based on
	1051	* whether pushptrgood is non-zero prior to decrementing. The saved
	1052	* parent becomes the new parent, and its validity is based on whether
	1053	* pushptrgood is non-zero after decrementing.
	1054	*
	1055	* The old jrec->parent may be NULL if it is no longer accessible.
	1056	* If pushptrgood is non-zero, however, it is guarenteed to not
	1057	* be NULL (since no flush occured).
	1058	*/
	1059	jrec->last = jrec->parent;
	1060	--jrec->pushcount;
	1061	if (jrec->pushptrgood) {
	1062	KKASSERT(jrec->last != NULL && last != NULL);
	1063	if (--jrec->pushptrgood == 0) {
	1064	jrec->parent = NULL; /* 'save' contains garbage or NULL */
	1065	} else {
	1066	KKASSERT(save != NULL);
	1067	jrec->parent = save; /* 'save' must not be NULL */
	1068	}
	1069
	1070	/*
	1071	* Set the record size in the old parent. 'last' still points to
	1072	* the original last record in the subtransaction being popped,
	1073	* jrec->last points to the old parent (which became the last
	1074	* record relative to the new parent being popped into).
	1075	*/
	1076	jrec->last->recsize = (char )last + last->recsize - (char )jrec->last;
	1077	} else {
	1078	jrec->parent = NULL;
	1079	KKASSERT(jrec->last == NULL);
	1080	}
	1081	}
	1082
	1083	/*
	1084	* Write out a leaf record, including associated data.
	1085	*/
	1086	static
	1087	void
	1088	jrecord_leaf(struct jrecord jrec, int16_t rectype, void ptr, int bytes)
	1089	{
	1090	jrecord_write(jrec, rectype, bytes);
	1091	jrecord_data(jrec, ptr, bytes);
	1092	}
	1093
	1094	/*
	1095	* Write a leaf record out and return a pointer to its base. The leaf
	1096	* record may contain potentially megabytes of data which is supplied
	1097	* in jrecord_data() calls. The exact amount must be specified in this
	1098	* call.
	1099	*
	1100	* THE RETURNED SUBRECORD POINTER IS ONLY VALID IMMEDIATELY AFTER THE
	1101	* CALL AND MAY BECOME INVALID AT ANY TIME. ONLY THE PUSH/POP CODE SHOULD
	1102	* USE THE RETURN VALUE.
	1103	*/
	1104	static
	1105	struct journal_subrecord *
	1106	jrecord_write(struct jrecord *jrec, int16_t rectype, int bytes)
	1107	{
	1108	struct journal_subrecord *last;
	1109	int pusheditout;
	1110
	1111	/*
	1112	* Try to catch some obvious errors. Nesting records must specify a
	1113	* size of 0, and there should be no left-overs from previous operations
	1114	* (such as incomplete data writeouts).
	1115	*/
	1116	KKASSERT(bytes == 0 \|\| (rectype & JMASK_NESTED) == 0);
	1117	KKASSERT(jrec->residual == 0);
	1118
	1119	/*
	1120	* Check to see if the current stream record has enough room for
	1121	* the new subrecord header. If it doesn't we extend the current
	1122	* stream record.
	1123	*
	1124	* This may have the side effect of pushing out the current stream record
	1125	* and creating a new one. We must adjust our stream tracking fields
	1126	* accordingly.
	1127	*/
	1128	if (jrec->stream_residual < sizeof(struct journal_subrecord)) {
	1129	jrec->stream_ptr = journal_extend(jrec->jo, &jrec->rawp,
	1130	jrec->stream_reserved - jrec->stream_residual,
	1131	JREC_DEFAULTSIZE, &pusheditout);
	1132	if (pusheditout) {
	1133	/*
	1134	* If a pushout occured, the pushed out stream record was
	1135	* truncated as specified and the new record is exactly the
	1136	* extension size specified.
	1137	*/
	1138	jrec->stream_reserved = JREC_DEFAULTSIZE;
	1139	jrec->stream_residual = JREC_DEFAULTSIZE;
	1140	jrec->parent = NULL; /* no longer accessible */
	1141	jrec->pushptrgood = 0; /* restored parents in pops no good */
	1142	} else {
	1143	/*
	1144	* If no pushout occured the stream record is NOT truncated and
	1145	* IS extended.
	1146	*/
	1147	jrec->stream_reserved += JREC_DEFAULTSIZE;
	1148	jrec->stream_residual += JREC_DEFAULTSIZE;
	1149	}
	1150	}
	1151	last = (void *)jrec->stream_ptr;
	1152	last->rectype = rectype;
	1153	last->reserved = 0;
	1154	last->recsize = sizeof(struct journal_subrecord) + bytes;
	1155	jrec->last = last;
	1156	jrec->residual = bytes; /* remaining data to be posted */
	1157	jrec->residual_align = -bytes & 7; /* post-data alignment required */
	1158	jrec->stream_ptr += sizeof(last); / current write pointer */
	1159	jrec->stream_residual -= sizeof(last); / space remaining in stream */
	1160	return(last);
	1161	}
	1162
	1163	/*
	1164	* Write out the data associated with a leaf record. Any number of calls
	1165	* to this routine may be made as long as the byte count adds up to the
	1166	* amount originally specified in jrecord_write().
	1167	*
	1168	* The act of writing out the leaf data may result in numerous stream records
	1169	* being pushed out. Callers should be aware that even the associated
	1170	* subrecord header may become inaccessible due to stream record pushouts.
	1171	*/
	1172	static void
	1173	jrecord_data(struct jrecord jrec, const void buf, int bytes)
	1174	{
	1175	int pusheditout;
	1176	int extsize;
	1177
	1178	KKASSERT(bytes >= 0 && bytes <= jrec->residual);
	1179
	1180	/*
	1181	* Push out stream records as long as there is insufficient room to hold
	1182	* the remaining data.
	1183	*/
	1184	while (jrec->stream_residual < bytes) {
	1185	/*
	1186	* Fill in any remaining space in the current stream record.
	1187	*/
	1188	bcopy(buf, jrec->stream_ptr, jrec->stream_residual);
	1189	buf = (const char *)buf + jrec->stream_residual;
	1190	bytes -= jrec->stream_residual;
	1191	/jrec->stream_ptr += jrec->stream_residual;/
	1192	jrec->residual -= jrec->stream_residual;
	1193	jrec->stream_residual = 0;
	1194
	1195	/*
	1196	* Try to extend the current stream record, but no more then 1/4
	1197	* the size of the FIFO.
	1198	*/
	1199	extsize = jrec->jo->fifo.size >> 2;
	1200	if (extsize > bytes)
	1201	extsize = (bytes + 15) & ~15;
	1202
	1203	jrec->stream_ptr = journal_extend(jrec->jo, &jrec->rawp,
	1204	jrec->stream_reserved - jrec->stream_residual,
	1205	extsize, &pusheditout);
	1206	if (pusheditout) {
	1207	jrec->stream_reserved = extsize;
	1208	jrec->stream_residual = extsize;
	1209	jrec->parent = NULL; /* no longer accessible */
	1210	jrec->last = NULL; /* no longer accessible */
	1211	jrec->pushptrgood = 0; /* restored parents in pops no good */
	1212	} else {
	1213	jrec->stream_reserved += extsize;
	1214	jrec->stream_residual += extsize;
	1215	}
	1216	}
	1217
	1218	/*
	1219	* Push out any remaining bytes into the current stream record.
	1220	*/
	1221	if (bytes) {
	1222	bcopy(buf, jrec->stream_ptr, bytes);
	1223	jrec->stream_ptr += bytes;
	1224	jrec->stream_residual -= bytes;
	1225	jrec->residual -= bytes;
	1226	}
	1227
	1228	/*
	1229	* Handle data alignment requirements for the subrecord. Because the
	1230	* stream record's data space is more strictly aligned, it must already
	1231	* have sufficient space to hold any subrecord alignment slop.
	1232	*/
	1233	if (jrec->residual == 0 && jrec->residual_align) {
	1234	KKASSERT(jrec->residual_align <= jrec->stream_residual);
	1235	bzero(jrec->stream_ptr, jrec->residual_align);
	1236	jrec->stream_ptr += jrec->residual_align;
	1237	jrec->stream_residual -= jrec->residual_align;
	1238	jrec->residual_align = 0;
	1239	}
	1240	}
	1241
	1242	/*
	1243	* We are finished with the transaction. This closes the transaction created
	1244	* by jrecord_init().
	1245	*
	1246	* NOTE: If abortit is not set then we must be at the top level with no
	1247	* residual subrecord data left to output.
	1248	*
	1249	* If abortit is set then we can be in any state, all pushes will be
	1250	* popped and it is ok for there to be residual data. This works
	1251	* because the virtual stream itself is truncated. Scanners must deal
	1252	* with this situation.
	1253	*
	1254	* The stream record will be committed or aborted as specified and jrecord
	1255	* resources will be cleaned up.
	1256	*/
	1257	static void
	1258	jrecord_done(struct jrecord *jrec, int abortit)
	1259	{
	1260	KKASSERT(jrec->rawp != NULL);
	1261
	1262	if (abortit) {
	1263	journal_abort(jrec->jo, &jrec->rawp);
	1264	} else {
	1265	KKASSERT(jrec->pushcount == 0 && jrec->residual == 0);
	1266	journal_commit(jrec->jo, &jrec->rawp,
	1267	jrec->stream_reserved - jrec->stream_residual, 1);
	1268	}
	1269
	1270	/*
	1271	* jrec should not be used beyond this point without another init,
	1272	* but clean up some fields to ensure that we panic if it is.
	1273	*
	1274	* Note that jrec->rawp is NULLd out by journal_abort/journal_commit.
	1275	*/
	1276	jrec->jo = NULL;
	1277	jrec->stream_ptr = NULL;
	1278	}
	1279
	1280	/************************************************************************
	1281	* LOW LEVEL RECORD SUPPORT ROUTINES *
	1282	************************************************************************
	1283	*
	1284	* These routines create low level recursive and leaf subrecords representing
	1285	* common filesystem structures.
	1286	*/
	1287
	1288	/*
	1289	* Write out a filename path relative to the base of the mount point.
	1290	* rectype is typically JLEAF_PATH{1,2,3,4}.
	1291	*/
	1292	static void
	1293	jrecord_write_path(struct jrecord jrec, int16_t rectype, struct namecache ncp)
	1294	{
	1295	char buf[64]; /* local buffer if it fits, else malloced */
	1296	char *base;
	1297	int pathlen;
	1298	int index;
	1299	struct namecache *scan;
	1300
	1301	/*
	1302	* Pass 1 - figure out the number of bytes required. Include terminating
	1303	* \0 on last element and '/' separator on other elements.
	1304	*/
	1305	again:
	1306	pathlen = 0;
	1307	for (scan = ncp;
	1308	scan && (scan->nc_flag & NCF_MOUNTPT) == 0;
	1309	scan = scan->nc_parent
	1310	) {
	1311	pathlen += scan->nc_nlen + 1;
	1312	}
	1313
	1314	if (pathlen <= sizeof(buf))
	1315	base = buf;
	1316	else
	1317	base = malloc(pathlen, M_TEMP, M_INTWAIT);
	1318
	1319	/*
	1320	* Pass 2 - generate the path buffer
	1321	*/
	1322	index = pathlen;
	1323	for (scan = ncp;
	1324	scan && (scan->nc_flag & NCF_MOUNTPT) == 0;
	1325	scan = scan->nc_parent
	1326	) {
	1327	if (scan->nc_nlen >= index) {
	1328	if (base != buf)
	1329	free(base, M_TEMP);
	1330	goto again;
	1331	}
	1332	if (index == pathlen)
	1333	base[--index] = 0;
	1334	else
	1335	base[--index] = '/';
	1336	index -= scan->nc_nlen;
	1337	bcopy(scan->nc_name, base + index, scan->nc_nlen);
	1338	}
	1339	jrecord_leaf(jrec, rectype, base + index, pathlen - index);
	1340	if (base != buf)
	1341	free(base, M_TEMP);
	1342	}
	1343
	1344	/*
	1345	* Write out a file attribute structure. While somewhat inefficient, using
	1346	* a recursive data structure is the most portable and extensible way.
	1347	*/
	1348	static void
	1349	jrecord_write_vattr(struct jrecord jrec, struct vattr vat)
	1350	{
	1351	void *save;
	1352
	1353	save = jrecord_push(jrec, JTYPE_VATTR);
	1354	if (vat->va_type != VNON)
	1355	jrecord_leaf(jrec, JLEAF_UID, &vat->va_type, sizeof(vat->va_type));
	1356	if (vat->va_uid != VNOVAL)
	1357	jrecord_leaf(jrec, JLEAF_UID, &vat->va_mode, sizeof(vat->va_mode));
	1358	if (vat->va_nlink != VNOVAL)
	1359	jrecord_leaf(jrec, JLEAF_NLINK, &vat->va_nlink, sizeof(vat->va_nlink));
	1360	if (vat->va_uid != VNOVAL)
	1361	jrecord_leaf(jrec, JLEAF_UID, &vat->va_uid, sizeof(vat->va_uid));
	1362	if (vat->va_gid != VNOVAL)
	1363	jrecord_leaf(jrec, JLEAF_GID, &vat->va_gid, sizeof(vat->va_gid));
	1364	if (vat->va_fsid != VNOVAL)
	1365	jrecord_leaf(jrec, JLEAF_FSID, &vat->va_fsid, sizeof(vat->va_fsid));
	1366	if (vat->va_fileid != VNOVAL)
	1367	jrecord_leaf(jrec, JLEAF_INUM, &vat->va_fileid, sizeof(vat->va_fileid));
	1368	if (vat->va_size != VNOVAL)
	1369	jrecord_leaf(jrec, JLEAF_SIZE, &vat->va_size, sizeof(vat->va_size));
	1370	if (vat->va_atime.tv_sec != VNOVAL)
	1371	jrecord_leaf(jrec, JLEAF_ATIME, &vat->va_atime, sizeof(vat->va_atime));
	1372	if (vat->va_mtime.tv_sec != VNOVAL)
	1373	jrecord_leaf(jrec, JLEAF_MTIME, &vat->va_mtime, sizeof(vat->va_mtime));
	1374	if (vat->va_ctime.tv_sec != VNOVAL)
	1375	jrecord_leaf(jrec, JLEAF_CTIME, &vat->va_ctime, sizeof(vat->va_ctime));
	1376	if (vat->va_gen != VNOVAL)
	1377	jrecord_leaf(jrec, JLEAF_GEN, &vat->va_gen, sizeof(vat->va_gen));
	1378	if (vat->va_flags != VNOVAL)
	1379	jrecord_leaf(jrec, JLEAF_FLAGS, &vat->va_flags, sizeof(vat->va_flags));
	1380	if (vat->va_rdev != VNOVAL)
	1381	jrecord_leaf(jrec, JLEAF_UDEV, &vat->va_rdev, sizeof(vat->va_rdev));
	1382	#if 0
	1383	if (vat->va_filerev != VNOVAL)
	1384	jrecord_leaf(jrec, JLEAF_FILEREV, &vat->va_filerev, sizeof(vat->va_filerev));
	1385	#endif
	1386	jrecord_pop(jrec, save);
	1387	}
	1388
	1389	/*
	1390	* Write out the creds used to issue a file operation. If a process is
	1391	* available write out additional tracking information related to the
	1392	* process.
	1393	*
	1394	* XXX additional tracking info
	1395	* XXX tty line info
	1396	*/
	1397	static void
	1398	jrecord_write_cred(struct jrecord jrec, struct thread td, struct ucred *cred)
	1399	{
	1400	void *save;
	1401	struct proc *p;
	1402
	1403	save = jrecord_push(jrec, JTYPE_CRED);
	1404	jrecord_leaf(jrec, JLEAF_UID, &cred->cr_uid, sizeof(cred->cr_uid));
	1405	jrecord_leaf(jrec, JLEAF_GID, &cred->cr_gid, sizeof(cred->cr_gid));
	1406	if (td && (p = td->td_proc) != NULL) {
	1407	jrecord_leaf(jrec, JLEAF_PID, &p->p_pid, sizeof(p->p_pid));
	1408	jrecord_leaf(jrec, JLEAF_COMM, p->p_comm, sizeof(p->p_comm));
	1409	}
	1410	jrecord_pop(jrec, save);
	1411	}
	1412
	1413	/*
	1414	* Write out information required to identify a vnode
	1415	*
	1416	* XXX this needs work. We should write out the inode number as well,
	1417	* and in fact avoid writing out the file path for seqential writes
	1418	* occuring within e.g. a certain period of time.
	1419	*/
	1420	static void
	1421	jrecord_write_vnode_ref(struct jrecord jrec, struct vnode vp)
	1422	{
	1423	struct namecache *ncp;
	1424
	1425	TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
	1426	if ((ncp->nc_flag & (NCF_UNRESOLVED\|NCF_DESTROYED)) == 0)
	1427	break;
	1428	}
	1429	if (ncp)
	1430	jrecord_write_path(jrec, JLEAF_PATH_REF, ncp);
	1431	}
	1432
	1433	#if 0
	1434	/*
	1435	* Write out the current contents of the file within the specified
	1436	* range. This is typically called from within an UNDO section. A
	1437	* locked vnode must be passed.
	1438	*/
	1439	static int
	1440	jrecord_write_filearea(struct jrecord jrec, struct vnode vp,
	1441	off_t begoff, off_t endoff)
	1442	{
	1443	}
	1444	#endif
	1445
	1446	/*
	1447	* Write out the data represented by a pagelist
	1448	*/
	1449	static void
	1450	jrecord_write_pagelist(struct jrecord *jrec, int16_t rectype,
	1451	struct vm_page *pglist, int rtvals, int pgcount,
	1452	off_t offset)
	1453	{
	1454	struct msf_buf *msf;
	1455	int error;
	1456	int b;
	1457	int i;
	1458
	1459	i = 0;
	1460	while (i < pgcount) {
	1461	/*
	1462	* Find the next valid section. Skip any invalid elements
	1463	*/
	1464	if (rtvals[i] != VM_PAGER_OK) {
	1465	++i;
	1466	offset += PAGE_SIZE;
	1467	continue;
	1468	}
	1469
	1470	/*
	1471	* Figure out how big the valid section is, capping I/O at what the
	1472	* MSFBUF can represent.
	1473	*/
	1474	b = i;
	1475	while (i < pgcount && i - b != XIO_INTERNAL_PAGES &&
	1476	rtvals[i] == VM_PAGER_OK
	1477	) {
	1478	++i;
	1479	}
	1480
	1481	/*
	1482	* And write it out.
	1483	*/
	1484	if (i - b) {
	1485	error = msf_map_pagelist(&msf, pglist + b, i - b, 0);
	1486	if (error == 0) {
	1487	printf("RECORD PUTPAGES %d\n", msf_buf_bytes(msf));
	1488	jrecord_leaf(jrec, JLEAF_SEEKPOS, &offset, sizeof(offset));
	1489	jrecord_leaf(jrec, rectype,
	1490	msf_buf_kva(msf), msf_buf_bytes(msf));
	1491	msf_buf_free(msf);
	1492	} else {
	1493	printf("jrecord_write_pagelist: mapping failure\n");
	1494	}
	1495	offset += (off_t)(i - b) << PAGE_SHIFT;
	1496	}
	1497	}
	1498	}
	1499
	1500	/*
	1501	* Write out the data represented by a UIO.
	1502	*/
	1503	struct jwuio_info {
	1504	struct jrecord *jrec;
	1505	int16_t rectype;
	1506	};
	1507
	1508	static int jrecord_write_uio_callback(void info, char buf, int bytes);
	1509
	1510	static void
	1511	jrecord_write_uio(struct jrecord jrec, int16_t rectype, struct uio uio)
	1512	{
	1513	struct jwuio_info info = { jrec, rectype };
	1514	int error;
	1515
	1516	if (uio->uio_segflg != UIO_NOCOPY) {
	1517	jrecord_leaf(jrec, JLEAF_SEEKPOS, &uio->uio_offset,
	1518	sizeof(uio->uio_offset));
	1519	error = msf_uio_iterate(uio, jrecord_write_uio_callback, &info);
	1520	if (error)
	1521	printf("XXX warning uio iterate failed %d\n", error);
	1522	}
	1523	}
	1524
	1525	static int
	1526	jrecord_write_uio_callback(void info_arg, char buf, int bytes)
	1527	{
	1528	struct jwuio_info *info = info_arg;
	1529
	1530	jrecord_leaf(info->jrec, info->rectype, buf, bytes);
	1531	return(0);
	1532	}
	1533
	1534	/************************************************************************
	1535	* JOURNAL VNOPS *
	1536	************************************************************************
	1537	*
	1538	* These are function shims replacing the normal filesystem ops. We become
	1539	* responsible for calling the underlying filesystem ops. We have the choice
	1540	* of executing the underlying op first and then generating the journal entry,
	1541	* or starting the journal entry, executing the underlying op, and then
	1542	* either completing or aborting it.
	1543	*
	1544	* The journal is supposed to be a high-level entity, which generally means
	1545	* identifying files by name rather than by inode. Supplying both allows
	1546	* the journal to be used both for inode-number-compatible 'mirrors' and
	1547	* for simple filesystem replication.
	1548	*
	1549	* Writes are particularly difficult to deal with because a single write may
	1550	* represent a hundred megabyte buffer or more, and both writes and truncations
	1551	* require the 'old' data to be written out as well as the new data if the
	1552	* log is reversible. Other issues:
	1553	*
	1554	* - How to deal with operations on unlinked files (no path available),
	1555	* but which may still be filesystem visible due to hard links.
	1556	*
	1557	* - How to deal with modifications made via a memory map.
	1558	*
	1559	* - Future cache coherency support will require cache coherency API calls
	1560	* both prior to and after the call to the underlying VFS.
	1561	*
	1562	* ALSO NOTE: We do not have to shim compatibility VOPs like MKDIR which have
	1563	* new VFS equivalents (NMKDIR).
	1564	*/
	1565
	1566	/*
	1567	* Journal vop_settattr { a_vp, a_vap, a_cred, a_td }
	1568	*/
	1569	static
	1570	int
	1571	journal_setattr(struct vop_setattr_args *ap)
	1572	{
	1573	struct mount *mp;
	1574	struct journal *jo;
	1575	struct jrecord jrec;
	1576	void save; / warning, save pointers do not always remain valid */
	1577	int error;
	1578
	1579	error = vop_journal_operate_ap(&ap->a_head);
	1580	mp = ap->a_head.a_ops->vv_mount;
	1581	if (error == 0) {
	1582	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	1583	jrecord_init(jo, &jrec, -1);
	1584	save = jrecord_push(&jrec, JTYPE_SETATTR);
	1585	jrecord_write_cred(&jrec, ap->a_td, ap->a_cred);
	1586	jrecord_write_vnode_ref(&jrec, ap->a_vp);
	1587	jrecord_write_vattr(&jrec, ap->a_vap);
	1588	jrecord_pop(&jrec, save);
	1589	jrecord_done(&jrec, 0);
	1590	}
	1591	}
	1592	return (error);
	1593	}
	1594
	1595	/*
	1596	* Journal vop_write { a_vp, a_uio, a_ioflag, a_cred }
	1597	*/
	1598	static
	1599	int
	1600	journal_write(struct vop_write_args *ap)
	1601	{
	1602	struct mount *mp;
	1603	struct journal *jo;
	1604	struct jrecord jrec;
	1605	struct uio uio_copy;
	1606	struct iovec uio_one_iovec;
	1607	void save; / warning, save pointers do not always remain valid */
	1608	int error;
	1609
	1610	/*
	1611	* This is really nasty. UIO's don't retain sufficient information to
	1612	* be reusable once they've gone through the VOP chain. The iovecs get
	1613	* cleared, so we have to copy the UIO.
	1614	*
	1615	* XXX fix the UIO code to not destroy iov's during a scan so we can
	1616	* reuse the uio over and over again.
	1617	*/
	1618	uio_copy = *ap->a_uio;
	1619	if (uio_copy.uio_iovcnt == 1) {
	1620	uio_one_iovec = ap->a_uio->uio_iov[0];
	1621	uio_copy.uio_iov = &uio_one_iovec;
	1622	} else {
	1623	uio_copy.uio_iov = malloc(uio_copy.uio_iovcnt * sizeof(struct iovec),
	1624	M_JOURNAL, M_WAITOK);
	1625	bcopy(ap->a_uio->uio_iov, uio_copy.uio_iov,
	1626	uio_copy.uio_iovcnt * sizeof(struct iovec));
	1627	}
	1628
	1629	error = vop_journal_operate_ap(&ap->a_head);
	1630	mp = ap->a_head.a_ops->vv_mount;
	1631	if (error == 0) {
	1632	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	1633	jrecord_init(jo, &jrec, -1);
	1634	save = jrecord_push(&jrec, JTYPE_WRITE);
	1635	jrecord_write_cred(&jrec, NULL, ap->a_cred);
	1636	jrecord_write_vnode_ref(&jrec, ap->a_vp);
	1637	jrecord_write_uio(&jrec, JLEAF_FILEDATA, &uio_copy);
	1638	jrecord_pop(&jrec, save);
	1639	jrecord_done(&jrec, 0);
	1640	}
	1641	}
	1642
	1643	if (uio_copy.uio_iov != &uio_one_iovec)
	1644	free(uio_copy.uio_iov, M_JOURNAL);
	1645
	1646
	1647	return (error);
	1648	}
	1649
	1650	/*
	1651	* Journal vop_fsync { a_vp, a_waitfor, a_td }
	1652	*/
	1653	static
	1654	int
	1655	journal_fsync(struct vop_fsync_args *ap)
	1656	{
	1657	struct mount *mp;
	1658	struct journal *jo;
	1659	int error;
	1660
	1661	error = vop_journal_operate_ap(&ap->a_head);
	1662	mp = ap->a_head.a_ops->vv_mount;
	1663	if (error == 0) {
	1664	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	1665	/* XXX synchronize pending journal records */
	1666	}
	1667	}
	1668	return (error);
	1669	}
	1670
	1671	/*
	1672	* Journal vop_putpages { a_vp, a_m, a_count, a_sync, a_rtvals, a_offset }
	1673	*
	1674	* note: a_count is in bytes.
	1675	*/
	1676	static
	1677	int
	1678	journal_putpages(struct vop_putpages_args *ap)
	1679	{
	1680	struct mount *mp;
	1681	struct journal *jo;
	1682	struct jrecord jrec;
	1683	void save; / warning, save pointers do not always remain valid */
	1684	int error;
	1685
	1686	error = vop_journal_operate_ap(&ap->a_head);
	1687	mp = ap->a_head.a_ops->vv_mount;
	1688	if (error == 0 && ap->a_count > 0) {
	1689	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	1690	jrecord_init(jo, &jrec, -1);
	1691	save = jrecord_push(&jrec, JTYPE_PUTPAGES);
	1692	jrecord_write_vnode_ref(&jrec, ap->a_vp);
	1693	jrecord_write_pagelist(&jrec, JLEAF_FILEDATA,
	1694	ap->a_m, ap->a_rtvals, btoc(ap->a_count), ap->a_offset);
	1695	jrecord_pop(&jrec, save);
	1696	jrecord_done(&jrec, 0);
	1697	}
	1698	}
	1699	return (error);
	1700	}
	1701
	1702	/*
	1703	* Journal vop_setacl { a_vp, a_type, a_aclp, a_cred, a_td }
	1704	*/
	1705	static
	1706	int
	1707	journal_setacl(struct vop_setacl_args *ap)
	1708	{
	1709	struct mount *mp;
	1710	struct journal *jo;
	1711	struct jrecord jrec;
	1712	void save; / warning, save pointers do not always remain valid */
	1713	int error;
	1714
	1715	error = vop_journal_operate_ap(&ap->a_head);
	1716	mp = ap->a_head.a_ops->vv_mount;
	1717	if (error == 0) {
	1718	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	1719	jrecord_init(jo, &jrec, -1);
	1720	save = jrecord_push(&jrec, JTYPE_SETACL);
	1721	jrecord_write_cred(&jrec, ap->a_td, ap->a_cred);
	1722	jrecord_write_vnode_ref(&jrec, ap->a_vp);
	1723	/* XXX type, aclp */
	1724	jrecord_pop(&jrec, save);
	1725	jrecord_done(&jrec, 0);
	1726	}
	1727	}
	1728	return (error);
	1729	}
	1730
	1731	/*
	1732	* Journal vop_setextattr { a_vp, a_name, a_uio, a_cred, a_td }
	1733	*/
	1734	static
	1735	int
	1736	journal_setextattr(struct vop_setextattr_args *ap)
	1737	{
	1738	struct mount *mp;
	1739	struct journal *jo;
	1740	struct jrecord jrec;
	1741	void save; / warning, save pointers do not always remain valid */
	1742	int error;
	1743
	1744	error = vop_journal_operate_ap(&ap->a_head);
	1745	mp = ap->a_head.a_ops->vv_mount;
	1746	if (error == 0) {
	1747	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	1748	jrecord_init(jo, &jrec, -1);
	1749	save = jrecord_push(&jrec, JTYPE_SETEXTATTR);
	1750	jrecord_write_cred(&jrec, ap->a_td, ap->a_cred);
	1751	jrecord_write_vnode_ref(&jrec, ap->a_vp);
	1752	jrecord_leaf(&jrec, JLEAF_ATTRNAME, ap->a_name, strlen(ap->a_name));
	1753	jrecord_write_uio(&jrec, JLEAF_FILEDATA, ap->a_uio);
	1754	jrecord_pop(&jrec, save);
	1755	jrecord_done(&jrec, 0);
	1756	}
	1757	}
	1758	return (error);
	1759	}
	1760
	1761	/*
	1762	* Journal vop_ncreate { a_ncp, a_vpp, a_cred, a_vap }
	1763	*/
	1764	static
	1765	int
	1766	journal_ncreate(struct vop_ncreate_args *ap)
	1767	{
	1768	struct mount *mp;
	1769	struct journal *jo;
	1770	struct jrecord jrec;
	1771	void save; / warning, save pointers do not always remain valid */
	1772	int error;
	1773
	1774	error = vop_journal_operate_ap(&ap->a_head);
	1775	mp = ap->a_head.a_ops->vv_mount;
	1776	if (error == 0) {
	1777	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	1778	jrecord_init(jo, &jrec, -1);
	1779	save = jrecord_push(&jrec, JTYPE_CREATE);
	1780	jrecord_write_cred(&jrec, NULL, ap->a_cred);
	1781	jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp);
	1782	if (*ap->a_vpp)
	1783	jrecord_write_vnode_ref(&jrec, *ap->a_vpp);
	1784	jrecord_pop(&jrec, save);
	1785	jrecord_done(&jrec, 0);
	1786	}
	1787	}
	1788	return (error);
	1789	}
	1790
	1791	/*
	1792	* Journal vop_nmknod { a_ncp, a_vpp, a_cred, a_vap }
	1793	*/
	1794	static
	1795	int
	1796	journal_nmknod(struct vop_nmknod_args *ap)
	1797	{
	1798	struct mount *mp;
	1799	struct journal *jo;
	1800	struct jrecord jrec;
	1801	void save; / warning, save pointers do not always remain valid */
	1802	int error;
	1803
	1804	error = vop_journal_operate_ap(&ap->a_head);
	1805	mp = ap->a_head.a_ops->vv_mount;
	1806	if (error == 0) {
	1807	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	1808	jrecord_init(jo, &jrec, -1);
	1809	save = jrecord_push(&jrec, JTYPE_MKNOD);
	1810	jrecord_write_cred(&jrec, NULL, ap->a_cred);
	1811	jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp);
	1812	jrecord_write_vattr(&jrec, ap->a_vap);
	1813	if (*ap->a_vpp)
	1814	jrecord_write_vnode_ref(&jrec, *ap->a_vpp);
	1815	jrecord_pop(&jrec, save);
	1816	jrecord_done(&jrec, 0);
	1817	}
	1818	}
	1819	return (error);
	1820	}
	1821
	1822	/*
	1823	* Journal vop_nlink { a_ncp, a_vp, a_cred }
	1824	*/
	1825	static
	1826	int
	1827	journal_nlink(struct vop_nlink_args *ap)
	1828	{
	1829	struct mount *mp;
	1830	struct journal *jo;
	1831	struct jrecord jrec;
	1832	void save; / warning, save pointers do not always remain valid */
	1833	int error;
	1834
	1835	error = vop_journal_operate_ap(&ap->a_head);
	1836	mp = ap->a_head.a_ops->vv_mount;
	1837	if (error == 0) {
	1838	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	1839	jrecord_init(jo, &jrec, -1);
	1840	save = jrecord_push(&jrec, JTYPE_LINK);
	1841	jrecord_write_cred(&jrec, NULL, ap->a_cred);
	1842	jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp);
	1843	jrecord_write_vnode_ref(&jrec, ap->a_vp);
	1844	/* XXX PATH to VP and inode number */
	1845	jrecord_pop(&jrec, save);
	1846	jrecord_done(&jrec, 0);
	1847	}
	1848	}
	1849	return (error);
	1850	}
	1851
	1852	/*
	1853	* Journal vop_symlink { a_ncp, a_vpp, a_cred, a_vap, a_target }
	1854	*/
	1855	static
	1856	int
	1857	journal_nsymlink(struct vop_nsymlink_args *ap)
	1858	{
	1859	struct mount *mp;
	1860	struct journal *jo;
	1861	struct jrecord jrec;
	1862	void save; / warning, save pointers do not always remain valid */
	1863	int error;
	1864
	1865	error = vop_journal_operate_ap(&ap->a_head);
	1866	mp = ap->a_head.a_ops->vv_mount;
	1867	if (error == 0) {
	1868	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	1869	jrecord_init(jo, &jrec, -1);
	1870	save = jrecord_push(&jrec, JTYPE_SYMLINK);
	1871	jrecord_write_cred(&jrec, NULL, ap->a_cred);
	1872	jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp);
	1873	jrecord_leaf(&jrec, JLEAF_SYMLINKDATA,
	1874	ap->a_target, strlen(ap->a_target));
	1875	if (*ap->a_vpp)
	1876	jrecord_write_vnode_ref(&jrec, *ap->a_vpp);
	1877	jrecord_pop(&jrec, save);
	1878	jrecord_done(&jrec, 0);
	1879	}
	1880	}
	1881	return (error);
	1882	}
	1883
	1884	/*
	1885	* Journal vop_nwhiteout { a_ncp, a_cred, a_flags }
	1886	*/
	1887	static
	1888	int
	1889	journal_nwhiteout(struct vop_nwhiteout_args *ap)
	1890	{
	1891	struct mount *mp;
	1892	struct journal *jo;
	1893	struct jrecord jrec;
	1894	void save; / warning, save pointers do not always remain valid */
	1895	int error;
	1896
	1897	error = vop_journal_operate_ap(&ap->a_head);
	1898	mp = ap->a_head.a_ops->vv_mount;
	1899	if (error == 0) {
	1900	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	1901	jrecord_init(jo, &jrec, -1);
	1902	save = jrecord_push(&jrec, JTYPE_WHITEOUT);
	1903	jrecord_write_cred(&jrec, NULL, ap->a_cred);
	1904	jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp);
	1905	jrecord_pop(&jrec, save);
	1906	jrecord_done(&jrec, 0);
	1907	}
	1908	}
	1909	return (error);
	1910	}
	1911
	1912	/*
	1913	* Journal vop_nremove { a_ncp, a_cred }
	1914	*/
	1915	static
	1916	int
	1917	journal_nremove(struct vop_nremove_args *ap)
	1918	{
	1919	struct mount *mp;
	1920	struct journal *jo;
	1921	struct jrecord jrec;
	1922	void save; / warning, save pointers do not always remain valid */
	1923	int error;
	1924
	1925	error = vop_journal_operate_ap(&ap->a_head);
	1926	mp = ap->a_head.a_ops->vv_mount;
	1927	if (error == 0) {
	1928	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	1929	jrecord_init(jo, &jrec, -1);
	1930	save = jrecord_push(&jrec, JTYPE_REMOVE);
	1931	jrecord_write_cred(&jrec, NULL, ap->a_cred);
	1932	jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp);
	1933	jrecord_pop(&jrec, save);
	1934	jrecord_done(&jrec, 0);
	1935	}
	1936	}
	1937	return (error);
	1938	}
	1939
	1940	/*
	1941	* Journal vop_nmkdir { a_ncp, a_vpp, a_cred, a_vap }
	1942	*/
	1943	static
	1944	int
	1945	journal_nmkdir(struct vop_nmkdir_args *ap)
	1946	{
	1947	struct mount *mp;
	1948	struct journal *jo;
	1949	struct jrecord jrec;
	1950	void save; / warning, save pointers do not always remain valid */
	1951	int error;
	1952
	1953	error = vop_journal_operate_ap(&ap->a_head);
	1954	mp = ap->a_head.a_ops->vv_mount;
	1955	if (error == 0) {
	1956	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	1957	jrecord_init(jo, &jrec, -1);
	1958	if (jo->flags & MC_JOURNAL_WANT_REVERSABLE) {
	1959	save = jrecord_push(&jrec, JTYPE_UNDO);
	1960	/* XXX undo operations */
	1961	jrecord_pop(&jrec, save);
	1962	}
	1963	#if 0
	1964	if (jo->flags & MC_JOURNAL_WANT_AUDIT) {
	1965	jrecord_write_audit(&jrec);
	1966	}
	1967	#endif
	1968	save = jrecord_push(&jrec, JTYPE_MKDIR);
	1969	jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp);
	1970	jrecord_write_cred(&jrec, NULL, ap->a_cred);
	1971	jrecord_write_vattr(&jrec, ap->a_vap);
	1972	jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp);
	1973	if (*ap->a_vpp)
	1974	jrecord_write_vnode_ref(&jrec, *ap->a_vpp);
	1975	jrecord_pop(&jrec, save);
	1976	jrecord_done(&jrec, 0);
	1977	}
	1978	}
	1979	return (error);
	1980	}
	1981
	1982	/*
	1983	* Journal vop_nrmdir { a_ncp, a_cred }
	1984	*/
	1985	static
	1986	int
	1987	journal_nrmdir(struct vop_nrmdir_args *ap)
	1988	{
	1989	struct mount *mp;
	1990	struct journal *jo;
	1991	struct jrecord jrec;
	1992	void save; / warning, save pointers do not always remain valid */
	1993	int error;
	1994
	1995	error = vop_journal_operate_ap(&ap->a_head);
	1996	mp = ap->a_head.a_ops->vv_mount;
	1997	if (error == 0) {
	1998	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	1999	jrecord_init(jo, &jrec, -1);
	2000	save = jrecord_push(&jrec, JTYPE_RMDIR);
	2001	jrecord_write_cred(&jrec, NULL, ap->a_cred);
	2002	jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp);
	2003	jrecord_pop(&jrec, save);
	2004	jrecord_done(&jrec, 0);
	2005	}
	2006	}
	2007	return (error);
	2008	}
	2009
	2010	/*
	2011	* Journal vop_nrename { a_fncp, a_tncp, a_cred }
	2012	*/
	2013	static
	2014	int
	2015	journal_nrename(struct vop_nrename_args *ap)
	2016	{
	2017	struct mount *mp;
	2018	struct journal *jo;
	2019	struct jrecord jrec;
	2020	void save; / warning, save pointers do not always remain valid */
	2021	int error;
	2022
	2023	error = vop_journal_operate_ap(&ap->a_head);
	2024	mp = ap->a_head.a_ops->vv_mount;
	2025	if (error == 0) {
	2026	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	2027	jrecord_init(jo, &jrec, -1);
	2028	save = jrecord_push(&jrec, JTYPE_RENAME);
	2029	jrecord_write_cred(&jrec, NULL, ap->a_cred);
	2030	jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_fncp);
	2031	jrecord_write_path(&jrec, JLEAF_PATH2, ap->a_tncp);
	2032	jrecord_pop(&jrec, save);
	2033	jrecord_done(&jrec, 0);
	2034	}
	2035	}
	2036	return (error);
	2037	}
	2038