gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2004 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* $DragonFly: src/sys/kern/vfs_jops.c,v 1.19 2005/08/24 20:28:31 dillon Exp $
	35	*/
	36	/*
	37	* Each mount point may have zero or more independently configured journals
	38	* attached to it. Each journal is represented by a memory FIFO and worker
	39	* thread. Journal events are streamed through the FIFO to the thread,
	40	* batched up (typically on one-second intervals), and written out by the
	41	* thread.
	42	*
	43	* Journal vnode ops are executed instead of mnt_vn_norm_ops when one or
	44	* more journals have been installed on a mount point. It becomes the
	45	* responsibility of the journal op to call the underlying normal op as
	46	* appropriate.
	47	*
	48	* The journaling protocol is intended to evolve into a two-way stream
	49	* whereby transaction IDs can be acknowledged by the journaling target
	50	* when the data has been committed to hard storage. Both implicit and
	51	* explicit acknowledgement schemes will be supported, depending on the
	52	* sophistication of the journaling stream, plus resynchronization and
	53	* restart when a journaling stream is interrupted. This information will
	54	* also be made available to journaling-aware filesystems to allow better
	55	* management of their own physical storage synchronization mechanisms as
	56	* well as to allow such filesystems to take direct advantage of the kernel's
	57	* journaling layer so they don't have to roll their own.
	58	*
	59	* In addition, the worker thread will have access to much larger
	60	* spooling areas than the memory buffer is able to provide by e.g.
	61	* reserving swap space, in order to absorb potentially long interruptions
	62	* of off-site journaling streams, and to prevent 'slow' off-site linkages
	63	* from radically slowing down local filesystem operations.
	64	*
	65	* Because of the non-trivial algorithms the journaling system will be
	66	* required to support, use of a worker thread is mandatory. Efficiencies
	67	* are maintained by utilizing the memory FIFO to batch transactions when
	68	* possible, reducing the number of gratuitous thread switches and taking
	69	* advantage of cpu caches through the use of shorter batched code paths
	70	* rather than trying to do everything in the context of the process
	71	* originating the filesystem op. In the future the memory FIFO can be
	72	* made per-cpu to remove BGL or other locking requirements.
	73	*/
	74	#include <sys/param.h>
	75	#include <sys/systm.h>
	76	#include <sys/buf.h>
	77	#include <sys/conf.h>
	78	#include <sys/kernel.h>
	79	#include <sys/queue.h>
	80	#include <sys/lock.h>
	81	#include <sys/malloc.h>
	82	#include <sys/mount.h>
	83	#include <sys/unistd.h>
	84	#include <sys/vnode.h>
	85	#include <sys/poll.h>
	86	#include <sys/mountctl.h>
	87	#include <sys/journal.h>
	88	#include <sys/file.h>
	89	#include <sys/proc.h>
	90	#include <sys/msfbuf.h>
	91	#include <sys/socket.h>
	92	#include <sys/socketvar.h>
	93
	94	#include <machine/limits.h>
	95
	96	#include <vm/vm.h>
	97	#include <vm/vm_object.h>
	98	#include <vm/vm_page.h>
	99	#include <vm/vm_pager.h>
	100	#include <vm/vnode_pager.h>
	101
	102	#include <sys/file2.h>
	103	#include <sys/thread2.h>
	104
	105	static int journal_attach(struct mount *mp);
	106	static void journal_detach(struct mount *mp);
	107	static int journal_install_vfs_journal(struct mount mp, struct file fp,
	108	const struct mountctl_install_journal *info);
	109	static int journal_restart_vfs_journal(struct mount mp, struct file fp,
	110	const struct mountctl_restart_journal *info);
	111	static int journal_remove_vfs_journal(struct mount *mp,
	112	const struct mountctl_remove_journal *info);
	113	static int journal_restart(struct mount mp, struct file fp,
	114	struct journal *jo, int flags);
	115	static int journal_destroy(struct mount mp, struct journal jo, int flags);
	116	static int journal_resync_vfs_journal(struct mount mp, const void ctl);
	117	static int journal_status_vfs_journal(struct mount *mp,
	118	const struct mountctl_status_journal *info,
	119	struct mountctl_journal_ret_status *rstat,
	120	int buflen, int *res);
	121	static void journal_create_threads(struct journal *jo);
	122	static void journal_destroy_threads(struct journal *jo, int flags);
	123	static void journal_wthread(void *info);
	124	static void journal_rthread(void *info);
	125
	126	static void journal_reserve(struct journal jo,
	127	struct journal_rawrecbeg **rawpp,
	128	int16_t streamid, int bytes);
	129	static void journal_extend(struct journal jo,
	130	struct journal_rawrecbeg **rawpp,
	131	int truncbytes, int bytes, int *newstreamrecp);
	132	static void journal_abort(struct journal *jo,
	133	struct journal_rawrecbeg **rawpp);
	134	static void journal_commit(struct journal *jo,
	135	struct journal_rawrecbeg **rawpp,
	136	int bytes, int closeout);
	137
	138	static void jrecord_init(struct journal *jo,
	139	struct jrecord *jrec, int16_t streamid);
	140	static struct journal_subrecord *jrecord_push(
	141	struct jrecord *jrec, int16_t rectype);
	142	static void jrecord_pop(struct jrecord jrec, struct journal_subrecord parent);
	143	static struct journal_subrecord jrecord_write(struct jrecord jrec,
	144	int16_t rectype, int bytes);
	145	static void jrecord_data(struct jrecord jrec, const void buf, int bytes);
	146	static void jrecord_done(struct jrecord *jrec, int abortit);
	147	static void jrecord_undo_file(struct jrecord jrec, struct vnode vp,
	148	int jrflags, off_t off, off_t bytes);
	149
	150	static int journal_setattr(struct vop_setattr_args *ap);
	151	static int journal_write(struct vop_write_args *ap);
	152	static int journal_fsync(struct vop_fsync_args *ap);
	153	static int journal_putpages(struct vop_putpages_args *ap);
	154	static int journal_setacl(struct vop_setacl_args *ap);
	155	static int journal_setextattr(struct vop_setextattr_args *ap);
	156	static int journal_ncreate(struct vop_ncreate_args *ap);
	157	static int journal_nmknod(struct vop_nmknod_args *ap);
	158	static int journal_nlink(struct vop_nlink_args *ap);
	159	static int journal_nsymlink(struct vop_nsymlink_args *ap);
	160	static int journal_nwhiteout(struct vop_nwhiteout_args *ap);
	161	static int journal_nremove(struct vop_nremove_args *ap);
	162	static int journal_nmkdir(struct vop_nmkdir_args *ap);
	163	static int journal_nrmdir(struct vop_nrmdir_args *ap);
	164	static int journal_nrename(struct vop_nrename_args *ap);
	165
	166	#define JRUNDO_SIZE 0x00000001
	167	#define JRUNDO_UID 0x00000002
	168	#define JRUNDO_GID 0x00000004
	169	#define JRUNDO_FSID 0x00000008
	170	#define JRUNDO_MODES 0x00000010
	171	#define JRUNDO_INUM 0x00000020
	172	#define JRUNDO_ATIME 0x00000040
	173	#define JRUNDO_MTIME 0x00000080
	174	#define JRUNDO_CTIME 0x00000100
	175	#define JRUNDO_GEN 0x00000200
	176	#define JRUNDO_FLAGS 0x00000400
	177	#define JRUNDO_UDEV 0x00000800
	178	#define JRUNDO_FILEDATA 0x00010000
	179	#define JRUNDO_GETVP 0x00020000
	180	#define JRUNDO_CONDLINK 0x00040000 /* write file data if link count 1 */
	181	#define JRUNDO_VATTR (JRUNDO_SIZE\|JRUNDO_UID\|JRUNDO_GID\|JRUNDO_FSID\|\
	182	JRUNDO_MODES\|JRUNDO_INUM\|JRUNDO_ATIME\|JRUNDO_MTIME\|\
	183	JRUNDO_CTIME\|JRUNDO_GEN\|JRUNDO_FLAGS\|JRUNDO_UDEV)
	184	#define JRUNDO_ALL (JRUNDO_VATTR\|JRUNDO_FILEDATA)
	185
	186	static struct vnodeopv_entry_desc journal_vnodeop_entries[] = {
	187	{ &vop_default_desc, vop_journal_operate_ap },
	188	{ &vop_mountctl_desc, (void *)journal_mountctl },
	189	{ &vop_setattr_desc, (void *)journal_setattr },
	190	{ &vop_write_desc, (void *)journal_write },
	191	{ &vop_fsync_desc, (void *)journal_fsync },
	192	{ &vop_putpages_desc, (void *)journal_putpages },
	193	{ &vop_setacl_desc, (void *)journal_setacl },
	194	{ &vop_setextattr_desc, (void *)journal_setextattr },
	195	{ &vop_ncreate_desc, (void *)journal_ncreate },
	196	{ &vop_nmknod_desc, (void *)journal_nmknod },
	197	{ &vop_nlink_desc, (void *)journal_nlink },
	198	{ &vop_nsymlink_desc, (void *)journal_nsymlink },
	199	{ &vop_nwhiteout_desc, (void *)journal_nwhiteout },
	200	{ &vop_nremove_desc, (void *)journal_nremove },
	201	{ &vop_nmkdir_desc, (void *)journal_nmkdir },
	202	{ &vop_nrmdir_desc, (void *)journal_nrmdir },
	203	{ &vop_nrename_desc, (void *)journal_nrename },
	204	{ NULL, NULL }
	205	};
	206
	207	static MALLOC_DEFINE(M_JOURNAL, "journal", "Journaling structures");
	208	static MALLOC_DEFINE(M_JFIFO, "journal-fifo", "Journal FIFO");
	209
	210	int
	211	journal_mountctl(struct vop_mountctl_args *ap)
	212	{
	213	struct mount *mp;
	214	int error = 0;
	215
	216	mp = ap->a_head.a_ops->vv_mount;
	217	KKASSERT(mp);
	218
	219	if (mp->mnt_vn_journal_ops == NULL) {
	220	switch(ap->a_op) {
	221	case MOUNTCTL_INSTALL_VFS_JOURNAL:
	222	error = journal_attach(mp);
	223	if (error == 0 && ap->a_ctllen != sizeof(struct mountctl_install_journal))
	224	error = EINVAL;
	225	if (error == 0 && ap->a_fp == NULL)
	226	error = EBADF;
	227	if (error == 0)
	228	error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl);
	229	if (TAILQ_EMPTY(&mp->mnt_jlist))
	230	journal_detach(mp);
	231	break;
	232	case MOUNTCTL_RESTART_VFS_JOURNAL:
	233	case MOUNTCTL_REMOVE_VFS_JOURNAL:
	234	case MOUNTCTL_RESYNC_VFS_JOURNAL:
	235	case MOUNTCTL_STATUS_VFS_JOURNAL:
	236	error = ENOENT;
	237	break;
	238	default:
	239	error = EOPNOTSUPP;
	240	break;
	241	}
	242	} else {
	243	switch(ap->a_op) {
	244	case MOUNTCTL_INSTALL_VFS_JOURNAL:
	245	if (ap->a_ctllen != sizeof(struct mountctl_install_journal))
	246	error = EINVAL;
	247	if (error == 0 && ap->a_fp == NULL)
	248	error = EBADF;
	249	if (error == 0)
	250	error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl);
	251	break;
	252	case MOUNTCTL_RESTART_VFS_JOURNAL:
	253	if (ap->a_ctllen != sizeof(struct mountctl_restart_journal))
	254	error = EINVAL;
	255	if (error == 0 && ap->a_fp == NULL)
	256	error = EBADF;
	257	if (error == 0)
	258	error = journal_restart_vfs_journal(mp, ap->a_fp, ap->a_ctl);
	259	break;
	260	case MOUNTCTL_REMOVE_VFS_JOURNAL:
	261	if (ap->a_ctllen != sizeof(struct mountctl_remove_journal))
	262	error = EINVAL;
	263	if (error == 0)
	264	error = journal_remove_vfs_journal(mp, ap->a_ctl);
	265	if (TAILQ_EMPTY(&mp->mnt_jlist))
	266	journal_detach(mp);
	267	break;
	268	case MOUNTCTL_RESYNC_VFS_JOURNAL:
	269	if (ap->a_ctllen != 0)
	270	error = EINVAL;
	271	error = journal_resync_vfs_journal(mp, ap->a_ctl);
	272	break;
	273	case MOUNTCTL_STATUS_VFS_JOURNAL:
	274	if (ap->a_ctllen != sizeof(struct mountctl_status_journal))
	275	error = EINVAL;
	276	if (error == 0) {
	277	error = journal_status_vfs_journal(mp, ap->a_ctl,
	278	ap->a_buf, ap->a_buflen, ap->a_res);
	279	}
	280	break;
	281	default:
	282	error = EOPNOTSUPP;
	283	break;
	284	}
	285	}
	286	return (error);
	287	}
	288
	289	/*
	290	* High level mount point setup. When a
	291	*/
	292	static int
	293	journal_attach(struct mount *mp)
	294	{
	295	vfs_add_vnodeops(mp, &mp->mnt_vn_journal_ops, journal_vnodeop_entries);
	296	return(0);
	297	}
	298
	299	static void
	300	journal_detach(struct mount *mp)
	301	{
	302	if (mp->mnt_vn_journal_ops)
	303	vfs_rm_vnodeops(&mp->mnt_vn_journal_ops);
	304	}
	305
	306	/*
	307	* Install a journal on a mount point. Each journal has an associated worker
	308	* thread which is responsible for buffering and spooling the data to the
	309	* target. A mount point may have multiple journals attached to it. An
	310	* initial start record is generated when the journal is associated.
	311	*/
	312	static int
	313	journal_install_vfs_journal(struct mount mp, struct file fp,
	314	const struct mountctl_install_journal *info)
	315	{
	316	struct journal *jo;
	317	struct jrecord jrec;
	318	int error = 0;
	319	int size;
	320
	321	jo = malloc(sizeof(struct journal), M_JOURNAL, M_WAITOK\|M_ZERO);
	322	bcopy(info->id, jo->id, sizeof(jo->id));
	323	jo->flags = info->flags & ~(MC_JOURNAL_WACTIVE \| MC_JOURNAL_RACTIVE \|
	324	MC_JOURNAL_STOP_REQ);
	325
	326	/*
	327	* Memory FIFO size, round to nearest power of 2
	328	*/
	329	if (info->membufsize) {
	330	if (info->membufsize < 65536)
	331	size = 65536;
	332	else if (info->membufsize > 128 * 1024 * 1024)
	333	size = 128 * 1024 * 1024;
	334	else
	335	size = (int)info->membufsize;
	336	} else {
	337	size = 1024 * 1024;
	338	}
	339	jo->fifo.size = 1;
	340	while (jo->fifo.size < size)
	341	jo->fifo.size <<= 1;
	342
	343	/*
	344	* Other parameters. If not specified the starting transaction id
	345	* will be the current date.
	346	*/
	347	if (info->transid) {
	348	jo->transid = info->transid;
	349	} else {
	350	struct timespec ts;
	351	getnanotime(&ts);
	352	jo->transid = ((int64_t)ts.tv_sec << 30) \| ts.tv_nsec;
	353	}
	354
	355	jo->fp = fp;
	356
	357	/*
	358	* Allocate the memory FIFO
	359	*/
	360	jo->fifo.mask = jo->fifo.size - 1;
	361	jo->fifo.membase = malloc(jo->fifo.size, M_JFIFO, M_WAITOK\|M_ZERO\|M_NULLOK);
	362	if (jo->fifo.membase == NULL)
	363	error = ENOMEM;
	364
	365	/*
	366	* Create the worker threads and generate the association record.
	367	*/
	368	if (error) {
	369	free(jo, M_JOURNAL);
	370	} else {
	371	fhold(fp);
	372	journal_create_threads(jo);
	373	jrecord_init(jo, &jrec, JREC_STREAMID_DISCONT);
	374	jrecord_write(&jrec, JTYPE_ASSOCIATE, 0);
	375	jrecord_done(&jrec, 0);
	376	TAILQ_INSERT_TAIL(&mp->mnt_jlist, jo, jentry);
	377	}
	378	return(error);
	379	}
	380
	381	/*
	382	* Restart a journal with a new descriptor. The existing reader and writer
	383	* threads are terminated and a new descriptor is associated with the
	384	* journal. The FIFO rindex is reset to xindex and the threads are then
	385	* restarted.
	386	*/
	387	static int
	388	journal_restart_vfs_journal(struct mount mp, struct file fp,
	389	const struct mountctl_restart_journal *info)
	390	{
	391	struct journal *jo;
	392	int error;
	393
	394	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	395	if (bcmp(jo->id, info->id, sizeof(jo->id)) == 0)
	396	break;
	397	}
	398	if (jo)
	399	error = journal_restart(mp, fp, jo, info->flags);
	400	else
	401	error = EINVAL;
	402	return (error);
	403	}
	404
	405	static int
	406	journal_restart(struct mount mp, struct file fp,
	407	struct journal *jo, int flags)
	408	{
	409	/*
	410	* XXX lock the jo
	411	*/
	412
	413	#if 0
	414	/*
	415	* Record the fact that we are doing a restart in the journal.
	416	* XXX it isn't safe to do this if the journal is being restarted
	417	* because it was locked up and the writer thread has already exited.
	418	*/
	419	jrecord_init(jo, &jrec, JREC_STREAMID_RESTART);
	420	jrecord_write(&jrec, JTYPE_DISASSOCIATE, 0);
	421	jrecord_done(&jrec, 0);
	422	#endif
	423
	424	/*
	425	* Stop the reader and writer threads and clean up the current
	426	* descriptor.
	427	*/
	428	printf("RESTART WITH FP %p KILLING %p\n", fp, jo->fp);
	429	journal_destroy_threads(jo, flags);
	430
	431	if (jo->fp)
	432	fdrop(jo->fp, curthread);
	433
	434	/*
	435	* Associate the new descriptor, reset the FIFO index, and recreate
	436	* the threads.
	437	*/
	438	fhold(fp);
	439	jo->fp = fp;
	440	jo->fifo.rindex = jo->fifo.xindex;
	441	journal_create_threads(jo);
	442
	443	return(0);
	444	}
	445
	446	/*
	447	* Disassociate a journal from a mount point and terminate its worker thread.
	448	* A final termination record is written out before the file pointer is
	449	* dropped.
	450	*/
	451	static int
	452	journal_remove_vfs_journal(struct mount *mp,
	453	const struct mountctl_remove_journal *info)
	454	{
	455	struct journal *jo;
	456	int error;
	457
	458	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	459	if (bcmp(jo->id, info->id, sizeof(jo->id)) == 0)
	460	break;
	461	}
	462	if (jo)
	463	error = journal_destroy(mp, jo, info->flags);
	464	else
	465	error = EINVAL;
	466	return (error);
	467	}
	468
	469	/*
	470	* Remove all journals associated with a mount point. Usually called
	471	* by the umount code.
	472	*/
	473	void
	474	journal_remove_all_journals(struct mount *mp, int flags)
	475	{
	476	struct journal *jo;
	477
	478	while ((jo = TAILQ_FIRST(&mp->mnt_jlist)) != NULL) {
	479	journal_destroy(mp, jo, flags);
	480	}
	481	}
	482
	483	static int
	484	journal_destroy(struct mount mp, struct journal jo, int flags)
	485	{
	486	struct jrecord jrec;
	487
	488	TAILQ_REMOVE(&mp->mnt_jlist, jo, jentry);
	489
	490	jrecord_init(jo, &jrec, JREC_STREAMID_DISCONT);
	491	jrecord_write(&jrec, JTYPE_DISASSOCIATE, 0);
	492	jrecord_done(&jrec, 0);
	493
	494	journal_destroy_threads(jo, flags);
	495
	496	if (jo->fp)
	497	fdrop(jo->fp, curthread);
	498	if (jo->fifo.membase)
	499	free(jo->fifo.membase, M_JFIFO);
	500	free(jo, M_JOURNAL);
	501	return(0);
	502	}
	503
	504	static int
	505	journal_resync_vfs_journal(struct mount mp, const void ctl)
	506	{
	507	return(EINVAL);
	508	}
	509
	510	static int
	511	journal_status_vfs_journal(struct mount *mp,
	512	const struct mountctl_status_journal *info,
	513	struct mountctl_journal_ret_status *rstat,
	514	int buflen, int *res)
	515	{
	516	struct journal *jo;
	517	int error = 0;
	518	int index;
	519
	520	index = 0;
	521	*res = 0;
	522	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	523	if (info->index == MC_JOURNAL_INDEX_ID) {
	524	if (bcmp(jo->id, info->id, sizeof(jo->id)) != 0)
	525	continue;
	526	} else if (info->index >= 0) {
	527	if (info->index < index)
	528	continue;
	529	} else if (info->index != MC_JOURNAL_INDEX_ALL) {
	530	continue;
	531	}
	532	if (buflen < sizeof(*rstat)) {
	533	if (*res)
	534	rstat[-1].flags \|= MC_JOURNAL_STATUS_MORETOCOME;
	535	else
	536	error = EINVAL;
	537	break;
	538	}
	539	bzero(rstat, sizeof(*rstat));
	540	rstat->recsize = sizeof(*rstat);
	541	bcopy(jo->id, rstat->id, sizeof(jo->id));
	542	rstat->index = index;
	543	rstat->membufsize = jo->fifo.size;
	544	rstat->membufused = jo->fifo.windex - jo->fifo.xindex;
	545	rstat->membufunacked = jo->fifo.rindex - jo->fifo.xindex;
	546	rstat->bytessent = jo->total_acked;
	547	rstat->fifostalls = jo->fifostalls;
	548	++rstat;
	549	++index;
	550	res += sizeof(rstat);
	551	buflen -= sizeof(*rstat);
	552	}
	553	return(error);
	554	}
	555
	556	static void
	557	journal_create_threads(struct journal *jo)
	558	{
	559	jo->flags &= ~(MC_JOURNAL_STOP_REQ \| MC_JOURNAL_STOP_IMM);
	560	jo->flags \|= MC_JOURNAL_WACTIVE;
	561	lwkt_create(journal_wthread, jo, NULL, &jo->wthread,
	562	TDF_STOPREQ, -1, "journal w:%.*s", JIDMAX, jo->id);
	563	lwkt_setpri(&jo->wthread, TDPRI_KERN_DAEMON);
	564	lwkt_schedule(&jo->wthread);
	565
	566	if (jo->flags & MC_JOURNAL_WANT_FULLDUPLEX) {
	567	jo->flags \|= MC_JOURNAL_RACTIVE;
	568	lwkt_create(journal_rthread, jo, NULL, &jo->rthread,
	569	TDF_STOPREQ, -1, "journal r:%.*s", JIDMAX, jo->id);
	570	lwkt_setpri(&jo->rthread, TDPRI_KERN_DAEMON);
	571	lwkt_schedule(&jo->rthread);
	572	}
	573	}
	574
	575	static void
	576	journal_destroy_threads(struct journal *jo, int flags)
	577	{
	578	int wcount;
	579
	580	jo->flags \|= MC_JOURNAL_STOP_REQ \| (flags & MC_JOURNAL_STOP_IMM);
	581	wakeup(&jo->fifo);
	582	wcount = 0;
	583	while (jo->flags & (MC_JOURNAL_WACTIVE \| MC_JOURNAL_RACTIVE)) {
	584	tsleep(jo, 0, "jwait", hz);
	585	if (++wcount % 10 == 0) {
	586	printf("Warning: journal %s waiting for descriptors to close\n",
	587	jo->id);
	588	}
	589	}
	590
	591	/*
	592	* XXX SMP - threads should move to cpu requesting the restart or
	593	* termination before finishing up to properly interlock.
	594	*/
	595	tsleep(jo, 0, "jwait", hz);
	596	lwkt_free_thread(&jo->wthread);
	597	if (jo->flags & MC_JOURNAL_WANT_FULLDUPLEX)
	598	lwkt_free_thread(&jo->rthread);
	599	}
	600
	601	/*
	602	* The per-journal worker thread is responsible for writing out the
	603	* journal's FIFO to the target stream.
	604	*/
	605	static void
	606	journal_wthread(void *info)
	607	{
	608	struct journal *jo = info;
	609	struct journal_rawrecbeg *rawp;
	610	int bytes;
	611	int error;
	612	int avail;
	613	int res;
	614
	615	for (;;) {
	616	/*
	617	* Calculate the number of bytes available to write. This buffer
	618	* area may contain reserved records so we can't just write it out
	619	* without further checks.
	620	*/
	621	bytes = jo->fifo.windex - jo->fifo.rindex;
	622
	623	/*
	624	* sleep if no bytes are available or if an incomplete record is
	625	* encountered (it needs to be filled in before we can write it
	626	* out), and skip any pad records that we encounter.
	627	*/
	628	if (bytes == 0) {
	629	if (jo->flags & MC_JOURNAL_STOP_REQ)
	630	break;
	631	tsleep(&jo->fifo, 0, "jfifo", hz);
	632	continue;
	633	}
	634
	635	/*
	636	* Sleep if we can not go any further due to hitting an incomplete
	637	* record. This case should occur rarely but may have to be better
	638	* optimized XXX.
	639	*/
	640	rawp = (void *)(jo->fifo.membase + (jo->fifo.rindex & jo->fifo.mask));
	641	if (rawp->begmagic == JREC_INCOMPLETEMAGIC) {
	642	tsleep(&jo->fifo, 0, "jpad", hz);
	643	continue;
	644	}
	645
	646	/*
	647	* Skip any pad records. We do not write out pad records if we can
	648	* help it.
	649	*/
	650	if (rawp->streamid == JREC_STREAMID_PAD) {
	651	if ((jo->flags & MC_JOURNAL_WANT_FULLDUPLEX) == 0) {
	652	if (jo->fifo.rindex == jo->fifo.xindex) {
	653	jo->fifo.xindex += (rawp->recsize + 15) & ~15;
	654	jo->total_acked += (rawp->recsize + 15) & ~15;
	655	}
	656	}
	657	jo->fifo.rindex += (rawp->recsize + 15) & ~15;
	658	jo->total_acked += bytes;
	659	KKASSERT(jo->fifo.windex - jo->fifo.rindex >= 0);
	660	continue;
	661	}
	662
	663	/*
	664	* 'bytes' is the amount of data that can potentially be written out.
	665	* Calculate 'res', the amount of data that can actually be written
	666	* out. res is bounded either by hitting the end of the physical
	667	* memory buffer or by hitting an incomplete record. Incomplete
	668	* records often occur due to the way the space reservation model
	669	* works.
	670	*/
	671	res = 0;
	672	avail = jo->fifo.size - (jo->fifo.rindex & jo->fifo.mask);
	673	while (res < bytes && rawp->begmagic == JREC_BEGMAGIC) {
	674	res += (rawp->recsize + 15) & ~15;
	675	if (res >= avail) {
	676	KKASSERT(res == avail);
	677	break;
	678	}
	679	rawp = (void )((char )rawp + ((rawp->recsize + 15) & ~15));
	680	}
	681
	682	/*
	683	* Issue the write and deal with any errors or other conditions.
	684	* For now assume blocking I/O. Since we are record-aware the
	685	* code cannot yet handle partial writes.
	686	*
	687	* We bump rindex prior to issuing the write to avoid racing
	688	* the acknowledgement coming back (which could prevent the ack
	689	* from bumping xindex). Restarts are always based on xindex so
	690	* we do not try to undo the rindex if an error occurs.
	691	*
	692	* XXX EWOULDBLOCK/NBIO
	693	* XXX notification on failure
	694	* XXX permanent verses temporary failures
	695	* XXX two-way acknowledgement stream in the return direction / xindex
	696	*/
	697	bytes = res;
	698	jo->fifo.rindex += bytes;
	699	error = fp_write(jo->fp,
	700	jo->fifo.membase + ((jo->fifo.rindex - bytes) & jo->fifo.mask),
	701	bytes, &res);
	702	if (error) {
	703	printf("journal_thread(%s) write, error %d\n", jo->id, error);
	704	/* XXX */
	705	} else {
	706	KKASSERT(res == bytes);
	707	}
	708
	709	/*
	710	* Advance rindex. If the journal stream is not full duplex we also
	711	* advance xindex, otherwise the rjournal thread is responsible for
	712	* advancing xindex.
	713	*/
	714	if ((jo->flags & MC_JOURNAL_WANT_FULLDUPLEX) == 0) {
	715	jo->fifo.xindex += bytes;
	716	jo->total_acked += bytes;
	717	}
	718	KKASSERT(jo->fifo.windex - jo->fifo.rindex >= 0);
	719	if ((jo->flags & MC_JOURNAL_WANT_FULLDUPLEX) == 0) {
	720	if (jo->flags & MC_JOURNAL_WWAIT) {
	721	jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */
	722	wakeup(&jo->fifo.windex);
	723	}
	724	}
	725	}
	726	fp_shutdown(jo->fp, SHUT_WR);
	727	jo->flags &= ~MC_JOURNAL_WACTIVE;
	728	wakeup(jo);
	729	wakeup(&jo->fifo.windex);
	730	}
	731
	732	/*
	733	* A second per-journal worker thread is created for two-way journaling
	734	* streams to deal with the return acknowledgement stream.
	735	*/
	736	static void
	737	journal_rthread(void *info)
	738	{
	739	struct journal_rawrecbeg *rawp;
	740	struct journal_ackrecord ack;
	741	struct journal *jo = info;
	742	int64_t transid;
	743	int error;
	744	int count;
	745	int bytes;
	746
	747	transid = 0;
	748	error = 0;
	749
	750	for (;;) {
	751	/*
	752	* We have been asked to stop
	753	*/
	754	if (jo->flags & MC_JOURNAL_STOP_REQ)
	755	break;
	756
	757	/*
	758	* If we have no active transaction id, get one from the return
	759	* stream.
	760	*/
	761	if (transid == 0) {
	762	error = fp_read(jo->fp, &ack, sizeof(ack), &count, 1);
	763	#if 0
	764	printf("fp_read ack error %d count %d\n", error, count);
	765	#endif
	766	if (error \|\| count != sizeof(ack))
	767	break;
	768	if (error) {
	769	printf("read error %d on receive stream\n", error);
	770	break;
	771	}
	772	if (ack.rbeg.begmagic != JREC_BEGMAGIC \|\|
	773	ack.rend.endmagic != JREC_ENDMAGIC
	774	) {
	775	printf("bad begmagic or endmagic on receive stream\n");
	776	break;
	777	}
	778	transid = ack.rbeg.transid;
	779	}
	780
	781	/*
	782	* Calculate the number of unacknowledged bytes. If there are no
	783	* unacknowledged bytes then unsent data was acknowledged, report,
	784	* sleep a bit, and loop in that case. This should not happen
	785	* normally. The ack record is thrown away.
	786	*/
	787	bytes = jo->fifo.rindex - jo->fifo.xindex;
	788
	789	if (bytes == 0) {
	790	printf("warning: unsent data acknowledged transid %08llx\n", transid);
	791	tsleep(&jo->fifo.xindex, 0, "jrseq", hz);
	792	transid = 0;
	793	continue;
	794	}
	795
	796	/*
	797	* Since rindex has advanced, the record pointed to by xindex
	798	* must be a valid record.
	799	*/
	800	rawp = (void *)(jo->fifo.membase + (jo->fifo.xindex & jo->fifo.mask));
	801	KKASSERT(rawp->begmagic == JREC_BEGMAGIC);
	802	KKASSERT(rawp->recsize <= bytes);
	803
	804	/*
	805	* The target can acknowledge several records at once.
	806	*/
	807	if (rawp->transid < transid) {
	808	#if 1
	809	printf("ackskip %08llx/%08llx\n", rawp->transid, transid);
	810	#endif
	811	jo->fifo.xindex += (rawp->recsize + 15) & ~15;
	812	jo->total_acked += (rawp->recsize + 15) & ~15;
	813	if (jo->flags & MC_JOURNAL_WWAIT) {
	814	jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */
	815	wakeup(&jo->fifo.windex);
	816	}
	817	continue;
	818	}
	819	if (rawp->transid == transid) {
	820	#if 1
	821	printf("ackskip %08llx/%08llx\n", rawp->transid, transid);
	822	#endif
	823	jo->fifo.xindex += (rawp->recsize + 15) & ~15;
	824	jo->total_acked += (rawp->recsize + 15) & ~15;
	825	if (jo->flags & MC_JOURNAL_WWAIT) {
	826	jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */
	827	wakeup(&jo->fifo.windex);
	828	}
	829	transid = 0;
	830	continue;
	831	}
	832	printf("warning: unsent data(2) acknowledged transid %08llx\n", transid);
	833	transid = 0;
	834	}
	835	jo->flags &= ~MC_JOURNAL_RACTIVE;
	836	wakeup(jo);
	837	wakeup(&jo->fifo.windex);
	838	}
	839
	840	/*
	841	* This builds a pad record which the journaling thread will skip over. Pad
	842	* records are required when we are unable to reserve sufficient stream space
	843	* due to insufficient space at the end of the physical memory fifo.
	844	*
	845	* Even though the record is not transmitted, a normal transid must be
	846	* assigned to it so link recovery operations after a failure work properly.
	847	*/
	848	static
	849	void
	850	journal_build_pad(struct journal_rawrecbeg *rawp, int recsize, int64_t transid)
	851	{
	852	struct journal_rawrecend *rendp;
	853
	854	KKASSERT((recsize & 15) == 0 && recsize >= 16);
	855
	856	rawp->streamid = JREC_STREAMID_PAD;
	857	rawp->recsize = recsize; /* must be 16-byte aligned */
	858	rawp->transid = transid;
	859	/*
	860	* WARNING, rendp may overlap rawp->seqno. This is necessary to
	861	* allow PAD records to fit in 16 bytes. Use cpu_ccfence() to
	862	* hopefully cause the compiler to not make any assumptions.
	863	*/
	864	rendp = (void )((char )rawp + rawp->recsize - sizeof(*rendp));
	865	rendp->endmagic = JREC_ENDMAGIC;
	866	rendp->check = 0;
	867	rendp->recsize = rawp->recsize;
	868
	869	/*
	870	* Set the begin magic last. This is what will allow the journal
	871	* thread to write the record out. Use a store fence to prevent
	872	* compiler and cpu reordering of the writes.
	873	*/
	874	cpu_sfence();
	875	rawp->begmagic = JREC_BEGMAGIC;
	876	}
	877
	878	/*
	879	* Wake up the worker thread if the FIFO is more then half full or if
	880	* someone is waiting for space to be freed up. Otherwise let the
	881	* heartbeat deal with it. Being able to avoid waking up the worker
	882	* is the key to the journal's cpu performance.
	883	*/
	884	static __inline
	885	void
	886	journal_commit_wakeup(struct journal *jo)
	887	{
	888	int avail;
	889
	890	avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex);
	891	KKASSERT(avail >= 0);
	892	if ((avail < (jo->fifo.size >> 1)) \|\| (jo->flags & MC_JOURNAL_WWAIT))
	893	wakeup(&jo->fifo);
	894	}
	895
	896	/*
	897	* Create a new BEGIN stream record with the specified streamid and the
	898	* specified amount of payload space. *rawpp will be set to point to the
	899	* base of the new stream record and a pointer to the base of the payload
	900	* space will be returned. *rawpp does not need to be pre-NULLd prior to
	901	* making this call. The raw record header will be partially initialized.
	902	*
	903	* A stream can be extended, aborted, or committed by other API calls
	904	* below. This may result in a sequence of potentially disconnected
	905	* stream records to be output to the journaling target. The first record
	906	* (the one created by this function) will be marked JREC_STREAMCTL_BEGIN,
	907	* while the last record on commit or abort will be marked JREC_STREAMCTL_END
	908	* (and possibly also JREC_STREAMCTL_ABORTED). The last record could wind
	909	* up being the same as the first, in which case the bits are all set in
	910	* the first record.
	911	*
	912	* The stream record is created in an incomplete state by setting the begin
	913	* magic to JREC_INCOMPLETEMAGIC. This prevents the worker thread from
	914	* flushing the fifo past our record until we have finished populating it.
	915	* Other threads can reserve and operate on their own space without stalling
	916	* but the stream output will stall until we have completed operations. The
	917	* memory FIFO is intended to be large enough to absorb such situations
	918	* without stalling out other threads.
	919	*/
	920	static
	921	void *
	922	journal_reserve(struct journal jo, struct journal_rawrecbeg *rawpp,
	923	int16_t streamid, int bytes)
	924	{
	925	struct journal_rawrecbeg *rawp;
	926	int avail;
	927	int availtoend;
	928	int req;
	929
	930	/*
	931	* Add header and trailer overheads to the passed payload. Note that
	932	* the passed payload size need not be aligned in any way.
	933	*/
	934	bytes += sizeof(struct journal_rawrecbeg);
	935	bytes += sizeof(struct journal_rawrecend);
	936
	937	for (;;) {
	938	/*
	939	* First, check boundary conditions. If the request would wrap around
	940	* we have to skip past the ending block and return to the beginning
	941	* of the FIFO's buffer. Calculate 'req' which is the actual number
	942	* of bytes being reserved, including wrap-around dead space.
	943	*
	944	* Neither 'bytes' or 'req' are aligned.
	945	*
	946	* Note that availtoend is not truncated to avail and so cannot be
	947	* used to determine whether the reservation is possible by itself.
	948	* Also, since all fifo ops are 16-byte aligned, we can check
	949	* the size before calculating the aligned size.
	950	*/
	951	availtoend = jo->fifo.size - (jo->fifo.windex & jo->fifo.mask);
	952	KKASSERT((availtoend & 15) == 0);
	953	if (bytes > availtoend)
	954	req = bytes + availtoend; /* add pad to end */
	955	else
	956	req = bytes;
	957
	958	/*
	959	* Next calculate the total available space and see if it is
	960	* sufficient. We cannot overwrite previously buffered data
	961	* past xindex because otherwise we would not be able to restart
	962	* a broken link at the target's last point of commit.
	963	*/
	964	avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex);
	965	KKASSERT(avail >= 0 && (avail & 15) == 0);
	966
	967	if (avail < req) {
	968	/* XXX MC_JOURNAL_STOP_IMM */
	969	jo->flags \|= MC_JOURNAL_WWAIT;
	970	++jo->fifostalls;
	971	tsleep(&jo->fifo.windex, 0, "jwrite", 0);
	972	continue;
	973	}
	974
	975	/*
	976	* Create a pad record for any dead space and create an incomplete
	977	* record for the live space, then return a pointer to the
	978	* contiguous buffer space that was requested.
	979	*
	980	* NOTE: The worker thread will not flush past an incomplete
	981	* record, so the reserved space can be filled in at-will. The
	982	* journaling code must also be aware the reserved sections occuring
	983	* after this one will also not be written out even if completed
	984	* until this one is completed.
	985	*
	986	* The transaction id must accomodate real and potential pad creation.
	987	*/
	988	rawp = (void *)(jo->fifo.membase + (jo->fifo.windex & jo->fifo.mask));
	989	if (req != bytes) {
	990	journal_build_pad(rawp, availtoend, jo->transid);
	991	++jo->transid;
	992	rawp = (void *)jo->fifo.membase;
	993	}
	994	rawp->begmagic = JREC_INCOMPLETEMAGIC; /* updated by abort/commit */
	995	rawp->recsize = bytes; /* (unaligned size) */
	996	rawp->streamid = streamid \| JREC_STREAMCTL_BEGIN;
	997	rawp->transid = jo->transid;
	998	jo->transid += 2;
	999
	1000	/*
	1001	* Issue a memory barrier to guarentee that the record data has been
	1002	* properly initialized before we advance the write index and return
	1003	* a pointer to the reserved record. Otherwise the worker thread
	1004	* could accidently run past us.
	1005	*
	1006	* Note that stream records are always 16-byte aligned.
	1007	*/
	1008	cpu_sfence();
	1009	jo->fifo.windex += (req + 15) & ~15;
	1010	*rawpp = rawp;
	1011	return(rawp + 1);
	1012	}
	1013	/* not reached */
	1014	*rawpp = NULL;
	1015	return(NULL);
	1016	}
	1017
	1018	/*
	1019	* Attempt to extend the stream record by <bytes> worth of payload space.
	1020	*
	1021	* If it is possible to extend the existing stream record no truncation
	1022	* occurs and the record is extended as specified. A pointer to the
	1023	* truncation offset within the payload space is returned.
	1024	*
	1025	* If it is not possible to do this the existing stream record is truncated
	1026	* and committed, and a new stream record of size <bytes> is created. A
	1027	* pointer to the base of the new stream record's payload space is returned.
	1028	*
	1029	* *rawpp is set to the new reservation in the case of a new record but
	1030	* the caller cannot depend on a comparison with the old rawp to determine if
	1031	* this case occurs because we could end up using the same memory FIFO
	1032	* offset for the new stream record. Use *newstreamrecp instead.
	1033	*/
	1034	static void *
	1035	journal_extend(struct journal jo, struct journal_rawrecbeg *rawpp,
	1036	int truncbytes, int bytes, int *newstreamrecp)
	1037	{
	1038	struct journal_rawrecbeg *rawp;
	1039	int16_t streamid;
	1040	int availtoend;
	1041	int avail;
	1042	int osize;
	1043	int nsize;
	1044	int wbase;
	1045	void *rptr;
	1046
	1047	*newstreamrecp = 0;
	1048	rawp = *rawpp;
	1049	osize = (rawp->recsize + 15) & ~15;
	1050	nsize = (rawp->recsize + bytes + 15) & ~15;
	1051	wbase = (char *)rawp - jo->fifo.membase;
	1052
	1053	/*
	1054	* If the aligned record size does not change we can trivially adjust
	1055	* the record size.
	1056	*/
	1057	if (nsize == osize) {
	1058	rawp->recsize += bytes;
	1059	return((char *)(rawp + 1) + truncbytes);
	1060	}
	1061
	1062	/*
	1063	* If the fifo's write index hasn't been modified since we made the
	1064	* reservation and we do not hit any boundary conditions, we can
	1065	* trivially make the record smaller or larger.
	1066	*/
	1067	if ((jo->fifo.windex & jo->fifo.mask) == wbase + osize) {
	1068	availtoend = jo->fifo.size - wbase;
	1069	avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex) + osize;
	1070	KKASSERT((availtoend & 15) == 0);
	1071	KKASSERT((avail & 15) == 0);
	1072	if (nsize <= avail && nsize <= availtoend) {
	1073	jo->fifo.windex += nsize - osize;
	1074	rawp->recsize += bytes;
	1075	return((char *)(rawp + 1) + truncbytes);
	1076	}
	1077	}
	1078
	1079	/*
	1080	* It was not possible to extend the buffer. Commit the current
	1081	* buffer and create a new one. We manually clear the BEGIN mark that
	1082	* journal_reserve() creates (because this is a continuing record, not
	1083	* the start of a new stream).
	1084	*/
	1085	streamid = rawp->streamid & JREC_STREAMID_MASK;
	1086	journal_commit(jo, rawpp, truncbytes, 0);
	1087	rptr = journal_reserve(jo, rawpp, streamid, bytes);
	1088	rawp = *rawpp;
	1089	rawp->streamid &= ~JREC_STREAMCTL_BEGIN;
	1090	*newstreamrecp = 1;
	1091	return(rptr);
	1092	}
	1093
	1094	/*
	1095	* Abort a journal record. If the transaction record represents a stream
	1096	* BEGIN and we can reverse the fifo's write index we can simply reverse
	1097	* index the entire record, as if it were never reserved in the first place.
	1098	*
	1099	* Otherwise we set the JREC_STREAMCTL_ABORTED bit and commit the record
	1100	* with the payload truncated to 0 bytes.
	1101	*/
	1102	static void
	1103	journal_abort(struct journal jo, struct journal_rawrecbeg *rawpp)
	1104	{
	1105	struct journal_rawrecbeg *rawp;
	1106	int osize;
	1107
	1108	rawp = *rawpp;
	1109	osize = (rawp->recsize + 15) & ~15;
	1110
	1111	if ((rawp->streamid & JREC_STREAMCTL_BEGIN) &&
	1112	(jo->fifo.windex & jo->fifo.mask) ==
	1113	(char *)rawp - jo->fifo.membase + osize)
	1114	{
	1115	jo->fifo.windex -= osize;
	1116	*rawpp = NULL;
	1117	} else {
	1118	rawp->streamid \|= JREC_STREAMCTL_ABORTED;
	1119	journal_commit(jo, rawpp, 0, 1);
	1120	}
	1121	}
	1122
	1123	/*
	1124	* Commit a journal record and potentially truncate it to the specified
	1125	* number of payload bytes. If you do not want to truncate the record,
	1126	* simply pass -1 for the bytes parameter. Do not pass rawp->recsize, that
	1127	* field includes header and trailer and will not be correct. Note that
	1128	* passing 0 will truncate the entire data payload of the record.
	1129	*
	1130	* The logical stream is terminated by this function.
	1131	*
	1132	* If truncation occurs, and it is not possible to physically optimize the
	1133	* memory FIFO due to other threads having reserved space after ours,
	1134	* the remaining reserved space will be covered by a pad record.
	1135	*/
	1136	static void
	1137	journal_commit(struct journal jo, struct journal_rawrecbeg *rawpp,
	1138	int bytes, int closeout)
	1139	{
	1140	struct journal_rawrecbeg *rawp;
	1141	struct journal_rawrecend *rendp;
	1142	int osize;
	1143	int nsize;
	1144
	1145	rawp = *rawpp;
	1146	*rawpp = NULL;
	1147
	1148	KKASSERT((char *)rawp >= jo->fifo.membase &&
	1149	(char *)rawp + rawp->recsize <= jo->fifo.membase + jo->fifo.size);
	1150	KKASSERT(((intptr_t)rawp & 15) == 0);
	1151
	1152	/*
	1153	* Truncate the record if necessary. If the FIFO write index as still
	1154	* at the end of our record we can optimally backindex it. Otherwise
	1155	* we have to insert a pad record to cover the dead space.
	1156	*
	1157	* We calculate osize which is the 16-byte-aligned original recsize.
	1158	* We calculate nsize which is the 16-byte-aligned new recsize.
	1159	*
	1160	* Due to alignment issues or in case the passed truncation bytes is
	1161	* the same as the original payload, nsize may be equal to osize even
	1162	* if the committed bytes is less then the originally reserved bytes.
	1163	*/
	1164	if (bytes >= 0) {
	1165	KKASSERT(bytes >= 0 && bytes <= rawp->recsize - sizeof(struct journal_rawrecbeg) - sizeof(struct journal_rawrecend));
	1166	osize = (rawp->recsize + 15) & ~15;
	1167	rawp->recsize = bytes + sizeof(struct journal_rawrecbeg) +
	1168	sizeof(struct journal_rawrecend);
	1169	nsize = (rawp->recsize + 15) & ~15;
	1170	KKASSERT(nsize <= osize);
	1171	if (osize == nsize) {
	1172	/* do nothing */
	1173	} else if ((jo->fifo.windex & jo->fifo.mask) == (char *)rawp - jo->fifo.membase + osize) {
	1174	/* we are able to backindex the fifo */
	1175	jo->fifo.windex -= osize - nsize;
	1176	} else {
	1177	/* we cannot backindex the fifo, emplace a pad in the dead space */
	1178	journal_build_pad((void )((char )rawp + nsize), osize - nsize,
	1179	rawp->transid + 1);
	1180	}
	1181	}
	1182
	1183	/*
	1184	* Fill in the trailer. Note that unlike pad records, the trailer will
	1185	* never overlap the header.
	1186	*/
	1187	rendp = (void )((char )rawp +
	1188	((rawp->recsize + 15) & ~15) - sizeof(*rendp));
	1189	rendp->endmagic = JREC_ENDMAGIC;
	1190	rendp->recsize = rawp->recsize;
	1191	rendp->check = 0; /* XXX check word, disabled for now */
	1192
	1193	/*
	1194	* Fill in begmagic last. This will allow the worker thread to proceed.
	1195	* Use a memory barrier to guarentee write ordering. Mark the stream
	1196	* as terminated if closeout is set. This is the typical case.
	1197	*/
	1198	if (closeout)
	1199	rawp->streamid \|= JREC_STREAMCTL_END;
	1200	cpu_sfence(); /* memory and compiler barrier */
	1201	rawp->begmagic = JREC_BEGMAGIC;
	1202
	1203	journal_commit_wakeup(jo);
	1204	}
	1205
	1206	/************************************************************************
	1207	* PARALLEL TRANSACTION SUPPORT ROUTINES *
	1208	************************************************************************
	1209	*
	1210	* JRECLIST_*() - routines which create and iterate over jrecord structures,
	1211	* because a mount point may have multiple attached journals.
	1212	*/
	1213
	1214	/*
	1215	* Initialize the passed jrecord_list and create a jrecord for each
	1216	* journal we need to write to. Unnecessary mallocs are avoided by
	1217	* using the passed jrecord structure as the first jrecord in the list.
	1218	* A starting transaction is pushed for each jrecord.
	1219	*
	1220	* Returns non-zero if any of the journals require undo records.
	1221	*/
	1222	static
	1223	int
	1224	jreclist_init(struct mount mp, struct jrecord_list jreclist,
	1225	struct jrecord *jreccache, int16_t rectype)
	1226	{
	1227	struct journal *jo;
	1228	struct jrecord *jrec;
	1229	int wantrev = 0;
	1230	int count = 0;
	1231
	1232	TAILQ_INIT(jreclist);
	1233	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	1234	if (count == 0)
	1235	jrec = jreccache;
	1236	else
	1237	jrec = malloc(sizeof(*jrec), M_JOURNAL, M_WAITOK);
	1238	jrecord_init(jo, jrec, -1);
	1239	jrec->user_save = jrecord_push(jrec, rectype);
	1240	TAILQ_INSERT_TAIL(jreclist, jrec, user_entry);
	1241	if (jo->flags & MC_JOURNAL_WANT_REVERSABLE)
	1242	wantrev = 1;
	1243	++count;
	1244	}
	1245	return(wantrev);
	1246	}
	1247
	1248	/*
	1249	* Terminate the journaled transactions started by jreclist_init(). If
	1250	* an error occurred, the transaction records will be aborted.
	1251	*/
	1252	static
	1253	void
	1254	jreclist_done(struct jrecord_list *jreclist, int error)
	1255	{
	1256	struct jrecord *jrec;
	1257	int count;
	1258
	1259	TAILQ_FOREACH(jrec, jreclist, user_entry) {
	1260	jrecord_pop(jrec, jrec->user_save);
	1261	jrecord_done(jrec, error);
	1262	}
	1263	count = 0;
	1264	while ((jrec = TAILQ_FIRST(jreclist)) != NULL) {
	1265	TAILQ_REMOVE(jreclist, jrec, user_entry);
	1266	if (count)
	1267	free(jrec, M_JOURNAL);
	1268	++count;
	1269	}
	1270	}
	1271
	1272	/*
	1273	* This procedure writes out UNDO records for available reversable
	1274	* journals.
	1275	*
	1276	* XXX could use improvement. There is no need to re-read the file
	1277	* for each journal.
	1278	*/
	1279	static
	1280	void
	1281	jreclist_undo_file(struct jrecord_list jreclist, struct vnode vp,
	1282	int jrflags, off_t off, off_t bytes)
	1283	{
	1284	struct jrecord *jrec;
	1285	int error;
	1286
	1287	error = 0;
	1288	if (jrflags & JRUNDO_GETVP)
	1289	error = vget(vp, LK_SHARED, curthread);
	1290	if (error == 0) {
	1291	TAILQ_FOREACH(jrec, jreclist, user_entry) {
	1292	if (jrec->jo->flags & MC_JOURNAL_WANT_REVERSABLE) {
	1293	jrecord_undo_file(jrec, vp, jrflags, off, bytes);
	1294	}
	1295	}
	1296	}
	1297	if (error == 0 && jrflags & JRUNDO_GETVP)
	1298	vput(vp);
	1299	}
	1300
	1301	/************************************************************************
	1302	* TRANSACTION SUPPORT ROUTINES *
	1303	************************************************************************
	1304	*
	1305	* JRECORD_*() - routines to create subrecord transactions and embed them
	1306	* in the logical streams managed by the journal_*() routines.
	1307	*/
	1308
	1309	static int16_t sid = JREC_STREAMID_JMIN;
	1310
	1311	/*
	1312	* Initialize the passed jrecord structure and start a new stream transaction
	1313	* by reserving an initial build space in the journal's memory FIFO.
	1314	*/
	1315	static void
	1316	jrecord_init(struct journal jo, struct jrecord jrec, int16_t streamid)
	1317	{
	1318	bzero(jrec, sizeof(*jrec));
	1319	jrec->jo = jo;
	1320	if (streamid < 0) {
	1321	streamid = sid++; /* XXX need to track stream ids! */
	1322	if (sid == JREC_STREAMID_JMAX)
	1323	sid = JREC_STREAMID_JMIN;
	1324	}
	1325	jrec->streamid = streamid;
	1326	jrec->stream_residual = JREC_DEFAULTSIZE;
	1327	jrec->stream_reserved = jrec->stream_residual;
	1328	jrec->stream_ptr =
	1329	journal_reserve(jo, &jrec->rawp, streamid, jrec->stream_reserved);
	1330	}
	1331
	1332	/*
	1333	* Push a recursive record type. All pushes should have matching pops.
	1334	* The old parent is returned and the newly pushed record becomes the
	1335	* new parent. Note that the old parent's pointer may already be invalid
	1336	* or may become invalid if jrecord_write() had to build a new stream
	1337	* record, so the caller should not mess with the returned pointer in
	1338	* any way other than to save it.
	1339	*/
	1340	static
	1341	struct journal_subrecord *
	1342	jrecord_push(struct jrecord *jrec, int16_t rectype)
	1343	{
	1344	struct journal_subrecord *save;
	1345
	1346	save = jrec->parent;
	1347	jrec->parent = jrecord_write(jrec, rectype\|JMASK_NESTED, 0);
	1348	jrec->last = NULL;
	1349	KKASSERT(jrec->parent != NULL);
	1350	++jrec->pushcount;
	1351	++jrec->pushptrgood; /* cleared on flush */
	1352	return(save);
	1353	}
	1354
	1355	/*
	1356	* Pop a previously pushed sub-transaction. We must set JMASK_LAST
	1357	* on the last record written within the subtransaction. If the last
	1358	* record written is not accessible or if the subtransaction is empty,
	1359	* we must write out a pad record with JMASK_LAST set before popping.
	1360	*
	1361	* When popping a subtransaction the parent record's recsize field
	1362	* will be properly set. If the parent pointer is no longer valid
	1363	* (which can occur if the data has already been flushed out to the
	1364	* stream), the protocol spec allows us to leave it 0.
	1365	*
	1366	* The saved parent pointer which we restore may or may not be valid,
	1367	* and if not valid may or may not be NULL, depending on the value
	1368	* of pushptrgood.
	1369	*/
	1370	static void
	1371	jrecord_pop(struct jrecord jrec, struct journal_subrecord save)
	1372	{
	1373	struct journal_subrecord *last;
	1374
	1375	KKASSERT(jrec->pushcount > 0);
	1376	KKASSERT(jrec->residual == 0);
	1377
	1378	/*
	1379	* Set JMASK_LAST on the last record we wrote at the current
	1380	* level. If last is NULL we either no longer have access to the
	1381	* record or the subtransaction was empty and we must write out a pad
	1382	* record.
	1383	*/
	1384	if ((last = jrec->last) == NULL) {
	1385	jrecord_write(jrec, JLEAF_PAD\|JMASK_LAST, 0);
	1386	last = jrec->last; /* reload after possible flush */
	1387	} else {
	1388	last->rectype \|= JMASK_LAST;
	1389	}
	1390
	1391	/*
	1392	* pushptrgood tells us how many levels of parent record pointers
	1393	* are valid. The jrec only stores the current parent record pointer
	1394	* (and it is only valid if pushptrgood != 0). The higher level parent
	1395	* record pointers are saved by the routines calling jrecord_push() and
	1396	* jrecord_pop(). These pointers may become stale and we determine
	1397	* that fact by tracking the count of valid parent pointers with
	1398	* pushptrgood. Pointers become invalid when their related stream
	1399	* record gets pushed out.
	1400	*
	1401	* If no pointer is available (the data has already been pushed out),
	1402	* then no fixup of e.g. the length field is possible for non-leaf
	1403	* nodes. The protocol allows for this situation by placing a larger
	1404	* burden on the program scanning the stream on the other end.
	1405	*
	1406	* [parentA]
	1407	* [node X]
	1408	* [parentB]
	1409	* [node Y]
	1410	* [node Z]
	1411	* (pop B) see NOTE B
	1412	* (pop A) see NOTE A
	1413	*
	1414	* NOTE B: This pop sets LAST in node Z if the node is still accessible,
	1415	* else a PAD record is appended and LAST is set in that.
	1416	*
	1417	* This pop sets the record size in parentB if parentB is still
	1418	* accessible, else the record size is left 0 (the scanner must
	1419	* deal with that).
	1420	*
	1421	* This pop sets the new 'last' record to parentB, the pointer
	1422	* to which may or may not still be accessible.
	1423	*
	1424	* NOTE A: This pop sets LAST in parentB if the node is still accessible,
	1425	* else a PAD record is appended and LAST is set in that.
	1426	*
	1427	* This pop sets the record size in parentA if parentA is still
	1428	* accessible, else the record size is left 0 (the scanner must
	1429	* deal with that).
	1430	*
	1431	* This pop sets the new 'last' record to parentA, the pointer
	1432	* to which may or may not still be accessible.
	1433	*
	1434	* Also note that the last record in the stream transaction, which in
	1435	* the above example is parentA, does not currently have the LAST bit
	1436	* set.
	1437	*
	1438	* The current parent becomes the last record relative to the
	1439	* saved parent passed into us. It's validity is based on
	1440	* whether pushptrgood is non-zero prior to decrementing. The saved
	1441	* parent becomes the new parent, and its validity is based on whether
	1442	* pushptrgood is non-zero after decrementing.
	1443	*
	1444	* The old jrec->parent may be NULL if it is no longer accessible.
	1445	* If pushptrgood is non-zero, however, it is guarenteed to not
	1446	* be NULL (since no flush occured).
	1447	*/
	1448	jrec->last = jrec->parent;
	1449	--jrec->pushcount;
	1450	if (jrec->pushptrgood) {
	1451	KKASSERT(jrec->last != NULL && last != NULL);
	1452	if (--jrec->pushptrgood == 0) {
	1453	jrec->parent = NULL; /* 'save' contains garbage or NULL */
	1454	} else {
	1455	KKASSERT(save != NULL);
	1456	jrec->parent = save; /* 'save' must not be NULL */
	1457	}
	1458
	1459	/*
	1460	* Set the record size in the old parent. 'last' still points to
	1461	* the original last record in the subtransaction being popped,
	1462	* jrec->last points to the old parent (which became the last
	1463	* record relative to the new parent being popped into).
	1464	*/
	1465	jrec->last->recsize = (char )last + last->recsize - (char )jrec->last;
	1466	} else {
	1467	jrec->parent = NULL;
	1468	KKASSERT(jrec->last == NULL);
	1469	}
	1470	}
	1471
	1472	/*
	1473	* Write out a leaf record, including associated data.
	1474	*/
	/*
	 * jrec    - record being built
	 * rectype - leaf record type
	 * ptr     - data to attach to the leaf
	 * bytes   - number of data bytes at ptr
	 */
	static
	void
	jrecord_leaf(struct jrecord *jrec, int16_t rectype, void *ptr, int bytes)
	{
	    /* header first (declaring the payload size), then the payload */
	    jrecord_write(jrec, rectype, bytes);
	    jrecord_data(jrec, ptr, bytes);
	}
	1482
	1483	/*
	1484	* Write a leaf record out and return a pointer to its base. The leaf
	1485	* record may contain potentially megabytes of data which is supplied
	1486	* in jrecord_data() calls. The exact amount must be specified in this
	1487	* call.
	1488	*
	1489	* THE RETURNED SUBRECORD POINTER IS ONLY VALID IMMEDIATELY AFTER THE
	1490	* CALL AND MAY BECOME INVALID AT ANY TIME. ONLY THE PUSH/POP CODE SHOULD
	1491	* USE THE RETURN VALUE.
	1492	*/
	1493	static
	1494	struct journal_subrecord *
	1495	jrecord_write(struct jrecord *jrec, int16_t rectype, int bytes)
	1496	{
	1497	struct journal_subrecord *last;
	1498	int pusheditout;
	1499
	1500	/*
	1501	* Try to catch some obvious errors. Nesting records must specify a
	1502	* size of 0, and there should be no left-overs from previous operations
	1503	* (such as incomplete data writeouts).
	1504	*/
	1505	KKASSERT(bytes == 0 \|\| (rectype & JMASK_NESTED) == 0);
	1506	KKASSERT(jrec->residual == 0);
	1507
	1508	/*
	1509	* Check to see if the current stream record has enough room for
	1510	* the new subrecord header. If it doesn't we extend the current
	1511	* stream record.
	1512	*
	1513	* This may have the side effect of pushing out the current stream record
	1514	* and creating a new one. We must adjust our stream tracking fields
	1515	* accordingly.
	1516	*/
	1517	if (jrec->stream_residual < sizeof(struct journal_subrecord)) {
	1518	jrec->stream_ptr = journal_extend(jrec->jo, &jrec->rawp,
	1519	jrec->stream_reserved - jrec->stream_residual,
	1520	JREC_DEFAULTSIZE, &pusheditout);
	1521	if (pusheditout) {
	1522	/*
	1523	* If a pushout occured, the pushed out stream record was
	1524	* truncated as specified and the new record is exactly the
	1525	* extension size specified.
	1526	*/
	1527	jrec->stream_reserved = JREC_DEFAULTSIZE;
	1528	jrec->stream_residual = JREC_DEFAULTSIZE;
	1529	jrec->parent = NULL; /* no longer accessible */
	1530	jrec->pushptrgood = 0; /* restored parents in pops no good */
	1531	} else {
	1532	/*
	1533	* If no pushout occured the stream record is NOT truncated and
	1534	* IS extended.
	1535	*/
	1536	jrec->stream_reserved += JREC_DEFAULTSIZE;
	1537	jrec->stream_residual += JREC_DEFAULTSIZE;
	1538	}
	1539	}
	1540	last = (void *)jrec->stream_ptr;
	1541	last->rectype = rectype;
	1542	last->reserved = 0;
	1543
	1544	/*
	1545	* We may not know the record size for recursive records and the
	1546	* header may become unavailable due to limited FIFO space. Write
	1547	* -1 to indicate this special case.
	1548	*/
	1549	if ((rectype & JMASK_NESTED) && bytes == 0)
	1550	last->recsize = -1;
	1551	else
	1552	last->recsize = sizeof(struct journal_subrecord) + bytes;
	1553	jrec->last = last;
	1554	jrec->residual = bytes; /* remaining data to be posted */
	1555	jrec->residual_align = -bytes & 7; /* post-data alignment required */
	1556	jrec->stream_ptr += sizeof(last); / current write pointer */
	1557	jrec->stream_residual -= sizeof(last); / space remaining in stream */
	1558	return(last);
	1559	}
	1560
	1561	/*
	1562	* Write out the data associated with a leaf record. Any number of calls
	1563	* to this routine may be made as long as the byte count adds up to the
	1564	* amount originally specified in jrecord_write().
	1565	*
	1566	* The act of writing out the leaf data may result in numerous stream records
	1567	* being pushed out. Callers should be aware that even the associated
	1568	* subrecord header may become inaccessible due to stream record pushouts.
	1569	*/
	1570	static void
	1571	jrecord_data(struct jrecord jrec, const void buf, int bytes)
	1572	{
	1573	int pusheditout;
	1574	int extsize;
	1575
	1576	KKASSERT(bytes >= 0 && bytes <= jrec->residual);
	1577
	1578	/*
	1579	* Push out stream records as long as there is insufficient room to hold
	1580	* the remaining data.
	1581	*/
	1582	while (jrec->stream_residual < bytes) {
	1583	/*
	1584	* Fill in any remaining space in the current stream record.
	1585	*/
	1586	bcopy(buf, jrec->stream_ptr, jrec->stream_residual);
	1587	buf = (const char *)buf + jrec->stream_residual;
	1588	bytes -= jrec->stream_residual;
	1589	/jrec->stream_ptr += jrec->stream_residual;/
	1590	jrec->residual -= jrec->stream_residual;
	1591	jrec->stream_residual = 0;
	1592
	1593	/*
	1594	* Try to extend the current stream record, but no more then 1/4
	1595	* the size of the FIFO.
	1596	*/
	1597	extsize = jrec->jo->fifo.size >> 2;
	1598	if (extsize > bytes)
	1599	extsize = (bytes + 15) & ~15;
	1600
	1601	jrec->stream_ptr = journal_extend(jrec->jo, &jrec->rawp,
	1602	jrec->stream_reserved - jrec->stream_residual,
	1603	extsize, &pusheditout);
	1604	if (pusheditout) {
	1605	jrec->stream_reserved = extsize;
	1606	jrec->stream_residual = extsize;
	1607	jrec->parent = NULL; /* no longer accessible */
	1608	jrec->last = NULL; /* no longer accessible */
	1609	jrec->pushptrgood = 0; /* restored parents in pops no good */
	1610	} else {
	1611	jrec->stream_reserved += extsize;
	1612	jrec->stream_residual += extsize;
	1613	}
	1614	}
	1615
	1616	/*
	1617	* Push out any remaining bytes into the current stream record.
	1618	*/
	1619	if (bytes) {
	1620	bcopy(buf, jrec->stream_ptr, bytes);
	1621	jrec->stream_ptr += bytes;
	1622	jrec->stream_residual -= bytes;
	1623	jrec->residual -= bytes;
	1624	}
	1625
	1626	/*
	1627	* Handle data alignment requirements for the subrecord. Because the
	1628	* stream record's data space is more strictly aligned, it must already
	1629	* have sufficient space to hold any subrecord alignment slop.
	1630	*/
	1631	if (jrec->residual == 0 && jrec->residual_align) {
	1632	KKASSERT(jrec->residual_align <= jrec->stream_residual);
	1633	bzero(jrec->stream_ptr, jrec->residual_align);
	1634	jrec->stream_ptr += jrec->residual_align;
	1635	jrec->stream_residual -= jrec->residual_align;
	1636	jrec->residual_align = 0;
	1637	}
	1638	}
	1639
	1640	/*
	1641	* We are finished with the transaction. This closes the transaction created
	1642	* by jrecord_init().
	1643	*
	1644	* NOTE: If abortit is not set then we must be at the top level with no
	1645	* residual subrecord data left to output.
	1646	*
	1647	* If abortit is set then we can be in any state, all pushes will be
	1648	* popped and it is ok for there to be residual data. This works
	1649	* because the virtual stream itself is truncated. Scanners must deal
	1650	* with this situation.
	1651	*
	1652	* The stream record will be committed or aborted as specified and jrecord
	1653	* resources will be cleaned up.
	1654	*/
	1655	static void
	1656	jrecord_done(struct jrecord *jrec, int abortit)
	1657	{
	1658	KKASSERT(jrec->rawp != NULL);
	1659
	1660	if (abortit) {
	1661	journal_abort(jrec->jo, &jrec->rawp);
	1662	} else {
	1663	KKASSERT(jrec->pushcount == 0 && jrec->residual == 0);
	1664	journal_commit(jrec->jo, &jrec->rawp,
	1665	jrec->stream_reserved - jrec->stream_residual, 1);
	1666	}
	1667
	1668	/*
	1669	* jrec should not be used beyond this point without another init,
	1670	* but clean up some fields to ensure that we panic if it is.
	1671	*
	1672	* Note that jrec->rawp is NULLd out by journal_abort/journal_commit.
	1673	*/
	1674	jrec->jo = NULL;
	1675	jrec->stream_ptr = NULL;
	1676	}
	1677
	1678	/************************************************************************
	1679	* LOW LEVEL RECORD SUPPORT ROUTINES *
	1680	************************************************************************
	1681	*
	1682	* These routines create low level recursive and leaf subrecords representing
	1683	* common filesystem structures.
	1684	*/
	1685
	1686	/*
	1687	* Write out a filename path relative to the base of the mount point.
	1688	* rectype is typically JLEAF_PATH{1,2,3,4}.
	1689	*/
	1690	static void
	1691	jrecord_write_path(struct jrecord jrec, int16_t rectype, struct namecache ncp)
	1692	{
	1693	char buf[64]; /* local buffer if it fits, else malloced */
	1694	char *base;
	1695	int pathlen;
	1696	int index;
	1697	struct namecache *scan;
	1698
	1699	/*
	1700	* Pass 1 - figure out the number of bytes required. Include terminating
	1701	* \0 on last element and '/' separator on other elements.
	1702	*/
	1703	again:
	1704	pathlen = 0;
	1705	for (scan = ncp;
	1706	scan && (scan->nc_flag & NCF_MOUNTPT) == 0;
	1707	scan = scan->nc_parent
	1708	) {
	1709	pathlen += scan->nc_nlen + 1;
	1710	}
	1711
	1712	if (pathlen <= sizeof(buf))
	1713	base = buf;
	1714	else
	1715	base = malloc(pathlen, M_TEMP, M_INTWAIT);
	1716
	1717	/*
	1718	* Pass 2 - generate the path buffer
	1719	*/
	1720	index = pathlen;
	1721	for (scan = ncp;
	1722	scan && (scan->nc_flag & NCF_MOUNTPT) == 0;
	1723	scan = scan->nc_parent
	1724	) {
	1725	if (scan->nc_nlen >= index) {
	1726	if (base != buf)
	1727	free(base, M_TEMP);
	1728	goto again;
	1729	}
	1730	if (index == pathlen)
	1731	base[--index] = 0;
	1732	else
	1733	base[--index] = '/';
	1734	index -= scan->nc_nlen;
	1735	bcopy(scan->nc_name, base + index, scan->nc_nlen);
	1736	}
	1737	jrecord_leaf(jrec, rectype, base + index, pathlen - index);
	1738	if (base != buf)
	1739	free(base, M_TEMP);
	1740	}
	1741
	1742	/*
	1743	* Write out a file attribute structure. While somewhat inefficient, using
	1744	* a recursive data structure is the most portable and extensible way.
	1745	*/
	1746	static void
	1747	jrecord_write_vattr(struct jrecord jrec, struct vattr vat)
	1748	{
	1749	void *save;
	1750
	1751	save = jrecord_push(jrec, JTYPE_VATTR);
	1752	if (vat->va_type != VNON)
	1753	jrecord_leaf(jrec, JLEAF_VTYPE, &vat->va_type, sizeof(vat->va_type));
	1754	if (vat->va_mode != (mode_t)VNOVAL)
	1755	jrecord_leaf(jrec, JLEAF_MODES, &vat->va_mode, sizeof(vat->va_mode));
	1756	if (vat->va_nlink != VNOVAL)
	1757	jrecord_leaf(jrec, JLEAF_NLINK, &vat->va_nlink, sizeof(vat->va_nlink));
	1758	if (vat->va_uid != VNOVAL)
	1759	jrecord_leaf(jrec, JLEAF_UID, &vat->va_uid, sizeof(vat->va_uid));
	1760	if (vat->va_gid != VNOVAL)
	1761	jrecord_leaf(jrec, JLEAF_GID, &vat->va_gid, sizeof(vat->va_gid));
	1762	if (vat->va_fsid != VNOVAL)
	1763	jrecord_leaf(jrec, JLEAF_FSID, &vat->va_fsid, sizeof(vat->va_fsid));
	1764	if (vat->va_fileid != VNOVAL)
	1765	jrecord_leaf(jrec, JLEAF_INUM, &vat->va_fileid, sizeof(vat->va_fileid));
	1766	if (vat->va_size != VNOVAL)
	1767	jrecord_leaf(jrec, JLEAF_SIZE, &vat->va_size, sizeof(vat->va_size));
	1768	if (vat->va_atime.tv_sec != VNOVAL)
	1769	jrecord_leaf(jrec, JLEAF_ATIME, &vat->va_atime, sizeof(vat->va_atime));
	1770	if (vat->va_mtime.tv_sec != VNOVAL)
	1771	jrecord_leaf(jrec, JLEAF_MTIME, &vat->va_mtime, sizeof(vat->va_mtime));
	1772	if (vat->va_ctime.tv_sec != VNOVAL)
	1773	jrecord_leaf(jrec, JLEAF_CTIME, &vat->va_ctime, sizeof(vat->va_ctime));
	1774	if (vat->va_gen != VNOVAL)
	1775	jrecord_leaf(jrec, JLEAF_GEN, &vat->va_gen, sizeof(vat->va_gen));
	1776	if (vat->va_flags != VNOVAL)
	1777	jrecord_leaf(jrec, JLEAF_FLAGS, &vat->va_flags, sizeof(vat->va_flags));
	1778	if (vat->va_rdev != VNOVAL)
	1779	jrecord_leaf(jrec, JLEAF_UDEV, &vat->va_rdev, sizeof(vat->va_rdev));
	1780	#if 0
	1781	if (vat->va_filerev != VNOVAL)
	1782	jrecord_leaf(jrec, JLEAF_FILEREV, &vat->va_filerev, sizeof(vat->va_filerev));
	1783	#endif
	1784	jrecord_pop(jrec, save);
	1785	}
	1786
	1787	/*
	1788	* Write out the creds used to issue a file operation. If a process is
	1789	* available write out additional tracking information related to the
	1790	* process.
	1791	*
	1792	* XXX additional tracking info
	1793	* XXX tty line info
	1794	*/
	1795	static void
	1796	jrecord_write_cred(struct jrecord jrec, struct thread td, struct ucred *cred)
	1797	{
	1798	void *save;
	1799	struct proc *p;
	1800
	1801	save = jrecord_push(jrec, JTYPE_CRED);
	1802	jrecord_leaf(jrec, JLEAF_UID, &cred->cr_uid, sizeof(cred->cr_uid));
	1803	jrecord_leaf(jrec, JLEAF_GID, &cred->cr_gid, sizeof(cred->cr_gid));
	1804	if (td && (p = td->td_proc) != NULL) {
	1805	jrecord_leaf(jrec, JLEAF_PID, &p->p_pid, sizeof(p->p_pid));
	1806	jrecord_leaf(jrec, JLEAF_COMM, p->p_comm, sizeof(p->p_comm));
	1807	}
	1808	jrecord_pop(jrec, save);
	1809	}
	1810
	1811	/*
	1812	* Write out information required to identify a vnode
	1813	*
	1814	* XXX this needs work. We should write out the inode number as well,
	1815	* and in fact avoid writing out the file path for seqential writes
	1816	* occuring within e.g. a certain period of time.
	1817	*/
	1818	static void
	1819	jrecord_write_vnode_ref(struct jrecord jrec, struct vnode vp)
	1820	{
	1821	struct namecache *ncp;
	1822
	1823	TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
	1824	if ((ncp->nc_flag & (NCF_UNRESOLVED\|NCF_DESTROYED)) == 0)
	1825	break;
	1826	}
	1827	if (ncp)
	1828	jrecord_write_path(jrec, JLEAF_PATH_REF, ncp);
	1829	}
	1830
	1831	static void
	1832	jrecord_write_vnode_link(struct jrecord jrec, struct vnode vp,
	1833	struct namecache *notncp)
	1834	{
	1835	struct namecache *ncp;
	1836
	1837	TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
	1838	if (ncp == notncp)
	1839	continue;
	1840	if ((ncp->nc_flag & (NCF_UNRESOLVED\|NCF_DESTROYED)) == 0)
	1841	break;
	1842	}
	1843	if (ncp)
	1844	jrecord_write_path(jrec, JLEAF_PATH_REF, ncp);
	1845	}
	1846
	1847	#if 0
	1848	/*
	1849	* Write out the current contents of the file within the specified
	1850	* range. This is typically called from within an UNDO section. A
	1851	* locked vnode must be passed.
	1852	*/
	1853	static int
	1854	jrecord_write_filearea(struct jrecord jrec, struct vnode vp,
	1855	off_t begoff, off_t endoff)
	1856	{
	1857	}
	1858	#endif
	1859
	1860	/*
	1861	* Write out the data represented by a pagelist
	1862	*/
	1863	static void
	1864	jrecord_write_pagelist(struct jrecord *jrec, int16_t rectype,
	1865	struct vm_page *pglist, int rtvals, int pgcount,
	1866	off_t offset)
	1867	{
	1868	struct msf_buf *msf;
	1869	int error;
	1870	int b;
	1871	int i;
	1872
	1873	i = 0;
	1874	while (i < pgcount) {
	1875	/*
	1876	* Find the next valid section. Skip any invalid elements
	1877	*/
	1878	if (rtvals[i] != VM_PAGER_OK) {
	1879	++i;
	1880	offset += PAGE_SIZE;
	1881	continue;
	1882	}
	1883
	1884	/*
	1885	* Figure out how big the valid section is, capping I/O at what the
	1886	* MSFBUF can represent.
	1887	*/
	1888	b = i;
	1889	while (i < pgcount && i - b != XIO_INTERNAL_PAGES &&
	1890	rtvals[i] == VM_PAGER_OK
	1891	) {
	1892	++i;
	1893	}
	1894
	1895	/*
	1896	* And write it out.
	1897	*/
	1898	if (i - b) {
	1899	error = msf_map_pagelist(&msf, pglist + b, i - b, 0);
	1900	if (error == 0) {
	1901	printf("RECORD PUTPAGES %d\n", msf_buf_bytes(msf));
	1902	jrecord_leaf(jrec, JLEAF_SEEKPOS, &offset, sizeof(offset));
	1903	jrecord_leaf(jrec, rectype,
	1904	msf_buf_kva(msf), msf_buf_bytes(msf));
	1905	msf_buf_free(msf);
	1906	} else {
	1907	printf("jrecord_write_pagelist: mapping failure\n");
	1908	}
	1909	offset += (off_t)(i - b) << PAGE_SHIFT;
	1910	}
	1911	}
	1912	}
	1913
	1914	/*
	1915	* Write out the data represented by a UIO.
	1916	*/
	1917	struct jwuio_info {
	1918	struct jrecord *jrec;
	1919	int16_t rectype;
	1920	};
	1921
	1922	static int jrecord_write_uio_callback(void info, char buf, int bytes);
	1923
	1924	static void
	1925	jrecord_write_uio(struct jrecord jrec, int16_t rectype, struct uio uio)
	1926	{
	1927	struct jwuio_info info = { jrec, rectype };
	1928	int error;
	1929
	1930	if (uio->uio_segflg != UIO_NOCOPY) {
	1931	jrecord_leaf(jrec, JLEAF_SEEKPOS, &uio->uio_offset,
	1932	sizeof(uio->uio_offset));
	1933	error = msf_uio_iterate(uio, jrecord_write_uio_callback, &info);
	1934	if (error)
	1935	printf("XXX warning uio iterate failed %d\n", error);
	1936	}
	1937	}
	1938
	1939	static int
	1940	jrecord_write_uio_callback(void info_arg, char buf, int bytes)
	1941	{
	1942	struct jwuio_info *info = info_arg;
	1943
	1944	jrecord_leaf(info->jrec, info->rectype, buf, bytes);
	1945	return(0);
	1946	}
	1947
	1948	static void
	1949	jrecord_file_data(struct jrecord jrec, struct vnode vp,
	1950	off_t off, off_t bytes)
	1951	{
	1952	const int bufsize = 8192;
	1953	char *buf;
	1954	int error;
	1955	int n;
	1956
	1957	buf = malloc(bufsize, M_JOURNAL, M_WAITOK);
	1958	jrecord_leaf(jrec, JLEAF_SEEKPOS, &off, sizeof(off));
	1959	while (bytes) {
	1960	n = (bytes > bufsize) ? bufsize : (int)bytes;
	1961	error = vn_rdwr(UIO_READ, vp, buf, n, off, UIO_SYSSPACE, IO_NODELOCKED,
	1962	proc0.p_ucred, NULL, curthread);
	1963	if (error) {
	1964	jrecord_leaf(jrec, JLEAF_ERROR, &error, sizeof(error));
	1965	break;
	1966	}
	1967	jrecord_leaf(jrec, JLEAF_FILEDATA, buf, n);
	1968	bytes -= n;
	1969	off += n;
	1970	}
	1971	free(buf, M_JOURNAL);
	1972	}
	1973
	1974	/************************************************************************
	1975	* LOW LEVEL UNDO SUPPORT ROUTINE *
	1976	************************************************************************
	1977	*
	1978	* This function is used to support UNDO records. It will generate an
	1979	* appropriate record with the requested portion of the file data. Note
	1980	* that file data is only recorded if JRUNDO_FILEDATA is passed. If bytes
	1981	* is -1, it will be set to the size of the file.
	1982	*/
	1983	static void
	1984	jrecord_undo_file(struct jrecord jrec, struct vnode vp, int jrflags,
	1985	off_t off, off_t bytes)
	1986	{
	1987	struct vattr attr;
	1988	void save1; / warning, save pointers do not always remain valid */
	1989	void *save2;
	1990	int error;
	1991
	1992	/*
	1993	* Setup. Start the UNDO record, obtain a shared lock on the vnode,
	1994	* and retrieve attribute info.
	1995	*/
	1996	save1 = jrecord_push(jrec, JTYPE_UNDO);
	1997	error = VOP_GETATTR(vp, &attr, curthread);
	1998	if (error)
	1999	goto done;
	2000
	2001	/*
	2002	* Generate UNDO records as requested.
	2003	*/
	2004	if (jrflags & JRUNDO_VATTR) {
	2005	save2 = jrecord_push(jrec, JTYPE_VATTR);
	2006	jrecord_leaf(jrec, JLEAF_VTYPE, &attr.va_type, sizeof(attr.va_type));
	2007	if ((jrflags & JRUNDO_SIZE) && attr.va_size != VNOVAL)
	2008	jrecord_leaf(jrec, JLEAF_SIZE, &attr.va_size, sizeof(attr.va_size));
	2009	if ((jrflags & JRUNDO_UID) && attr.va_uid != VNOVAL)
	2010	jrecord_leaf(jrec, JLEAF_UID, &attr.va_uid, sizeof(attr.va_uid));
	2011	if ((jrflags & JRUNDO_GID) && attr.va_gid != VNOVAL)
	2012	jrecord_leaf(jrec, JLEAF_GID, &attr.va_gid, sizeof(attr.va_gid));
	2013	if ((jrflags & JRUNDO_FSID) && attr.va_fsid != VNOVAL)
	2014	jrecord_leaf(jrec, JLEAF_FSID, &attr.va_fsid, sizeof(attr.va_fsid));
	2015	if ((jrflags & JRUNDO_MODES) && attr.va_mode != (mode_t)VNOVAL)
	2016	jrecord_leaf(jrec, JLEAF_MODES, &attr.va_mode, sizeof(attr.va_mode));
	2017	if ((jrflags & JRUNDO_INUM) && attr.va_fileid != VNOVAL)
	2018	jrecord_leaf(jrec, JLEAF_INUM, &attr.va_fileid, sizeof(attr.va_fileid));
	2019	if ((jrflags & JRUNDO_ATIME) && attr.va_atime.tv_sec != VNOVAL)
	2020	jrecord_leaf(jrec, JLEAF_ATIME, &attr.va_atime, sizeof(attr.va_atime));
	2021	if ((jrflags & JRUNDO_MTIME) && attr.va_mtime.tv_sec != VNOVAL)
	2022	jrecord_leaf(jrec, JLEAF_MTIME, &attr.va_mtime, sizeof(attr.va_mtime));
	2023	if ((jrflags & JRUNDO_CTIME) && attr.va_ctime.tv_sec != VNOVAL)
	2024	jrecord_leaf(jrec, JLEAF_CTIME, &attr.va_ctime, sizeof(attr.va_ctime));
	2025	if ((jrflags & JRUNDO_GEN) && attr.va_gen != VNOVAL)
	2026	jrecord_leaf(jrec, JLEAF_GEN, &attr.va_gen, sizeof(attr.va_gen));
	2027	if ((jrflags & JRUNDO_FLAGS) && attr.va_flags != VNOVAL)
	2028	jrecord_leaf(jrec, JLEAF_FLAGS, &attr.va_flags, sizeof(attr.va_flags));
	2029	if ((jrflags & JRUNDO_UDEV) && attr.va_rdev != VNOVAL)
	2030	jrecord_leaf(jrec, JLEAF_UDEV, &attr.va_rdev, sizeof(attr.va_rdev));
	2031	jrecord_pop(jrec, save2);
	2032	}
	2033
	2034	/*
	2035	* Output the file data being overwritten by reading the file and
	2036	* writing it out to the journal prior to the write operation. We
	2037	* do not need to write out data past the current file EOF.
	2038	*
	2039	* XXX support JRUNDO_CONDLINK - do not write out file data for files
	2040	* with a link count > 1. The undo code needs to locate the inode and
	2041	* regenerate the hardlink.
	2042	*/
	2043	if (jrflags & JRUNDO_FILEDATA) {
	2044	if (attr.va_size != VNOVAL) {
	2045	if (bytes == -1)
	2046	bytes = attr.va_size - off;
	2047	if (off + bytes > attr.va_size)
	2048	bytes = attr.va_size - off;
	2049	if (bytes > 0)
	2050	jrecord_file_data(jrec, vp, off, bytes);
	2051	} else {
	2052	error = EINVAL;
	2053	}
	2054	}
	2055	done:
	2056	if (error)
	2057	jrecord_leaf(jrec, JLEAF_ERROR, &error, sizeof(error));
	2058	jrecord_pop(jrec, save1);
	2059	}
	2060
	2061	/************************************************************************
	2062	* JOURNAL VNOPS *
	2063	************************************************************************
	2064	*
	2065	* These are function shims replacing the normal filesystem ops. We become
	2066	* responsible for calling the underlying filesystem ops. We have the choice
	2067	* of executing the underlying op first and then generating the journal entry,
	2068	* or starting the journal entry, executing the underlying op, and then
	2069	* either completing or aborting it.
	2070	*
	2071	* The journal is supposed to be a high-level entity, which generally means
	2072	* identifying files by name rather than by inode. Supplying both allows
	2073	* the journal to be used both for inode-number-compatible 'mirrors' and
	2074	* for simple filesystem replication.
	2075	*
	2076	* Writes are particularly difficult to deal with because a single write may
	2077	* represent a hundred megabyte buffer or more, and both writes and truncations
	2078	* require the 'old' data to be written out as well as the new data if the
	2079	* log is reversible. Other issues:
	2080	*
	2081	* - How to deal with operations on unlinked files (no path available),
	2082	* but which may still be filesystem visible due to hard links.
	2083	*
	2084	* - How to deal with modifications made via a memory map.
	2085	*
	2086	* - Future cache coherency support will require cache coherency API calls
	2087	* both prior to and after the call to the underlying VFS.
	2088	*
	2089	* ALSO NOTE: We do not have to shim compatibility VOPs like MKDIR which have
	2090	* new VFS equivalents (NMKDIR).
	2091	*/
	2092
	2093	/*
	2094	* Journal vop_settattr { a_vp, a_vap, a_cred, a_td }
	2095	*/
	2096	static
	2097	int
	2098	journal_setattr(struct vop_setattr_args *ap)
	2099	{
	2100	struct jrecord_list jreclist;
	2101	struct jrecord jreccache;
	2102	struct jrecord *jrec;
	2103	struct mount *mp;
	2104	int error;
	2105
	2106	mp = ap->a_head.a_ops->vv_mount;
	2107	if (jreclist_init(mp, &jreclist, &jreccache, JTYPE_SETATTR)) {
	2108	jreclist_undo_file(&jreclist, ap->a_vp, JRUNDO_VATTR, 0, 0);
	2109	}
	2110	error = vop_journal_operate_ap(&ap->a_head);
	2111	if (error == 0) {
	2112	TAILQ_FOREACH(jrec, &jreclist, user_entry) {
	2113	jrecord_write_cred(jrec, ap->a_td, ap->a_cred);
	2114	jrecord_write_vnode_ref(jrec, ap->a_vp);
	2115	jrecord_write_vattr(jrec, ap->a_vap);
	2116	}
	2117	}
	2118	jreclist_done(&jreclist, error);
	2119	return (error);
	2120	}
	2121
	2122	/*
	2123	* Journal vop_write { a_vp, a_uio, a_ioflag, a_cred }
	2124	*/
	2125	static
	2126	int
	2127	journal_write(struct vop_write_args *ap)
	2128	{
	2129	struct jrecord_list jreclist;
	2130	struct jrecord jreccache;
	2131	struct jrecord *jrec;
	2132	struct mount *mp;
	2133	struct uio uio_copy;
	2134	struct iovec uio_one_iovec;
	2135	int error;
	2136
	2137	/*
	2138	* This is really nasty. UIO's don't retain sufficient information to
	2139	* be reusable once they've gone through the VOP chain. The iovecs get
	2140	* cleared, so we have to copy the UIO.
	2141	*
	2142	* XXX fix the UIO code to not destroy iov's during a scan so we can
	2143	* reuse the uio over and over again.
	2144	*
	2145	* XXX UNDO code needs to journal the old data prior to the write.
	2146	*/
	2147	uio_copy = *ap->a_uio;
	2148	if (uio_copy.uio_iovcnt == 1) {
	2149	uio_one_iovec = ap->a_uio->uio_iov[0];
	2150	uio_copy.uio_iov = &uio_one_iovec;
	2151	} else {
	2152	uio_copy.uio_iov = malloc(uio_copy.uio_iovcnt * sizeof(struct iovec),
	2153	M_JOURNAL, M_WAITOK);
	2154	bcopy(ap->a_uio->uio_iov, uio_copy.uio_iov,
	2155	uio_copy.uio_iovcnt * sizeof(struct iovec));
	2156	}
	2157
	2158	/*
	2159	* Write out undo data. Note that uio_offset is incorrect if
	2160	* IO_APPEND is set, but fortunately we have no undo file data to
	2161	* write out in that case.
	2162	*/
	2163	mp = ap->a_head.a_ops->vv_mount;
	2164	if (jreclist_init(mp, &jreclist, &jreccache, JTYPE_WRITE)) {
	2165	if (ap->a_ioflag & IO_APPEND) {
	2166	jreclist_undo_file(&jreclist, ap->a_vp, JRUNDO_SIZE\|JRUNDO_MTIME, 0, 0);
	2167	} else {
	2168	jreclist_undo_file(&jreclist, ap->a_vp,
	2169	JRUNDO_FILEDATA\|JRUNDO_SIZE\|JRUNDO_MTIME,
	2170	uio_copy.uio_offset, uio_copy.uio_resid);
	2171	}
	2172	}
	2173	error = vop_journal_operate_ap(&ap->a_head);
	2174
	2175	/*
	2176	* XXX bad hack to figure out the offset for O_APPEND writes (note:
	2177	* uio field state after the VFS operation).
	2178	*/
	2179	uio_copy.uio_offset = ap->a_uio->uio_offset -
	2180	(uio_copy.uio_resid - ap->a_uio->uio_resid);
	2181
	2182	/*
	2183	* Output the write data to the journal.
	2184	*/
	2185	if (error == 0) {
	2186	TAILQ_FOREACH(jrec, &jreclist, user_entry) {
	2187	jrecord_write_cred(jrec, NULL, ap->a_cred);
	2188	jrecord_write_vnode_ref(jrec, ap->a_vp);
	2189	jrecord_write_uio(jrec, JLEAF_FILEDATA, &uio_copy);
	2190	}
	2191	}
	2192	jreclist_done(&jreclist, error);
	2193
	2194	if (uio_copy.uio_iov != &uio_one_iovec)
	2195	free(uio_copy.uio_iov, M_JOURNAL);
	2196	return (error);
	2197	}
	2198
	2199	/*
	2200	* Journal vop_fsync { a_vp, a_waitfor, a_td }
	2201	*/
	2202	static
	2203	int
	2204	journal_fsync(struct vop_fsync_args *ap)
	2205	{
	2206	#if 0
	2207	struct mount *mp;
	2208	struct journal *jo;
	2209	#endif
	2210	int error;
	2211
	2212	error = vop_journal_operate_ap(&ap->a_head);
	2213	#if 0
	2214	mp = ap->a_head.a_ops->vv_mount;
	2215	if (error == 0) {
	2216	TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
	2217	/* XXX synchronize pending journal records */
	2218	}
	2219	}
	2220	#endif
	2221	return (error);
	2222	}
	2223
	2224	/*
	2225	* Journal vop_putpages { a_vp, a_m, a_count, a_sync, a_rtvals, a_offset }
	2226	*
	2227	* note: a_count is in bytes.
	2228	*/
	2229	static
	2230	int
	2231	journal_putpages(struct vop_putpages_args *ap)
	2232	{
	2233	struct jrecord_list jreclist;
	2234	struct jrecord jreccache;
	2235	struct jrecord *jrec;
	2236	struct mount *mp;
	2237	int error;
	2238
	2239	mp = ap->a_head.a_ops->vv_mount;
	2240	if (jreclist_init(mp, &jreclist, &jreccache, JTYPE_PUTPAGES) &&
	2241	ap->a_count > 0
	2242	) {
	2243	jreclist_undo_file(&jreclist, ap->a_vp,
	2244	JRUNDO_FILEDATA\|JRUNDO_SIZE\|JRUNDO_MTIME,
	2245	ap->a_offset, btoc(ap->a_count));
	2246	}
	2247	error = vop_journal_operate_ap(&ap->a_head);
	2248	if (error == 0 && ap->a_count > 0) {
	2249	TAILQ_FOREACH(jrec, &jreclist, user_entry) {
	2250	jrecord_write_vnode_ref(jrec, ap->a_vp);
	2251	jrecord_write_pagelist(jrec, JLEAF_FILEDATA, ap->a_m, ap->a_rtvals,
	2252	btoc(ap->a_count), ap->a_offset);
	2253	}
	2254	}
	2255	jreclist_done(&jreclist, error);
	2256	return (error);
	2257	}
	2258
	2259	/*
	2260	* Journal vop_setacl { a_vp, a_type, a_aclp, a_cred, a_td }
	2261	*/
	2262	static
	2263	int
	2264	journal_setacl(struct vop_setacl_args *ap)
	2265	{
	2266	struct jrecord_list jreclist;
	2267	struct jrecord jreccache;
	2268	struct jrecord *jrec;
	2269	struct mount *mp;
	2270	int error;
	2271
	2272	mp = ap->a_head.a_ops->vv_mount;
	2273	jreclist_init(mp, &jreclist, &jreccache, JTYPE_SETACL);
	2274	error = vop_journal_operate_ap(&ap->a_head);
	2275	if (error == 0) {
	2276	TAILQ_FOREACH(jrec, &jreclist, user_entry) {
	2277	#if 0
	2278	if ((jo->flags & MC_JOURNAL_WANT_REVERSABLE))
	2279	jrecord_undo_file(jrec, ap->a_vp, JRUNDO_XXX, 0, 0);
	2280	#endif
	2281	jrecord_write_cred(jrec, ap->a_td, ap->a_cred);
	2282	jrecord_write_vnode_ref(jrec, ap->a_vp);
	2283	/* XXX type, aclp */
	2284	}
	2285	}
	2286	jreclist_done(&jreclist, error);
	2287	return (error);
	2288	}
	2289
	2290	/*
	2291	* Journal vop_setextattr { a_vp, a_name, a_uio, a_cred, a_td }
	2292	*/
	2293	static
	2294	int
	2295	journal_setextattr(struct vop_setextattr_args *ap)
	2296	{
	2297	struct jrecord_list jreclist;
	2298	struct jrecord jreccache;
	2299	struct jrecord *jrec;
	2300	struct mount *mp;
	2301	int error;
	2302
	2303	mp = ap->a_head.a_ops->vv_mount;
	2304	jreclist_init(mp, &jreclist, &jreccache, JTYPE_SETEXTATTR);
	2305	error = vop_journal_operate_ap(&ap->a_head);
	2306	if (error == 0) {
	2307	TAILQ_FOREACH(jrec, &jreclist, user_entry) {
	2308	#if 0
	2309	if ((jo->flags & MC_JOURNAL_WANT_REVERSABLE))
	2310	jrecord_undo_file(jrec, ap->a_vp, JRUNDO_XXX, 0, 0);
	2311	#endif
	2312	jrecord_write_cred(jrec, ap->a_td, ap->a_cred);
	2313	jrecord_write_vnode_ref(jrec, ap->a_vp);
	2314	jrecord_leaf(jrec, JLEAF_ATTRNAME, ap->a_name, strlen(ap->a_name));
	2315	jrecord_write_uio(jrec, JLEAF_FILEDATA, ap->a_uio);
	2316	}
	2317	}
	2318	jreclist_done(&jreclist, error);
	2319	return (error);
	2320	}
	2321
	2322	/*
	2323	* Journal vop_ncreate { a_ncp, a_vpp, a_cred, a_vap }
	2324	*/
	2325	static
	2326	int
	2327	journal_ncreate(struct vop_ncreate_args *ap)
	2328	{
	2329	struct jrecord_list jreclist;
	2330	struct jrecord jreccache;
	2331	struct jrecord *jrec;
	2332	struct mount *mp;
	2333	int error;
	2334
	2335	mp = ap->a_head.a_ops->vv_mount;
	2336	jreclist_init(mp, &jreclist, &jreccache, JTYPE_CREATE);
	2337	error = vop_journal_operate_ap(&ap->a_head);
	2338	if (error == 0) {
	2339	TAILQ_FOREACH(jrec, &jreclist, user_entry) {
	2340	jrecord_write_cred(jrec, NULL, ap->a_cred);
	2341	jrecord_write_path(jrec, JLEAF_PATH1, ap->a_ncp);
	2342	if (*ap->a_vpp)
	2343	jrecord_write_vnode_ref(jrec, *ap->a_vpp);
	2344	jrecord_write_vattr(jrec, ap->a_vap);
	2345	}
	2346	}
	2347	jreclist_done(&jreclist, error);
	2348	return (error);
	2349	}
	2350
	2351	/*
	2352	* Journal vop_nmknod { a_ncp, a_vpp, a_cred, a_vap }
	2353	*/
	2354	static
	2355	int
	2356	journal_nmknod(struct vop_nmknod_args *ap)
	2357	{
	2358	struct jrecord_list jreclist;
	2359	struct jrecord jreccache;
	2360	struct jrecord *jrec;
	2361	struct mount *mp;
	2362	int error;
	2363
	2364	mp = ap->a_head.a_ops->vv_mount;
	2365	jreclist_init(mp, &jreclist, &jreccache, JTYPE_MKNOD);
	2366	error = vop_journal_operate_ap(&ap->a_head);
	2367	if (error == 0) {
	2368	TAILQ_FOREACH(jrec, &jreclist, user_entry) {
	2369	jrecord_write_cred(jrec, NULL, ap->a_cred);
	2370	jrecord_write_path(jrec, JLEAF_PATH1, ap->a_ncp);
	2371	jrecord_write_vattr(jrec, ap->a_vap);
	2372	if (*ap->a_vpp)
	2373	jrecord_write_vnode_ref(jrec, *ap->a_vpp);
	2374	}
	2375	}
	2376	jreclist_done(&jreclist, error);
	2377	return (error);
	2378	}
	2379
	2380	/*
	2381	* Journal vop_nlink { a_ncp, a_vp, a_cred }
	2382	*/
	2383	static
	2384	int
	2385	journal_nlink(struct vop_nlink_args *ap)
	2386	{
	2387	struct jrecord_list jreclist;
	2388	struct jrecord jreccache;
	2389	struct jrecord *jrec;
	2390	struct mount *mp;
	2391	int error;
	2392
	2393	mp = ap->a_head.a_ops->vv_mount;
	2394	jreclist_init(mp, &jreclist, &jreccache, JTYPE_LINK);
	2395	error = vop_journal_operate_ap(&ap->a_head);
	2396	if (error == 0) {
	2397	TAILQ_FOREACH(jrec, &jreclist, user_entry) {
	2398	jrecord_write_cred(jrec, NULL, ap->a_cred);
	2399	jrecord_write_path(jrec, JLEAF_PATH1, ap->a_ncp);
	2400	/* XXX PATH to VP and inode number */
	2401	/* XXX this call may not record the correct path when
	2402	* multiple paths are available */
	2403	jrecord_write_vnode_link(jrec, ap->a_vp, ap->a_ncp);
	2404	}
	2405	}
	2406	jreclist_done(&jreclist, error);
	2407	return (error);
	2408	}
	2409
	2410	/*
	2411	* Journal vop_symlink { a_ncp, a_vpp, a_cred, a_vap, a_target }
	2412	*/
	2413	static
	2414	int
	2415	journal_nsymlink(struct vop_nsymlink_args *ap)
	2416	{
	2417	struct jrecord_list jreclist;
	2418	struct jrecord jreccache;
	2419	struct jrecord *jrec;
	2420	struct mount *mp;
	2421	int error;
	2422
	2423	mp = ap->a_head.a_ops->vv_mount;
	2424	jreclist_init(mp, &jreclist, &jreccache, JTYPE_SYMLINK);
	2425	error = vop_journal_operate_ap(&ap->a_head);
	2426	if (error == 0) {
	2427	TAILQ_FOREACH(jrec, &jreclist, user_entry) {
	2428	jrecord_write_cred(jrec, NULL, ap->a_cred);
	2429	jrecord_write_path(jrec, JLEAF_PATH1, ap->a_ncp);
	2430	jrecord_leaf(jrec, JLEAF_SYMLINKDATA,
	2431	ap->a_target, strlen(ap->a_target));
	2432	if (*ap->a_vpp)
	2433	jrecord_write_vnode_ref(jrec, *ap->a_vpp);
	2434	}
	2435	}
	2436	jreclist_done(&jreclist, error);
	2437	return (error);
	2438	}
	2439
	2440	/*
	2441	* Journal vop_nwhiteout { a_ncp, a_cred, a_flags }
	2442	*/
	2443	static
	2444	int
	2445	journal_nwhiteout(struct vop_nwhiteout_args *ap)
	2446	{
	2447	struct jrecord_list jreclist;
	2448	struct jrecord jreccache;
	2449	struct jrecord *jrec;
	2450	struct mount *mp;
	2451	int error;
	2452
	2453	mp = ap->a_head.a_ops->vv_mount;
	2454	jreclist_init(mp, &jreclist, &jreccache, JTYPE_WHITEOUT);
	2455	error = vop_journal_operate_ap(&ap->a_head);
	2456	if (error == 0) {
	2457	TAILQ_FOREACH(jrec, &jreclist, user_entry) {
	2458	jrecord_write_cred(jrec, NULL, ap->a_cred);
	2459	jrecord_write_path(jrec, JLEAF_PATH1, ap->a_ncp);
	2460	}
	2461	}
	2462	jreclist_done(&jreclist, error);
	2463	return (error);
	2464	}
	2465
	2466	/*
	2467	* Journal vop_nremove { a_ncp, a_cred }
	2468	*/
	2469	static
	2470	int
	2471	journal_nremove(struct vop_nremove_args *ap)
	2472	{
	2473	struct jrecord_list jreclist;
	2474	struct jrecord jreccache;
	2475	struct jrecord *jrec;
	2476	struct mount *mp;
	2477	int error;
	2478
	2479	mp = ap->a_head.a_ops->vv_mount;
	2480	if (jreclist_init(mp, &jreclist, &jreccache, JTYPE_REMOVE) &&
	2481	ap->a_ncp->nc_vp
	2482	) {
	2483	jreclist_undo_file(&jreclist, ap->a_ncp->nc_vp,
	2484	JRUNDO_ALL\|JRUNDO_GETVP\|JRUNDO_CONDLINK, 0, -1);
	2485	}
	2486	error = vop_journal_operate_ap(&ap->a_head);
	2487	if (error == 0) {
	2488	TAILQ_FOREACH(jrec, &jreclist, user_entry) {
	2489	jrecord_write_cred(jrec, NULL, ap->a_cred);
	2490	jrecord_write_path(jrec, JLEAF_PATH1, ap->a_ncp);
	2491	}
	2492	}
	2493	jreclist_done(&jreclist, error);
	2494	return (error);
	2495	}
	2496
	2497	/*
	2498	* Journal vop_nmkdir { a_ncp, a_vpp, a_cred, a_vap }
	2499	*/
	2500	static
	2501	int
	2502	journal_nmkdir(struct vop_nmkdir_args *ap)
	2503	{
	2504	struct jrecord_list jreclist;
	2505	struct jrecord jreccache;
	2506	struct jrecord *jrec;
	2507	struct mount *mp;
	2508	int error;
	2509
	2510	mp = ap->a_head.a_ops->vv_mount;
	2511	jreclist_init(mp, &jreclist, &jreccache, JTYPE_MKDIR);
	2512	error = vop_journal_operate_ap(&ap->a_head);
	2513	if (error == 0) {
	2514	TAILQ_FOREACH(jrec, &jreclist, user_entry) {
	2515	#if 0
	2516	if (jo->flags & MC_JOURNAL_WANT_AUDIT) {
	2517	jrecord_write_audit(jrec);
	2518	}
	2519	#endif
	2520	jrecord_write_path(jrec, JLEAF_PATH1, ap->a_ncp);
	2521	jrecord_write_cred(jrec, NULL, ap->a_cred);
	2522	jrecord_write_vattr(jrec, ap->a_vap);
	2523	jrecord_write_path(jrec, JLEAF_PATH1, ap->a_ncp);
	2524	if (*ap->a_vpp)
	2525	jrecord_write_vnode_ref(jrec, *ap->a_vpp);
	2526	}
	2527	}
	2528	jreclist_done(&jreclist, error);
	2529	return (error);
	2530	}
	2531
	2532	/*
	2533	* Journal vop_nrmdir { a_ncp, a_cred }
	2534	*/
	2535	static
	2536	int
	2537	journal_nrmdir(struct vop_nrmdir_args *ap)
	2538	{
	2539	struct jrecord_list jreclist;
	2540	struct jrecord jreccache;
	2541	struct jrecord *jrec;
	2542	struct mount *mp;
	2543	int error;
	2544
	2545	mp = ap->a_head.a_ops->vv_mount;
	2546	if (jreclist_init(mp, &jreclist, &jreccache, JTYPE_RMDIR)) {
	2547	jreclist_undo_file(&jreclist, ap->a_ncp->nc_vp,
	2548	JRUNDO_VATTR\|JRUNDO_GETVP, 0, 0);
	2549	}
	2550	error = vop_journal_operate_ap(&ap->a_head);
	2551	if (error == 0) {
	2552	TAILQ_FOREACH(jrec, &jreclist, user_entry) {
	2553	jrecord_write_cred(jrec, NULL, ap->a_cred);
	2554	jrecord_write_path(jrec, JLEAF_PATH1, ap->a_ncp);
	2555	}
	2556	}
	2557	jreclist_done(&jreclist, error);
	2558	return (error);
	2559	}
	2560
	2561	/*
	2562	* Journal vop_nrename { a_fncp, a_tncp, a_cred }
	2563	*/
	2564	static
	2565	int
	2566	journal_nrename(struct vop_nrename_args *ap)
	2567	{
	2568	struct jrecord_list jreclist;
	2569	struct jrecord jreccache;
	2570	struct jrecord *jrec;
	2571	struct mount *mp;
	2572	int error;
	2573
	2574	mp = ap->a_head.a_ops->vv_mount;
	2575	if (jreclist_init(mp, &jreclist, &jreccache, JTYPE_RENAME) &&
	2576	ap->a_tncp->nc_vp
	2577	) {
	2578	jreclist_undo_file(&jreclist, ap->a_tncp->nc_vp,
	2579	JRUNDO_ALL\|JRUNDO_GETVP\|JRUNDO_CONDLINK, 0, -1);
	2580	}
	2581	error = vop_journal_operate_ap(&ap->a_head);
	2582	if (error == 0) {
	2583	TAILQ_FOREACH(jrec, &jreclist, user_entry) {
	2584	jrecord_write_cred(jrec, NULL, ap->a_cred);
	2585	jrecord_write_path(jrec, JLEAF_PATH1, ap->a_fncp);
	2586	jrecord_write_path(jrec, JLEAF_PATH2, ap->a_tncp);
	2587	}
	2588	}
	2589	jreclist_done(&jreclist, error);
	2590	return (error);
	2591	}
	2592