gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
	3	*
	4	* The soft updates code is derived from the appendix of a University
	5	* of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
	6	* "Soft Updates: A Solution to the Metadata Update Problem in File
	7	* Systems", CSE-TR-254-95, August 1995).
	8	*
	9	* Further information about soft updates can be obtained from:
	10	*
	11	* Marshall Kirk McKusick http://www.mckusick.com/softdep/
	12	* 1614 Oxford Street mckusick@mckusick.com
	13	* Berkeley, CA 94709-1608 +1-510-843-9542
	14	* USA
	15	*
	16	* Redistribution and use in source and binary forms, with or without
	17	* modification, are permitted provided that the following conditions
	18	* are met:
	19	*
	20	* 1. Redistributions of source code must retain the above copyright
	21	* notice, this list of conditions and the following disclaimer.
	22	* 2. Redistributions in binary form must reproduce the above copyright
	23	* notice, this list of conditions and the following disclaimer in the
	24	* documentation and/or other materials provided with the distribution.
	25	*
	26	* THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
	27	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	28	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	29	* DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
	30	* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	31	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	32	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	33	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	34	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	35	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	36	* SUCH DAMAGE.
	37	*
	38	* from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
	39	* $FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.57.2.11 2002/02/05 18:46:53 dillon Exp $
	40	*/
	41
	42	/*
	43	* For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
	44	*/
	45	#ifndef DIAGNOSTIC
	46	#define DIAGNOSTIC
	47	#endif
	48	#ifndef DEBUG
	49	#define DEBUG
	50	#endif
	51
	52	#include <sys/param.h>
	53	#include <sys/kernel.h>
	54	#include <sys/systm.h>
	55	#include <sys/buf.h>
	56	#include <sys/malloc.h>
	57	#include <sys/mount.h>
	58	#include <sys/proc.h>
	59	#include <sys/syslog.h>
	60	#include <sys/vnode.h>
	61	#include <sys/conf.h>
	62	#include <ufs/ufs/dir.h>
	63	#include <ufs/ufs/quota.h>
	64	#include <ufs/ufs/inode.h>
	65	#include <ufs/ufs/ufsmount.h>
	66	#include <ufs/ffs/fs.h>
	67	#include <ufs/ffs/softdep.h>
	68	#include <ufs/ffs/ffs_extern.h>
	69	#include <ufs/ufs/ufs_extern.h>
	70
	71	/*
	72	* These definitions need to be adapted to the system to which
	73	* this file is being ported.
	74	*/
	75	/*
	76	* malloc types defined for the softdep system.
	77	*/
	78	MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
	79	MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
	80	MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
	81	MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
	82	MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
	83	MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
	84	MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
	85	MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
	86	MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
	87	MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
	88	MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
	89	MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
	90	MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
	91
	92	#define M_SOFTDEP_FLAGS (M_WAITOK \| M_USE_RESERVE)
	93
	94	#define D_PAGEDEP 0
	95	#define D_INODEDEP 1
	96	#define D_NEWBLK 2
	97	#define D_BMSAFEMAP 3
	98	#define D_ALLOCDIRECT 4
	99	#define D_INDIRDEP 5
	100	#define D_ALLOCINDIR 6
	101	#define D_FREEFRAG 7
	102	#define D_FREEBLKS 8
	103	#define D_FREEFILE 9
	104	#define D_DIRADD 10
	105	#define D_MKDIR 11
	106	#define D_DIRREM 12
	107	#define D_LAST D_DIRREM
	108
	109	/*
	110	* translate from workitem type to memory type
	111	* MUST match the defines above, such that memtype[D_XXX] == M_XXX
	112	*/
	113	static struct malloc_type *memtype[] = {
	114	M_PAGEDEP,
	115	M_INODEDEP,
	116	M_NEWBLK,
	117	M_BMSAFEMAP,
	118	M_ALLOCDIRECT,
	119	M_INDIRDEP,
	120	M_ALLOCINDIR,
	121	M_FREEFRAG,
	122	M_FREEBLKS,
	123	M_FREEFILE,
	124	M_DIRADD,
	125	M_MKDIR,
	126	M_DIRREM
	127	};
	128
	129	#define DtoM(type) (memtype[type])
	130
	131	/*
	132	* Names of malloc types.
	133	*/
	134	#define TYPENAME(type) \
	135	((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???")
	136	#define CURPROC curproc
	137	/*
	138	* End system adaptation definitions.
	139	*/
	140
	141	/*
	142	* Internal function prototypes.
	143	*/
	144	static void softdep_error __P((char *, int));
	145	static void drain_output __P((struct vnode *, int));
	146	static int getdirtybuf __P((struct buf **, int));
	147	static void clear_remove __P((struct proc *));
	148	static void clear_inodedeps __P((struct proc *));
	149	static int flush_pagedep_deps __P((struct vnode , struct mount ,
	150	struct diraddhd *));
	151	static int flush_inodedep_deps __P((struct fs *, ino_t));
	152	static int handle_written_filepage __P((struct pagedep , struct buf ));
	153	static void diradd_inode_written __P((struct diradd , struct inodedep ));
	154	static int handle_written_inodeblock __P((struct inodedep , struct buf ));
	155	static void handle_allocdirect_partdone __P((struct allocdirect *));
	156	static void handle_allocindir_partdone __P((struct allocindir *));
	157	static void initiate_write_filepage __P((struct pagedep , struct buf ));
	158	static void handle_written_mkdir __P((struct mkdir *, int));
	159	static void initiate_write_inodeblock __P((struct inodedep , struct buf ));
	160	static void handle_workitem_freefile __P((struct freefile *));
	161	static void handle_workitem_remove __P((struct dirrem *));
	162	static struct dirrem newdirrem __P((struct buf , struct inode *,
	163	struct inode , int, struct dirrem *));
	164	static void free_diradd __P((struct diradd *));
	165	static void free_allocindir __P((struct allocindir , struct inodedep ));
	166	static int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t,
	167	long *));
	168	static void deallocate_dependencies __P((struct buf , struct inodedep ));
	169	static void free_allocdirect __P((struct allocdirectlst *,
	170	struct allocdirect *, int));
	171	static int check_inode_unwritten __P((struct inodedep *));
	172	static int free_inodedep __P((struct inodedep *));
	173	static void handle_workitem_freeblocks __P((struct freeblks *));
	174	static void merge_inode_lists __P((struct inodedep *));
	175	static void setup_allocindir_phase2 __P((struct buf , struct inode ,
	176	struct allocindir *));
	177	static struct allocindir newallocindir __P((struct inode , int, ufs_daddr_t,
	178	ufs_daddr_t));
	179	static void handle_workitem_freefrag __P((struct freefrag *));
	180	static struct freefrag newfreefrag __P((struct inode , ufs_daddr_t, long));
	181	static void allocdirect_merge __P((struct allocdirectlst *,
	182	struct allocdirect , struct allocdirect ));
	183	static struct bmsafemap bmsafemap_lookup __P((struct buf ));
	184	static int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
	185	struct newblk **));
	186	static int inodedep_lookup __P((struct fs , ino_t, int, struct inodedep *));
	187	static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
	188	struct pagedep **));
	189	static void pause_timer __P((void *));
	190	static int request_cleanup __P((int, int));
	191	static int process_worklist_item __P((struct mount *, int));
	192	static void add_to_worklist __P((struct worklist *));
	193
	194	/*
	195	* Exported softdep operations.
	196	*/
	197	static void softdep_disk_io_initiation __P((struct buf *));
	198	static void softdep_disk_write_complete __P((struct buf *));
	199	static void softdep_deallocate_dependencies __P((struct buf *));
	200	static int softdep_fsync __P((struct vnode *));
	201	static int softdep_process_worklist __P((struct mount *));
	202	static void softdep_move_dependencies __P((struct buf , struct buf ));
	203	static int softdep_count_dependencies __P((struct buf *bp, int));
	204
	205	struct bio_ops bioops = {
	206	softdep_disk_io_initiation, /* io_start */
	207	softdep_disk_write_complete, /* io_complete */
	208	softdep_deallocate_dependencies, /* io_deallocate */
	209	softdep_fsync, /* io_fsync */
	210	softdep_process_worklist, /* io_sync */
	211	softdep_move_dependencies, /* io_movedeps */
	212	softdep_count_dependencies, /* io_countdeps */
	213	};
	214
	215	/*
	216	* Locking primitives.
	217	*
	218	* For a uniprocessor, all we need to do is protect against disk
	219	* interrupts. For a multiprocessor, this lock would have to be
	220	* a mutex. A single mutex is used throughout this file, though
	221	* finer grain locking could be used if contention warranted it.
	222	*
	223	* For a multiprocessor, the sleep call would accept a lock and
	224	* release it after the sleep processing was complete. In a uniprocessor
	225	* implementation there is no such interlock, so we simply mark
	226	* the places where it needs to be done with the `interlocked' form
	227	* of the lock calls. Since the uniprocessor sleep already interlocks
	228	* the spl, there is nothing that really needs to be done.
	229	*/
	230	#ifndef /* NOT */ DEBUG
	231	static struct lockit {
	232	int lkt_spl;
	233	} lk = { 0 };
	234	#define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio()
	235	#define FREE_LOCK(lk) splx((lk)->lkt_spl)
	236
	237	#else /* DEBUG */
	238	static struct lockit {
	239	int lkt_spl;
	240	pid_t lkt_held;
	241	} lk = { 0, -1 };
	242	static int lockcnt;
	243
	244	static void acquire_lock __P((struct lockit *));
	245	static void free_lock __P((struct lockit *));
	246	void softdep_panic __P((char *));
	247
	248	#define ACQUIRE_LOCK(lk) acquire_lock(lk)
	249	#define FREE_LOCK(lk) free_lock(lk)
	250
	251	static void
	252	acquire_lock(lk)
	253	struct lockit *lk;
	254	{
	255	pid_t holder;
	256
	257	if (lk->lkt_held != -1) {
	258	holder = lk->lkt_held;
	259	FREE_LOCK(lk);
	260	if (holder == CURPROC->p_pid)
	261	panic("softdep_lock: locking against myself");
	262	else
	263	panic("softdep_lock: lock held by %d", holder);
	264	}
	265	lk->lkt_spl = splbio();
	266	lk->lkt_held = CURPROC->p_pid;
	267	lockcnt++;
	268	}
	269
	270	static void
	271	free_lock(lk)
	272	struct lockit *lk;
	273	{
	274
	275	if (lk->lkt_held == -1)
	276	panic("softdep_unlock: lock not held");
	277	lk->lkt_held = -1;
	278	splx(lk->lkt_spl);
	279	}
	280
	281	/*
	282	* Function to release soft updates lock and panic.
	283	*/
	284	void
	285	softdep_panic(msg)
	286	char *msg;
	287	{
	288
	289	if (lk.lkt_held != -1)
	290	FREE_LOCK(&lk);
	291	panic(msg);
	292	}
	293	#endif /* DEBUG */
	294
	295	static int interlocked_sleep __P((struct lockit , int, void , int,
	296	const char *, int));
	297
	298	/*
	299	* When going to sleep, we must save our SPL so that it does
	300	* not get lost if some other process uses the lock while we
	301	* are sleeping. We restore it after we have slept. This routine
	302	* wraps the interlocking with functions that sleep. The list
	303	* below enumerates the available set of operations.
	304	*/
	305	#define UNKNOWN 0
	306	#define SLEEP 1
	307	#define LOCKBUF 2
	308
	309	static int
	310	interlocked_sleep(lk, op, ident, flags, wmesg, timo)
	311	struct lockit *lk;
	312	int op;
	313	void *ident;
	314	int flags;
	315	const char *wmesg;
	316	int timo;
	317	{
	318	pid_t holder;
	319	int s, retval;
	320
	321	s = lk->lkt_spl;
	322	# ifdef DEBUG
	323	if (lk->lkt_held == -1)
	324	panic("interlocked_sleep: lock not held");
	325	lk->lkt_held = -1;
	326	# endif /* DEBUG */
	327	switch (op) {
	328	case SLEEP:
	329	retval = tsleep(ident, flags, wmesg, timo);
	330	break;
	331	case LOCKBUF:
	332	retval = BUF_LOCK((struct buf *)ident, flags);
	333	break;
	334	default:
	335	panic("interlocked_sleep: unknown operation");
	336	}
	337	# ifdef DEBUG
	338	if (lk->lkt_held != -1) {
	339	holder = lk->lkt_held;
	340	FREE_LOCK(lk);
	341	if (holder == CURPROC->p_pid)
	342	panic("interlocked_sleep: locking against self");
	343	else
	344	panic("interlocked_sleep: lock held by %d", holder);
	345	}
	346	lk->lkt_held = CURPROC->p_pid;
	347	lockcnt++;
	348	# endif /* DEBUG */
	349	lk->lkt_spl = s;
	350	return (retval);
	351	}
	352
	353	/*
	354	* Place holder for real semaphores.
	355	*/
	356	struct sema {
	357	int value;
	358	pid_t holder;
	359	char *name;
	360	int prio;
	361	int timo;
	362	};
	363	static void sema_init __P((struct sema , char , int, int));
	364	static int sema_get __P((struct sema , struct lockit ));
	365	static void sema_release __P((struct sema *));
	366
	367	static void
	368	sema_init(semap, name, prio, timo)
	369	struct sema *semap;
	370	char *name;
	371	int prio, timo;
	372	{
	373
	374	semap->holder = -1;
	375	semap->value = 0;
	376	semap->name = name;
	377	semap->prio = prio;
	378	semap->timo = timo;
	379	}
	380
	381	static int
	382	sema_get(semap, interlock)
	383	struct sema *semap;
	384	struct lockit *interlock;
	385	{
	386
	387	if (semap->value++ > 0) {
	388	if (interlock != NULL) {
	389	interlocked_sleep(interlock, SLEEP, (caddr_t)semap,
	390	semap->prio, semap->name, semap->timo);
	391	FREE_LOCK(interlock);
	392	} else {
	393	tsleep((caddr_t)semap, semap->prio, semap->name,
	394	semap->timo);
	395	}
	396	return (0);
	397	}
	398	semap->holder = CURPROC->p_pid;
	399	if (interlock != NULL)
	400	FREE_LOCK(interlock);
	401	return (1);
	402	}
	403
	404	static void
	405	sema_release(semap)
	406	struct sema *semap;
	407	{
	408
	409	if (semap->value <= 0 \|\| semap->holder != CURPROC->p_pid) {
	410	if (lk.lkt_held != -1)
	411	FREE_LOCK(&lk);
	412	panic("sema_release: not held");
	413	}
	414	if (--semap->value > 0) {
	415	semap->value = 0;
	416	wakeup(semap);
	417	}
	418	semap->holder = -1;
	419	}
	420
	421	/*
	422	* Worklist queue management.
	423	* These routines require that the lock be held.
	424	*/
	425	#ifndef /* NOT */ DEBUG
	426	#define WORKLIST_INSERT(head, item) do { \
	427	(item)->wk_state \|= ONWORKLIST; \
	428	LIST_INSERT_HEAD(head, item, wk_list); \
	429	} while (0)
	430	#define WORKLIST_REMOVE(item) do { \
	431	(item)->wk_state &= ~ONWORKLIST; \
	432	LIST_REMOVE(item, wk_list); \
	433	} while (0)
	434	#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))
	435
	436	#else /* DEBUG */
	437	static void worklist_insert __P((struct workhead , struct worklist ));
	438	static void worklist_remove __P((struct worklist *));
	439	static void workitem_free __P((struct worklist *, int));
	440
	441	#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
	442	#define WORKLIST_REMOVE(item) worklist_remove(item)
	443	#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
	444
	445	static void
	446	worklist_insert(head, item)
	447	struct workhead *head;
	448	struct worklist *item;
	449	{
	450
	451	if (lk.lkt_held == -1)
	452	panic("worklist_insert: lock not held");
	453	if (item->wk_state & ONWORKLIST) {
	454	FREE_LOCK(&lk);
	455	panic("worklist_insert: already on list");
	456	}
	457	item->wk_state \|= ONWORKLIST;
	458	LIST_INSERT_HEAD(head, item, wk_list);
	459	}
	460
	461	static void
	462	worklist_remove(item)
	463	struct worklist *item;
	464	{
	465
	466	if (lk.lkt_held == -1)
	467	panic("worklist_remove: lock not held");
	468	if ((item->wk_state & ONWORKLIST) == 0) {
	469	FREE_LOCK(&lk);
	470	panic("worklist_remove: not on list");
	471	}
	472	item->wk_state &= ~ONWORKLIST;
	473	LIST_REMOVE(item, wk_list);
	474	}
	475
	476	static void
	477	workitem_free(item, type)
	478	struct worklist *item;
	479	int type;
	480	{
	481
	482	if (item->wk_state & ONWORKLIST) {
	483	if (lk.lkt_held != -1)
	484	FREE_LOCK(&lk);
	485	panic("workitem_free: still on list");
	486	}
	487	if (item->wk_type != type) {
	488	if (lk.lkt_held != -1)
	489	FREE_LOCK(&lk);
	490	panic("workitem_free: type mismatch");
	491	}
	492	FREE(item, DtoM(type));
	493	}
	494	#endif /* DEBUG */
	495
	496	/*
	497	* Workitem queue management
	498	*/
	499	static struct workhead softdep_workitem_pending;
	500	static int num_on_worklist; /* number of worklist items to be processed */
	501	static int softdep_worklist_busy; /* 1 => trying to do unmount */
	502	static int softdep_worklist_req; /* serialized waiters */
	503	static int max_softdeps; /* maximum number of structs before slowdown */
	504	static int tickdelay = 2; /* number of ticks to pause during slowdown */
	505	static int stat_countp; / statistic to count in proc_waiting timeout */
	506	static int proc_waiting; /* tracks whether we have a timeout posted */
	507	static struct callout_handle handle; /* handle on posted proc_waiting timeout */
	508	static struct proc filesys_syncer; / proc of filesystem syncer process */
	509	static int req_clear_inodedeps; /* syncer process flush some inodedeps */
	510	#define FLUSH_INODES 1
	511	static int req_clear_remove; /* syncer process flush some freeblks */
	512	#define FLUSH_REMOVE 2
	513	/*
	514	* runtime statistics
	515	*/
	516	static int stat_worklist_push; /* number of worklist cleanups */
	517	static int stat_blk_limit_push; /* number of times block limit neared */
	518	static int stat_ino_limit_push; /* number of times inode limit neared */
	519	static int stat_blk_limit_hit; /* number of times block slowdown imposed */
	520	static int stat_ino_limit_hit; /* number of times inode slowdown imposed */
	521	static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
	522	static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
	523	static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
	524	static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
	525	static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
	526	#ifdef DEBUG
	527	#include <vm/vm.h>
	528	#include <sys/sysctl.h>
	529	SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
	530	SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
	531	SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
	532	SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
	533	SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
	534	SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
	535	SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
	536	SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
	537	SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
	538	SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
	539	SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
	540	SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
	541	#endif /* DEBUG */
	542
	543	/*
	544	* Add an item to the end of the work queue.
	545	* This routine requires that the lock be held.
	546	* This is the only routine that adds items to the list.
	547	* The following routine is the only one that removes items
	548	* and does so in order from first to last.
	549	*/
	550	static void
	551	add_to_worklist(wk)
	552	struct worklist *wk;
	553	{
	554	static struct worklist *worklist_tail;
	555
	556	if (wk->wk_state & ONWORKLIST) {
	557	if (lk.lkt_held != -1)
	558	FREE_LOCK(&lk);
	559	panic("add_to_worklist: already on list");
	560	}
	561	wk->wk_state \|= ONWORKLIST;
	562	if (LIST_FIRST(&softdep_workitem_pending) == NULL)
	563	LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
	564	else
	565	LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
	566	worklist_tail = wk;
	567	num_on_worklist += 1;
	568	}
	569
	570	/*
	571	* Process that runs once per second to handle items in the background queue.
	572	*
	573	* Note that we ensure that everything is done in the order in which they
	574	* appear in the queue. The code below depends on this property to ensure
	575	* that blocks of a file are freed before the inode itself is freed. This
	576	* ordering ensures that no new <vfsid, inum, lbn> triples will be generated
	577	* until all the old ones have been purged from the dependency lists.
	578	*/
	579	static int
	580	softdep_process_worklist(matchmnt)
	581	struct mount *matchmnt;
	582	{
	583	struct proc *p = CURPROC;
	584	int matchcnt, loopcount;
	585	long starttime;
	586
	587	/*
	588	* Record the process identifier of our caller so that we can give
	589	* this process preferential treatment in request_cleanup below.
	590	*/
	591	filesys_syncer = p;
	592	matchcnt = 0;
	593
	594	/*
	595	* There is no danger of having multiple processes run this
	596	* code, but we have to single-thread it when softdep_flushfiles()
	597	* is in operation to get an accurate count of the number of items
	598	* related to its mount point that are in the list.
	599	*/
	600	if (matchmnt == NULL) {
	601	if (softdep_worklist_busy < 0)
	602	return(-1);
	603	softdep_worklist_busy += 1;
	604	}
	605
	606	/*
	607	* If requested, try removing inode or removal dependencies.
	608	*/
	609	if (req_clear_inodedeps) {
	610	clear_inodedeps(p);
	611	req_clear_inodedeps -= 1;
	612	wakeup_one(&proc_waiting);
	613	}
	614	if (req_clear_remove) {
	615	clear_remove(p);
	616	req_clear_remove -= 1;
	617	wakeup_one(&proc_waiting);
	618	}
	619	loopcount = 1;
	620	starttime = time_second;
	621	while (num_on_worklist > 0) {
	622	matchcnt += process_worklist_item(matchmnt, 0);
	623
	624	/*
	625	* If a umount operation wants to run the worklist
	626	* accurately, abort.
	627	*/
	628	if (softdep_worklist_req && matchmnt == NULL) {
	629	matchcnt = -1;
	630	break;
	631	}
	632
	633	/*
	634	* If requested, try removing inode or removal dependencies.
	635	*/
	636	if (req_clear_inodedeps) {
	637	clear_inodedeps(p);
	638	req_clear_inodedeps -= 1;
	639	wakeup_one(&proc_waiting);
	640	}
	641	if (req_clear_remove) {
	642	clear_remove(p);
	643	req_clear_remove -= 1;
	644	wakeup_one(&proc_waiting);
	645	}
	646	/*
	647	* We do not generally want to stop for buffer space, but if
	648	* we are really being a buffer hog, we will stop and wait.
	649	*/
	650	if (loopcount++ % 128 == 0)
	651	bwillwrite();
	652	/*
	653	* Never allow processing to run for more than one
	654	* second. Otherwise the other syncer tasks may get
	655	* excessively backlogged.
	656	*/
	657	if (starttime != time_second && matchmnt == NULL) {
	658	matchcnt = -1;
	659	break;
	660	}
	661	}
	662	if (matchmnt == NULL) {
	663	--softdep_worklist_busy;
	664	if (softdep_worklist_req && softdep_worklist_busy == 0)
	665	wakeup(&softdep_worklist_req);
	666	}
	667	return (matchcnt);
	668	}
	669
	670	/*
	671	* Process one item on the worklist.
	672	*/
	673	static int
	674	process_worklist_item(matchmnt, flags)
	675	struct mount *matchmnt;
	676	int flags;
	677	{
	678	struct worklist *wk;
	679	struct dirrem *dirrem;
	680	struct fs *matchfs;
	681	struct vnode *vp;
	682	int matchcnt = 0;
	683
	684	matchfs = NULL;
	685	if (matchmnt != NULL)
	686	matchfs = VFSTOUFS(matchmnt)->um_fs;
	687	ACQUIRE_LOCK(&lk);
	688	/*
	689	* Normally we just process each item on the worklist in order.
	690	* However, if we are in a situation where we cannot lock any
	691	* inodes, we have to skip over any dirrem requests whose
	692	* vnodes are resident and locked.
	693	*/
	694	LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
	695	if ((flags & LK_NOWAIT) == 0 \|\| wk->wk_type != D_DIRREM)
	696	break;
	697	dirrem = WK_DIRREM(wk);
	698	vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev,
	699	dirrem->dm_oldinum);
	700	if (vp == NULL \|\| !VOP_ISLOCKED(vp, CURPROC))
	701	break;
	702	}
	703	if (wk == 0) {
	704	FREE_LOCK(&lk);
	705	return (0);
	706	}
	707	WORKLIST_REMOVE(wk);
	708	num_on_worklist -= 1;
	709	FREE_LOCK(&lk);
	710	switch (wk->wk_type) {
	711
	712	case D_DIRREM:
	713	/* removal of a directory entry */
	714	if (WK_DIRREM(wk)->dm_mnt == matchmnt)
	715	matchcnt += 1;
	716	handle_workitem_remove(WK_DIRREM(wk));
	717	break;
	718
	719	case D_FREEBLKS:
	720	/* releasing blocks and/or fragments from a file */
	721	if (WK_FREEBLKS(wk)->fb_fs == matchfs)
	722	matchcnt += 1;
	723	handle_workitem_freeblocks(WK_FREEBLKS(wk));
	724	break;
	725
	726	case D_FREEFRAG:
	727	/* releasing a fragment when replaced as a file grows */
	728	if (WK_FREEFRAG(wk)->ff_fs == matchfs)
	729	matchcnt += 1;
	730	handle_workitem_freefrag(WK_FREEFRAG(wk));
	731	break;
	732
	733	case D_FREEFILE:
	734	/* releasing an inode when its link count drops to 0 */
	735	if (WK_FREEFILE(wk)->fx_fs == matchfs)
	736	matchcnt += 1;
	737	handle_workitem_freefile(WK_FREEFILE(wk));
	738	break;
	739
	740	default:
	741	panic("%s_process_worklist: Unknown type %s",
	742	"softdep", TYPENAME(wk->wk_type));
	743	/* NOTREACHED */
	744	}
	745	return (matchcnt);
	746	}
	747
	748	/*
	749	* Move dependencies from one buffer to another.
	750	*/
	751	static void
	752	softdep_move_dependencies(oldbp, newbp)
	753	struct buf *oldbp;
	754	struct buf *newbp;
	755	{
	756	struct worklist wk, wktail;
	757
	758	if (LIST_FIRST(&newbp->b_dep) != NULL)
	759	panic("softdep_move_dependencies: need merge code");
	760	wktail = 0;
	761	ACQUIRE_LOCK(&lk);
	762	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
	763	LIST_REMOVE(wk, wk_list);
	764	if (wktail == 0)
	765	LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
	766	else
	767	LIST_INSERT_AFTER(wktail, wk, wk_list);
	768	wktail = wk;
	769	}
	770	FREE_LOCK(&lk);
	771	}
	772
	773	/*
	774	* Purge the work list of all items associated with a particular mount point.
	775	*/
	776	int
	777	softdep_flushfiles(oldmnt, flags, p)
	778	struct mount *oldmnt;
	779	int flags;
	780	struct proc *p;
	781	{
	782	struct vnode *devvp;
	783	int error, loopcnt;
	784
	785	/*
	786	* Await our turn to clear out the queue, then serialize access.
	787	*/
	788	while (softdep_worklist_busy != 0) {
	789	softdep_worklist_req += 1;
	790	tsleep(&softdep_worklist_req, PRIBIO, "softflush", 0);
	791	softdep_worklist_req -= 1;
	792	}
	793	softdep_worklist_busy = -1;
	794
	795	if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) {
	796	softdep_worklist_busy = 0;
	797	if (softdep_worklist_req)
	798	wakeup(&softdep_worklist_req);
	799	return (error);
	800	}
	801	/*
	802	* Alternately flush the block device associated with the mount
	803	* point and process any dependencies that the flushing
	804	* creates. In theory, this loop can happen at most twice,
	805	* but we give it a few extra just to be sure.
	806	*/
	807	devvp = VFSTOUFS(oldmnt)->um_devvp;
	808	for (loopcnt = 10; loopcnt > 0; ) {
	809	if (softdep_process_worklist(oldmnt) == 0) {
	810	loopcnt--;
	811	/*
	812	* Do another flush in case any vnodes were brought in
	813	* as part of the cleanup operations.
	814	*/
	815	if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
	816	break;
	817	/*
	818	* If we still found nothing to do, we are really done.
	819	*/
	820	if (softdep_process_worklist(oldmnt) == 0)
	821	break;
	822	}
	823	vn_lock(devvp, LK_EXCLUSIVE \| LK_RETRY, p);
	824	error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
	825	VOP_UNLOCK(devvp, 0, p);
	826	if (error)
	827	break;
	828	}
	829	softdep_worklist_busy = 0;
	830	if (softdep_worklist_req)
	831	wakeup(&softdep_worklist_req);
	832
	833	/*
	834	* If we are unmounting then it is an error to fail. If we
	835	* are simply trying to downgrade to read-only, then filesystem
	836	* activity can keep us busy forever, so we just fail with EBUSY.
	837	*/
	838	if (loopcnt == 0) {
	839	if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
	840	panic("softdep_flushfiles: looping");
	841	error = EBUSY;
	842	}
	843	return (error);
	844	}
	845
	846	/*
	847	* Structure hashing.
	848	*
	849	* There are three types of structures that can be looked up:
	850	* 1) pagedep structures identified by mount point, inode number,
	851	* and logical block.
	852	* 2) inodedep structures identified by mount point and inode number.
	853	* 3) newblk structures identified by mount point and
	854	* physical block number.
	855	*
	856	* The "pagedep" and "inodedep" dependency structures are hashed
	857	* separately from the file blocks and inodes to which they correspond.
	858	* This separation helps when the in-memory copy of an inode or
	859	* file block must be replaced. It also obviates the need to access
	860	* an inode or file page when simply updating (or de-allocating)
	861	* dependency structures. Lookup of newblk structures is needed to
	862	* find newly allocated blocks when trying to associate them with
	863	* their allocdirect or allocindir structure.
	864	*
	865	* The lookup routines optionally create and hash a new instance when
	866	* an existing entry is not found.
	867	*/
	868	#define DEPALLOC 0x0001 /* allocate structure if lookup fails */
	869	#define NODELAY 0x0002 /* cannot do background work */
	870
	871	/*
	872	* Structures and routines associated with pagedep caching.
	873	*/
	874	LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
	875	u_long pagedep_hash; /* size of hash table - 1 */
	876	#define PAGEDEP_HASH(mp, inum, lbn) \
	877	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
	878	pagedep_hash])
	879	static struct sema pagedep_in_progress;
	880
	881	/*
	882	* Look up a pagedep. Return 1 if found, 0 if not found.
	883	* If not found, allocate if DEPALLOC flag is passed.
	884	* Found or allocated entry is returned in pagedeppp.
	885	* This routine must be called with splbio interrupts blocked.
	886	*/
	887	static int
	888	pagedep_lookup(ip, lbn, flags, pagedeppp)
	889	struct inode *ip;
	890	ufs_lbn_t lbn;
	891	int flags;
	892	struct pagedep **pagedeppp;
	893	{
	894	struct pagedep *pagedep;
	895	struct pagedep_hashhead *pagedephd;
	896	struct mount *mp;
	897	int i;
	898
	899	#ifdef DEBUG
	900	if (lk.lkt_held == -1)
	901	panic("pagedep_lookup: lock not held");
	902	#endif
	903	mp = ITOV(ip)->v_mount;
	904	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
	905	top:
	906	LIST_FOREACH(pagedep, pagedephd, pd_hash)
	907	if (ip->i_number == pagedep->pd_ino &&
	908	lbn == pagedep->pd_lbn &&
	909	mp == pagedep->pd_mnt)
	910	break;
	911	if (pagedep) {
	912	*pagedeppp = pagedep;
	913	return (1);
	914	}
	915	if ((flags & DEPALLOC) == 0) {
	916	*pagedeppp = NULL;
	917	return (0);
	918	}
	919	if (sema_get(&pagedep_in_progress, &lk) == 0) {
	920	ACQUIRE_LOCK(&lk);
	921	goto top;
	922	}
	923	MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
	924	M_SOFTDEP_FLAGS);
	925	bzero(pagedep, sizeof(struct pagedep));
	926	pagedep->pd_list.wk_type = D_PAGEDEP;
	927	pagedep->pd_mnt = mp;
	928	pagedep->pd_ino = ip->i_number;
	929	pagedep->pd_lbn = lbn;
	930	LIST_INIT(&pagedep->pd_dirremhd);
	931	LIST_INIT(&pagedep->pd_pendinghd);
	932	for (i = 0; i < DAHASHSZ; i++)
	933	LIST_INIT(&pagedep->pd_diraddhd[i]);
	934	ACQUIRE_LOCK(&lk);
	935	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
	936	sema_release(&pagedep_in_progress);
	937	*pagedeppp = pagedep;
	938	return (0);
	939	}
	940
	941	/*
	942	* Structures and routines associated with inodedep caching.
	943	*/
	944	LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
	945	static u_long inodedep_hash; /* size of hash table - 1 */
	946	static long num_inodedep; /* number of inodedep allocated */
	947	#define INODEDEP_HASH(fs, inum) \
	948	(&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
	949	static struct sema inodedep_in_progress;
	950
	951	/*
	952	* Look up an inodedep. Return 1 if found, 0 if not found.
	953	* If not found, allocate if DEPALLOC flag is passed.
	954	* Found or allocated entry is returned in inodedeppp.
	955	* This routine must be called with splbio interrupts blocked.
	956	*/
	957	static int
	958	inodedep_lookup(fs, inum, flags, inodedeppp)
	959	struct fs *fs;
	960	ino_t inum;
	961	int flags;
	962	struct inodedep **inodedeppp;
	963	{
	964	struct inodedep *inodedep;
	965	struct inodedep_hashhead *inodedephd;
	966	int firsttry;
	967
	968	#ifdef DEBUG
	969	if (lk.lkt_held == -1)
	970	panic("inodedep_lookup: lock not held");
	971	#endif
	972	firsttry = 1;
	973	inodedephd = INODEDEP_HASH(fs, inum);
	974	top:
	975	LIST_FOREACH(inodedep, inodedephd, id_hash)
	976	if (inum == inodedep->id_ino && fs == inodedep->id_fs)
	977	break;
	978	if (inodedep) {
	979	*inodedeppp = inodedep;
	980	return (1);
	981	}
	982	if ((flags & DEPALLOC) == 0) {
	983	*inodedeppp = NULL;
	984	return (0);
	985	}
	986	/*
	987	* If we are over our limit, try to improve the situation.
	988	*/
	989	if (num_inodedep > max_softdeps && firsttry &&
	990	speedup_syncer() == 0 && (flags & NODELAY) == 0 &&
	991	request_cleanup(FLUSH_INODES, 1)) {
	992	firsttry = 0;
	993	goto top;
	994	}
	995	if (sema_get(&inodedep_in_progress, &lk) == 0) {
	996	ACQUIRE_LOCK(&lk);
	997	goto top;
	998	}
	999	num_inodedep += 1;
	1000	MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
	1001	M_INODEDEP, M_SOFTDEP_FLAGS);
	1002	inodedep->id_list.wk_type = D_INODEDEP;
	1003	inodedep->id_fs = fs;
	1004	inodedep->id_ino = inum;
	1005	inodedep->id_state = ALLCOMPLETE;
	1006	inodedep->id_nlinkdelta = 0;
	1007	inodedep->id_savedino = NULL;
	1008	inodedep->id_savedsize = -1;
	1009	inodedep->id_buf = NULL;
	1010	LIST_INIT(&inodedep->id_pendinghd);
	1011	LIST_INIT(&inodedep->id_inowait);
	1012	LIST_INIT(&inodedep->id_bufwait);
	1013	TAILQ_INIT(&inodedep->id_inoupdt);
	1014	TAILQ_INIT(&inodedep->id_newinoupdt);
	1015	ACQUIRE_LOCK(&lk);
	1016	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
	1017	sema_release(&inodedep_in_progress);
	1018	*inodedeppp = inodedep;
	1019	return (0);
	1020	}
	1021
	1022	/*
	1023	* Structures and routines associated with newblk caching.
	1024	*/
	1025	LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
	1026	u_long newblk_hash; /* size of hash table - 1 */
	1027	#define NEWBLK_HASH(fs, inum) \
	1028	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
	1029	static struct sema newblk_in_progress;
	1030
	1031	/*
	1032	* Look up a newblk. Return 1 if found, 0 if not found.
	1033	* If not found, allocate if DEPALLOC flag is passed.
	1034	* Found or allocated entry is returned in newblkpp.
	1035	*/
	1036	static int
	1037	newblk_lookup(fs, newblkno, flags, newblkpp)
	1038	struct fs *fs;
	1039	ufs_daddr_t newblkno;
	1040	int flags;
	1041	struct newblk **newblkpp;
	1042	{
	1043	struct newblk *newblk;
	1044	struct newblk_hashhead *newblkhd;
	1045
	1046	newblkhd = NEWBLK_HASH(fs, newblkno);
	1047	top:
	1048	LIST_FOREACH(newblk, newblkhd, nb_hash)
	1049	if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
	1050	break;
	1051	if (newblk) {
	1052	*newblkpp = newblk;
	1053	return (1);
	1054	}
	1055	if ((flags & DEPALLOC) == 0) {
	1056	*newblkpp = NULL;
	1057	return (0);
	1058	}
	1059	if (sema_get(&newblk_in_progress, 0) == 0)
	1060	goto top;
	1061	MALLOC(newblk, struct newblk *, sizeof(struct newblk),
	1062	M_NEWBLK, M_SOFTDEP_FLAGS);
	1063	newblk->nb_state = 0;
	1064	newblk->nb_fs = fs;
	1065	newblk->nb_newblkno = newblkno;
	1066	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
	1067	sema_release(&newblk_in_progress);
	1068	*newblkpp = newblk;
	1069	return (0);
	1070	}
	1071
	1072	/*
	1073	* Executed during filesystem system initialization before
	1074	* mounting any file systems.
	1075	*/
	1076	void
	1077	softdep_initialize()
	1078	{
	1079
	1080	LIST_INIT(&mkdirlisthd);
	1081	LIST_INIT(&softdep_workitem_pending);
	1082	max_softdeps = min(desiredvnodes * 8,
	1083	M_INODEDEP->ks_limit / (2 * sizeof(struct inodedep)));
	1084	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
	1085	&pagedep_hash);
	1086	sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
	1087	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
	1088	sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
	1089	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
	1090	sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
	1091	}
	1092
	1093	/*
	1094	* Called at mount time to notify the dependency code that a
	1095	* filesystem wishes to use it.
	1096	*/
	1097	int
	1098	softdep_mount(devvp, mp, fs, cred)
	1099	struct vnode *devvp;
	1100	struct mount *mp;
	1101	struct fs *fs;
	1102	struct ucred *cred;
	1103	{
	1104	struct csum cstotal;
	1105	struct cg *cgp;
	1106	struct buf *bp;
	1107	int error, cyl;
	1108
	1109	mp->mnt_flag &= ~MNT_ASYNC;
	1110	mp->mnt_flag \|= MNT_SOFTDEP;
	1111	/*
	1112	* When doing soft updates, the counters in the
	1113	* superblock may have gotten out of sync, so we have
	1114	* to scan the cylinder groups and recalculate them.
	1115	*/
	1116	if (fs->fs_clean != 0)
	1117	return (0);
	1118	bzero(&cstotal, sizeof cstotal);
	1119	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
	1120	if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
	1121	fs->fs_cgsize, cred, &bp)) != 0) {
	1122	brelse(bp);
	1123	return (error);
	1124	}
	1125	cgp = (struct cg *)bp->b_data;
	1126	cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
	1127	cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
	1128	cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
	1129	cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
	1130	fs->fs_cs(fs, cyl) = cgp->cg_cs;
	1131	brelse(bp);
	1132	}
	1133	#ifdef DEBUG
	1134	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
	1135	printf("ffs_mountfs: superblock updated for soft updates\n");
	1136	#endif
	1137	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
	1138	return (0);
	1139	}
	1140
	1141	/*
	1142	* Protecting the freemaps (or bitmaps).
	1143	*
	1144	* To eliminate the need to execute fsck before mounting a file system
	1145	* after a power failure, one must (conservatively) guarantee that the
	1146	* on-disk copy of the bitmaps never indicate that a live inode or block is
	1147	* free. So, when a block or inode is allocated, the bitmap should be
	1148	* updated (on disk) before any new pointers. When a block or inode is
	1149	* freed, the bitmap should not be updated until all pointers have been
	1150	* reset. The latter dependency is handled by the delayed de-allocation
	1151	* approach described below for block and inode de-allocation. The former
	1152	* dependency is handled by calling the following procedure when a block or
	1153	* inode is allocated. When an inode is allocated an "inodedep" is created
	1154	* with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
	1155	* Each "inodedep" is also inserted into the hash indexing structure so
	1156	* that any additional link additions can be made dependent on the inode
	1157	* allocation.
	1158	*
	1159	* The ufs file system maintains a number of free block counts (e.g., per
	1160	* cylinder group, per cylinder and per <cylinder, rotational position> pair)
	1161	* in addition to the bitmaps. These counts are used to improve efficiency
	1162	* during allocation and therefore must be consistent with the bitmaps.
	1163	* There is no convenient way to guarantee post-crash consistency of these
	1164	* counts with simple update ordering, for two main reasons: (1) The counts
	1165	* and bitmaps for a single cylinder group block are not in the same disk
	1166	* sector. If a disk write is interrupted (e.g., by power failure), one may
	1167	* be written and the other not. (2) Some of the counts are located in the
	1168	* superblock rather than the cylinder group block. So, we focus our soft
	1169	* updates implementation on protecting the bitmaps. When mounting a
	1170	* filesystem, we recompute the auxiliary counts from the bitmaps.
	1171	*/
	1172
	1173	/*
	1174	* Called just after updating the cylinder group block to allocate an inode.
	1175	*/
	1176	void
	1177	softdep_setup_inomapdep(bp, ip, newinum)
	1178	struct buf bp; / buffer for cylgroup block with inode map */
	1179	struct inode ip; / inode related to allocation */
	1180	ino_t newinum; /* new inode number being allocated */
	1181	{
	1182	struct inodedep *inodedep;
	1183	struct bmsafemap *bmsafemap;
	1184
	1185	/*
	1186	* Create a dependency for the newly allocated inode.
	1187	* Panic if it already exists as something is seriously wrong.
	1188	* Otherwise add it to the dependency list for the buffer holding
	1189	* the cylinder group map from which it was allocated.
	1190	*/
	1191	ACQUIRE_LOCK(&lk);
	1192	if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC\|NODELAY, &inodedep))) {
	1193	FREE_LOCK(&lk);
	1194	panic("softdep_setup_inomapdep: found inode");
	1195	}
	1196	inodedep->id_buf = bp;
	1197	inodedep->id_state &= ~DEPCOMPLETE;
	1198	bmsafemap = bmsafemap_lookup(bp);
	1199	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
	1200	FREE_LOCK(&lk);
	1201	}
	1202
	1203	/*
	1204	* Called just after updating the cylinder group block to
	1205	* allocate block or fragment.
	1206	*/
	1207	void
	1208	softdep_setup_blkmapdep(bp, fs, newblkno)
	1209	struct buf bp; / buffer for cylgroup block with block map */
	1210	struct fs fs; / filesystem doing allocation */
	1211	ufs_daddr_t newblkno; /* number of newly allocated block */
	1212	{
	1213	struct newblk *newblk;
	1214	struct bmsafemap *bmsafemap;
	1215
	1216	/*
	1217	* Create a dependency for the newly allocated block.
	1218	* Add it to the dependency list for the buffer holding
	1219	* the cylinder group map from which it was allocated.
	1220	*/
	1221	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
	1222	panic("softdep_setup_blkmapdep: found block");
	1223	ACQUIRE_LOCK(&lk);
	1224	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
	1225	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
	1226	FREE_LOCK(&lk);
	1227	}
	1228
	1229	/*
	1230	* Find the bmsafemap associated with a cylinder group buffer.
	1231	* If none exists, create one. The buffer must be locked when
	1232	* this routine is called and this routine must be called with
	1233	* splbio interrupts blocked.
	1234	*/
	1235	static struct bmsafemap *
	1236	bmsafemap_lookup(bp)
	1237	struct buf *bp;
	1238	{
	1239	struct bmsafemap *bmsafemap;
	1240	struct worklist *wk;
	1241
	1242	#ifdef DEBUG
	1243	if (lk.lkt_held == -1)
	1244	panic("bmsafemap_lookup: lock not held");
	1245	#endif
	1246	LIST_FOREACH(wk, &bp->b_dep, wk_list)
	1247	if (wk->wk_type == D_BMSAFEMAP)
	1248	return (WK_BMSAFEMAP(wk));
	1249	FREE_LOCK(&lk);
	1250	MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
	1251	M_BMSAFEMAP, M_SOFTDEP_FLAGS);
	1252	bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
	1253	bmsafemap->sm_list.wk_state = 0;
	1254	bmsafemap->sm_buf = bp;
	1255	LIST_INIT(&bmsafemap->sm_allocdirecthd);
	1256	LIST_INIT(&bmsafemap->sm_allocindirhd);
	1257	LIST_INIT(&bmsafemap->sm_inodedephd);
	1258	LIST_INIT(&bmsafemap->sm_newblkhd);
	1259	ACQUIRE_LOCK(&lk);
	1260	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
	1261	return (bmsafemap);
	1262	}
	1263
	1264	/*
	1265	* Direct block allocation dependencies.
	1266	*
	1267	* When a new block is allocated, the corresponding disk locations must be
	1268	* initialized (with zeros or new data) before the on-disk inode points to
	1269	* them. Also, the freemap from which the block was allocated must be
	1270	* updated (on disk) before the inode's pointer. These two dependencies are
	1271	* independent of each other and are needed for all file blocks and indirect
	1272	* blocks that are pointed to directly by the inode. Just before the
	1273	* "in-core" version of the inode is updated with a newly allocated block
	1274	* number, a procedure (below) is called to setup allocation dependency
	1275	* structures. These structures are removed when the corresponding
	1276	* dependencies are satisfied or when the block allocation becomes obsolete
	1277	* (i.e., the file is deleted, the block is de-allocated, or the block is a
	1278	* fragment that gets upgraded). All of these cases are handled in
	1279	* procedures described later.
	1280	*
	1281	* When a file extension causes a fragment to be upgraded, either to a larger
	1282	* fragment or to a full block, the on-disk location may change (if the
	1283	* previous fragment could not simply be extended). In this case, the old
	1284	* fragment must be de-allocated, but not until after the inode's pointer has
	1285	* been updated. In most cases, this is handled by later procedures, which
	1286	* will construct a "freefrag" structure to be added to the workitem queue
	1287	* when the inode update is complete (or obsolete). The main exception to
	1288	* this is when an allocation occurs while a pending allocation dependency
	1289	* (for the same block pointer) remains. This case is handled in the main
	1290	* allocation dependency setup procedure by immediately freeing the
	1291	* unreferenced fragments.
	1292	*/
	1293	void
	1294	softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	1295	struct inode ip; / inode to which block is being added */
	1296	ufs_lbn_t lbn; /* block pointer within inode */
	1297	ufs_daddr_t newblkno; /* disk block number being added */
	1298	ufs_daddr_t oldblkno; /* previous block number, 0 unless frag */
	1299	long newsize; /* size of new block */
	1300	long oldsize; /* size of new block */
	1301	struct buf bp; / bp for allocated block */
	1302	{
	1303	struct allocdirect adp, oldadp;
	1304	struct allocdirectlst *adphead;
	1305	struct bmsafemap *bmsafemap;
	1306	struct inodedep *inodedep;
	1307	struct pagedep *pagedep;
	1308	struct newblk *newblk;
	1309
	1310	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
	1311	M_ALLOCDIRECT, M_SOFTDEP_FLAGS);
	1312	bzero(adp, sizeof(struct allocdirect));
	1313	adp->ad_list.wk_type = D_ALLOCDIRECT;
	1314	adp->ad_lbn = lbn;
	1315	adp->ad_newblkno = newblkno;
	1316	adp->ad_oldblkno = oldblkno;
	1317	adp->ad_newsize = newsize;
	1318	adp->ad_oldsize = oldsize;
	1319	adp->ad_state = ATTACHED;
	1320	if (newblkno == oldblkno)
	1321	adp->ad_freefrag = NULL;
	1322	else
	1323	adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
	1324
	1325	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
	1326	panic("softdep_setup_allocdirect: lost block");
	1327
	1328	ACQUIRE_LOCK(&lk);
	1329	inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC \| NODELAY, &inodedep);
	1330	adp->ad_inodedep = inodedep;
	1331
	1332	if (newblk->nb_state == DEPCOMPLETE) {
	1333	adp->ad_state \|= DEPCOMPLETE;
	1334	adp->ad_buf = NULL;
	1335	} else {
	1336	bmsafemap = newblk->nb_bmsafemap;
	1337	adp->ad_buf = bmsafemap->sm_buf;
	1338	LIST_REMOVE(newblk, nb_deps);
	1339	LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
	1340	}
	1341	LIST_REMOVE(newblk, nb_hash);
	1342	FREE(newblk, M_NEWBLK);
	1343
	1344	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
	1345	if (lbn >= NDADDR) {
	1346	/* allocating an indirect block */
	1347	if (oldblkno != 0) {
	1348	FREE_LOCK(&lk);
	1349	panic("softdep_setup_allocdirect: non-zero indir");
	1350	}
	1351	} else {
	1352	/*
	1353	* Allocating a direct block.
	1354	*
	1355	* If we are allocating a directory block, then we must
	1356	* allocate an associated pagedep to track additions and
	1357	* deletions.
	1358	*/
	1359	if ((ip->i_mode & IFMT) == IFDIR &&
	1360	pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
	1361	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	1362	}
	1363	/*
	1364	* The list of allocdirects must be kept in sorted and ascending
	1365	* order so that the rollback routines can quickly determine the
	1366	* first uncommitted block (the size of the file stored on disk
	1367	* ends at the end of the lowest committed fragment, or if there
	1368	* are no fragments, at the end of the highest committed block).
	1369	* Since files generally grow, the typical case is that the new
	1370	* block is to be added at the end of the list. We speed this
	1371	* special case by checking against the last allocdirect in the
	1372	* list before laboriously traversing the list looking for the
	1373	* insertion point.
	1374	*/
	1375	adphead = &inodedep->id_newinoupdt;
	1376	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	1377	if (oldadp == NULL \|\| oldadp->ad_lbn <= lbn) {
	1378	/* insert at end of list */
	1379	TAILQ_INSERT_TAIL(adphead, adp, ad_next);
	1380	if (oldadp != NULL && oldadp->ad_lbn == lbn)
	1381	allocdirect_merge(adphead, adp, oldadp);
	1382	FREE_LOCK(&lk);
	1383	return;
	1384	}
	1385	TAILQ_FOREACH(oldadp, adphead, ad_next) {
	1386	if (oldadp->ad_lbn >= lbn)
	1387	break;
	1388	}
	1389	if (oldadp == NULL) {
	1390	FREE_LOCK(&lk);
	1391	panic("softdep_setup_allocdirect: lost entry");
	1392	}
	1393	/* insert in middle of list */
	1394	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
	1395	if (oldadp->ad_lbn == lbn)
	1396	allocdirect_merge(adphead, adp, oldadp);
	1397	FREE_LOCK(&lk);
	1398	}
	1399
	1400	/*
	1401	* Replace an old allocdirect dependency with a newer one.
	1402	* This routine must be called with splbio interrupts blocked.
	1403	*/
	1404	static void
	1405	allocdirect_merge(adphead, newadp, oldadp)
	1406	struct allocdirectlst adphead; / head of list holding allocdirects */
	1407	struct allocdirect newadp; / allocdirect being added */
	1408	struct allocdirect oldadp; / existing allocdirect being checked */
	1409	{
	1410	struct freefrag *freefrag;
	1411
	1412	#ifdef DEBUG
	1413	if (lk.lkt_held == -1)
	1414	panic("allocdirect_merge: lock not held");
	1415	#endif
	1416	if (newadp->ad_oldblkno != oldadp->ad_newblkno \|\|
	1417	newadp->ad_oldsize != oldadp->ad_newsize \|\|
	1418	newadp->ad_lbn >= NDADDR) {
	1419	FREE_LOCK(&lk);
	1420	panic("allocdirect_check: old %d != new %d \|\| lbn %ld >= %d",
	1421	newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
	1422	NDADDR);
	1423	}
	1424	newadp->ad_oldblkno = oldadp->ad_oldblkno;
	1425	newadp->ad_oldsize = oldadp->ad_oldsize;
	1426	/*
	1427	* If the old dependency had a fragment to free or had never
	1428	* previously had a block allocated, then the new dependency
	1429	* can immediately post its freefrag and adopt the old freefrag.
	1430	* This action is done by swapping the freefrag dependencies.
	1431	* The new dependency gains the old one's freefrag, and the
	1432	* old one gets the new one and then immediately puts it on
	1433	* the worklist when it is freed by free_allocdirect. It is
	1434	* not possible to do this swap when the old dependency had a
	1435	* non-zero size but no previous fragment to free. This condition
	1436	* arises when the new block is an extension of the old block.
	1437	* Here, the first part of the fragment allocated to the new
	1438	* dependency is part of the block currently claimed on disk by
	1439	* the old dependency, so cannot legitimately be freed until the
	1440	* conditions for the new dependency are fulfilled.
	1441	*/
	1442	if (oldadp->ad_freefrag != NULL \|\| oldadp->ad_oldblkno == 0) {
	1443	freefrag = newadp->ad_freefrag;
	1444	newadp->ad_freefrag = oldadp->ad_freefrag;
	1445	oldadp->ad_freefrag = freefrag;
	1446	}
	1447	free_allocdirect(adphead, oldadp, 0);
	1448	}
	1449
	1450	/*
	1451	* Allocate a new freefrag structure if needed.
	1452	*/
	1453	static struct freefrag *
	1454	newfreefrag(ip, blkno, size)
	1455	struct inode *ip;
	1456	ufs_daddr_t blkno;
	1457	long size;
	1458	{
	1459	struct freefrag *freefrag;
	1460	struct fs *fs;
	1461
	1462	if (blkno == 0)
	1463	return (NULL);
	1464	fs = ip->i_fs;
	1465	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
	1466	panic("newfreefrag: frag size");
	1467	MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
	1468	M_FREEFRAG, M_SOFTDEP_FLAGS);
	1469	freefrag->ff_list.wk_type = D_FREEFRAG;
	1470	freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */
	1471	freefrag->ff_inum = ip->i_number;
	1472	freefrag->ff_fs = fs;
	1473	freefrag->ff_devvp = ip->i_devvp;
	1474	freefrag->ff_blkno = blkno;
	1475	freefrag->ff_fragsize = size;
	1476	return (freefrag);
	1477	}
	1478
	1479	/*
	1480	* This workitem de-allocates fragments that were replaced during
	1481	* file block allocation.
	1482	*/
	1483	static void
	1484	handle_workitem_freefrag(freefrag)
	1485	struct freefrag *freefrag;
	1486	{
	1487	struct inode tip;
	1488
	1489	tip.i_fs = freefrag->ff_fs;
	1490	tip.i_devvp = freefrag->ff_devvp;
	1491	tip.i_dev = freefrag->ff_devvp->v_rdev;
	1492	tip.i_number = freefrag->ff_inum;
	1493	tip.i_uid = freefrag->ff_state & ~ONWORKLIST; /* XXX - set above */
	1494	ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
	1495	FREE(freefrag, M_FREEFRAG);
	1496	}
	1497
	1498	/*
	1499	* Indirect block allocation dependencies.
	1500	*
	1501	* The same dependencies that exist for a direct block also exist when
	1502	* a new block is allocated and pointed to by an entry in a block of
	1503	* indirect pointers. The undo/redo states described above are also
	1504	* used here. Because an indirect block contains many pointers that
	1505	* may have dependencies, a second copy of the entire in-memory indirect
	1506	* block is kept. The buffer cache copy is always completely up-to-date.
	1507	* The second copy, which is used only as a source for disk writes,
	1508	* contains only the safe pointers (i.e., those that have no remaining
	1509	* update dependencies). The second copy is freed when all pointers
	1510	* are safe. The cache is not allowed to replace indirect blocks with
	1511	* pending update dependencies. If a buffer containing an indirect
	1512	* block with dependencies is written, these routines will mark it
	1513	* dirty again. It can only be successfully written once all the
	1514	* dependencies are removed. The ffs_fsync routine in conjunction with
	1515	* softdep_sync_metadata work together to get all the dependencies
	1516	* removed so that a file can be successfully written to disk. Three
	1517	* procedures are used when setting up indirect block pointer
	1518	* dependencies. The division is necessary because of the organization
	1519	* of the "balloc" routine and because of the distinction between file
	1520	* pages and file metadata blocks.
	1521	*/
	1522
	1523	/*
	1524	* Allocate a new allocindir structure.
	1525	*/
	1526	static struct allocindir *
	1527	newallocindir(ip, ptrno, newblkno, oldblkno)
	1528	struct inode ip; / inode for file being extended */
	1529	int ptrno; /* offset of pointer in indirect block */
	1530	ufs_daddr_t newblkno; /* disk block number being added */
	1531	ufs_daddr_t oldblkno; /* previous block number, 0 if none */
	1532	{
	1533	struct allocindir *aip;
	1534
	1535	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
	1536	M_ALLOCINDIR, M_SOFTDEP_FLAGS);
	1537	bzero(aip, sizeof(struct allocindir));
	1538	aip->ai_list.wk_type = D_ALLOCINDIR;
	1539	aip->ai_state = ATTACHED;
	1540	aip->ai_offset = ptrno;
	1541	aip->ai_newblkno = newblkno;
	1542	aip->ai_oldblkno = oldblkno;
	1543	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
	1544	return (aip);
	1545	}
	1546
	1547	/*
	1548	* Called just before setting an indirect block pointer
	1549	* to a newly allocated file page.
	1550	*/
	1551	void
	1552	softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
	1553	struct inode ip; / inode for file being extended */
	1554	ufs_lbn_t lbn; /* allocated block number within file */
	1555	struct buf bp; / buffer with indirect blk referencing page */
	1556	int ptrno; /* offset of pointer in indirect block */
	1557	ufs_daddr_t newblkno; /* disk block number being added */
	1558	ufs_daddr_t oldblkno; /* previous block number, 0 if none */
	1559	struct buf nbp; / buffer holding allocated page */
	1560	{
	1561	struct allocindir *aip;
	1562	struct pagedep *pagedep;
	1563
	1564	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
	1565	ACQUIRE_LOCK(&lk);
	1566	/*
	1567	* If we are allocating a directory page, then we must
	1568	* allocate an associated pagedep to track additions and
	1569	* deletions.
	1570	*/
	1571	if ((ip->i_mode & IFMT) == IFDIR &&
	1572	pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
	1573	WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
	1574	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	1575	FREE_LOCK(&lk);
	1576	setup_allocindir_phase2(bp, ip, aip);
	1577	}
	1578
	1579	/*
	1580	* Called just before setting an indirect block pointer to a
	1581	* newly allocated indirect block.
	1582	*/
	1583	void
	1584	softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
	1585	struct buf nbp; / newly allocated indirect block */
	1586	struct inode ip; / inode for file being extended */
	1587	struct buf bp; / indirect block referencing allocated block */
	1588	int ptrno; /* offset of pointer in indirect block */
	1589	ufs_daddr_t newblkno; /* disk block number being added */
	1590	{
	1591	struct allocindir *aip;
	1592
	1593	aip = newallocindir(ip, ptrno, newblkno, 0);
	1594	ACQUIRE_LOCK(&lk);
	1595	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	1596	FREE_LOCK(&lk);
	1597	setup_allocindir_phase2(bp, ip, aip);
	1598	}
	1599
	1600	/*
	1601	* Called to finish the allocation of the "aip" allocated
	1602	* by one of the two routines above.
	1603	*/
	1604	static void
	1605	setup_allocindir_phase2(bp, ip, aip)
	1606	struct buf bp; / in-memory copy of the indirect block */
	1607	struct inode ip; / inode for file being extended */
	1608	struct allocindir aip; / allocindir allocated by the above routines */
	1609	{
	1610	struct worklist *wk;
	1611	struct indirdep indirdep, newindirdep;
	1612	struct bmsafemap *bmsafemap;
	1613	struct allocindir *oldaip;
	1614	struct freefrag *freefrag;
	1615	struct newblk *newblk;
	1616
	1617	if (bp->b_lblkno >= 0)
	1618	panic("setup_allocindir_phase2: not indir blk");
	1619	for (indirdep = NULL, newindirdep = NULL; ; ) {
	1620	ACQUIRE_LOCK(&lk);
	1621	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
	1622	if (wk->wk_type != D_INDIRDEP)
	1623	continue;
	1624	indirdep = WK_INDIRDEP(wk);
	1625	break;
	1626	}
	1627	if (indirdep == NULL && newindirdep) {
	1628	indirdep = newindirdep;
	1629	WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
	1630	newindirdep = NULL;
	1631	}
	1632	FREE_LOCK(&lk);
	1633	if (indirdep) {
	1634	if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
	1635	&newblk) == 0)
	1636	panic("setup_allocindir: lost block");
	1637	ACQUIRE_LOCK(&lk);
	1638	if (newblk->nb_state == DEPCOMPLETE) {
	1639	aip->ai_state \|= DEPCOMPLETE;
	1640	aip->ai_buf = NULL;
	1641	} else {
	1642	bmsafemap = newblk->nb_bmsafemap;
	1643	aip->ai_buf = bmsafemap->sm_buf;
	1644	LIST_REMOVE(newblk, nb_deps);
	1645	LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
	1646	aip, ai_deps);
	1647	}
	1648	LIST_REMOVE(newblk, nb_hash);
	1649	FREE(newblk, M_NEWBLK);
	1650	aip->ai_indirdep = indirdep;
	1651	/*
	1652	* Check to see if there is an existing dependency
	1653	* for this block. If there is, merge the old
	1654	* dependency into the new one.
	1655	*/
	1656	if (aip->ai_oldblkno == 0)
	1657	oldaip = NULL;
	1658	else
	1659
	1660	LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
	1661	if (oldaip->ai_offset == aip->ai_offset)
	1662	break;
	1663	if (oldaip != NULL) {
	1664	if (oldaip->ai_newblkno != aip->ai_oldblkno) {
	1665	FREE_LOCK(&lk);
	1666	panic("setup_allocindir_phase2: blkno");
	1667	}
	1668	aip->ai_oldblkno = oldaip->ai_oldblkno;
	1669	freefrag = oldaip->ai_freefrag;
	1670	oldaip->ai_freefrag = aip->ai_freefrag;
	1671	aip->ai_freefrag = freefrag;
	1672	free_allocindir(oldaip, NULL);
	1673	}
	1674	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
	1675	((ufs_daddr_t *)indirdep->ir_savebp->b_data)
	1676	[aip->ai_offset] = aip->ai_oldblkno;
	1677	FREE_LOCK(&lk);
	1678	}
	1679	if (newindirdep) {
	1680	if (indirdep->ir_savebp != NULL)
	1681	brelse(newindirdep->ir_savebp);
	1682	WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
	1683	}
	1684	if (indirdep)
	1685	break;
	1686	MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
	1687	M_INDIRDEP, M_SOFTDEP_FLAGS);
	1688	newindirdep->ir_list.wk_type = D_INDIRDEP;
	1689	newindirdep->ir_state = ATTACHED;
	1690	LIST_INIT(&newindirdep->ir_deplisthd);
	1691	LIST_INIT(&newindirdep->ir_donehd);
	1692	if (bp->b_blkno == bp->b_lblkno) {
	1693	VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
	1694	NULL, NULL);
	1695	}
	1696	newindirdep->ir_savebp =
	1697	getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
	1698	BUF_KERNPROC(newindirdep->ir_savebp);
	1699	bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
	1700	}
	1701	}
	1702
	1703	/*
	1704	* Block de-allocation dependencies.
	1705	*
	1706	* When blocks are de-allocated, the on-disk pointers must be nullified before
	1707	* the blocks are made available for use by other files. (The true
	1708	* requirement is that old pointers must be nullified before new on-disk
	1709	* pointers are set. We chose this slightly more stringent requirement to
	1710	* reduce complexity.) Our implementation handles this dependency by updating
	1711	* the inode (or indirect block) appropriately but delaying the actual block
	1712	* de-allocation (i.e., freemap and free space count manipulation) until
	1713	* after the updated versions reach stable storage. After the disk is
	1714	* updated, the blocks can be safely de-allocated whenever it is convenient.
	1715	* This implementation handles only the common case of reducing a file's
	1716	* length to zero. Other cases are handled by the conventional synchronous
	1717	* write approach.
	1718	*
	1719	* The ffs implementation with which we worked double-checks
	1720	* the state of the block pointers and file size as it reduces
	1721	* a file's length. Some of this code is replicated here in our
	1722	* soft updates implementation. The freeblks->fb_chkcnt field is
	1723	* used to transfer a part of this information to the procedure
	1724	* that eventually de-allocates the blocks.
	1725	*
	1726	* This routine should be called from the routine that shortens
	1727	* a file's length, before the inode's size or block pointers
	1728	* are modified. It will save the block pointer information for
	1729	* later release and zero the inode so that the calling routine
	1730	* can release it.
	1731	*/
	1732	void
	1733	softdep_setup_freeblocks(ip, length)
	1734	struct inode ip; / The inode whose length is to be reduced */
	1735	off_t length; /* The new length for the file */
	1736	{
	1737	struct freeblks *freeblks;
	1738	struct inodedep *inodedep;
	1739	struct allocdirect *adp;
	1740	struct vnode *vp;
	1741	struct buf *bp;
	1742	struct fs *fs;
	1743	int i, error, delay;
	1744
	1745	fs = ip->i_fs;
	1746	if (length != 0)
	1747	panic("softde_setup_freeblocks: non-zero length");
	1748	MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
	1749	M_FREEBLKS, M_SOFTDEP_FLAGS);
	1750	bzero(freeblks, sizeof(struct freeblks));
	1751	freeblks->fb_list.wk_type = D_FREEBLKS;
	1752	freeblks->fb_uid = ip->i_uid;
	1753	freeblks->fb_previousinum = ip->i_number;
	1754	freeblks->fb_devvp = ip->i_devvp;
	1755	freeblks->fb_fs = fs;
	1756	freeblks->fb_oldsize = ip->i_size;
	1757	freeblks->fb_newsize = length;
	1758	freeblks->fb_chkcnt = ip->i_blocks;
	1759	for (i = 0; i < NDADDR; i++) {
	1760	freeblks->fb_dblks[i] = ip->i_db[i];
	1761	ip->i_db[i] = 0;
	1762	}
	1763	for (i = 0; i < NIADDR; i++) {
	1764	freeblks->fb_iblks[i] = ip->i_ib[i];
	1765	ip->i_ib[i] = 0;
	1766	}
	1767	ip->i_blocks = 0;
	1768	ip->i_size = 0;
	1769	/*
	1770	* Push the zero'ed inode to to its disk buffer so that we are free
	1771	* to delete its dependencies below. Once the dependencies are gone
	1772	* the buffer can be safely released.
	1773	*/
	1774	if ((error = bread(ip->i_devvp,
	1775	fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
	1776	(int)fs->fs_bsize, NOCRED, &bp)) != 0)
	1777	softdep_error("softdep_setup_freeblocks", error);
	1778	((struct dinode )bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
	1779	ip->i_din;
	1780	/*
	1781	* Find and eliminate any inode dependencies.
	1782	*/
	1783	ACQUIRE_LOCK(&lk);
	1784	(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
	1785	if ((inodedep->id_state & IOSTARTED) != 0) {
	1786	FREE_LOCK(&lk);
	1787	panic("softdep_setup_freeblocks: inode busy");
	1788	}
	1789	/*
	1790	* Add the freeblks structure to the list of operations that
	1791	* must await the zero'ed inode being written to disk. If we
	1792	* still have a bitmap dependency (delay == 0), then the inode
	1793	* has never been written to disk, so we can process the
	1794	* freeblks below once we have deleted the dependencies.
	1795	*/
	1796	delay = (inodedep->id_state & DEPCOMPLETE);
	1797	if (delay)
	1798	WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
	1799	/*
	1800	* Because the file length has been truncated to zero, any
	1801	* pending block allocation dependency structures associated
	1802	* with this inode are obsolete and can simply be de-allocated.
	1803	* We must first merge the two dependency lists to get rid of
	1804	* any duplicate freefrag structures, then purge the merged list.
	1805	*/
	1806	merge_inode_lists(inodedep);
	1807	while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
	1808	free_allocdirect(&inodedep->id_inoupdt, adp, 1);
	1809	FREE_LOCK(&lk);
	1810	bdwrite(bp);
	1811	/*
	1812	* We must wait for any I/O in progress to finish so that
	1813	* all potential buffers on the dirty list will be visible.
	1814	* Once they are all there, walk the list and get rid of
	1815	* any dependencies.
	1816	*/
	1817	vp = ITOV(ip);
	1818	ACQUIRE_LOCK(&lk);
	1819	drain_output(vp, 1);
	1820	while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) {
	1821	bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
	1822	(void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
	1823	deallocate_dependencies(bp, inodedep);
	1824	bp->b_flags \|= B_INVAL \| B_NOCACHE;
	1825	FREE_LOCK(&lk);
	1826	brelse(bp);
	1827	ACQUIRE_LOCK(&lk);
	1828	}
	1829	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
	1830	(void)free_inodedep(inodedep);
	1831	FREE_LOCK(&lk);
	1832	/*
	1833	* If the inode has never been written to disk (delay == 0),
	1834	* then we can process the freeblks now that we have deleted
	1835	* the dependencies.
	1836	*/
	1837	if (!delay)
	1838	handle_workitem_freeblocks(freeblks);
	1839	}
	1840
	1841	/*
	1842	* Reclaim any dependency structures from a buffer that is about to
	1843	* be reallocated to a new vnode. The buffer must be locked, thus,
	1844	* no I/O completion operations can occur while we are manipulating
	1845	* its associated dependencies. The mutex is held so that other I/O's
	1846	* associated with related dependencies do not occur.
	1847	*/
	1848	static void
	1849	deallocate_dependencies(bp, inodedep)
	1850	struct buf *bp;
	1851	struct inodedep *inodedep;
	1852	{
	1853	struct worklist *wk;
	1854	struct indirdep *indirdep;
	1855	struct allocindir *aip;
	1856	struct pagedep *pagedep;
	1857	struct dirrem *dirrem;
	1858	struct diradd *dap;
	1859	int i;
	1860
	1861	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
	1862	switch (wk->wk_type) {
	1863
	1864	case D_INDIRDEP:
	1865	indirdep = WK_INDIRDEP(wk);
	1866	/*
	1867	* None of the indirect pointers will ever be visible,
	1868	* so they can simply be tossed. GOINGAWAY ensures
	1869	* that allocated pointers will be saved in the buffer
	1870	* cache until they are freed. Note that they will
	1871	* only be able to be found by their physical address
	1872	* since the inode mapping the logical address will
	1873	* be gone. The save buffer used for the safe copy
	1874	* was allocated in setup_allocindir_phase2 using
	1875	* the physical address so it could be used for this
	1876	* purpose. Hence we swap the safe copy with the real
	1877	* copy, allowing the safe copy to be freed and holding
	1878	* on to the real copy for later use in indir_trunc.
	1879	*/
	1880	if (indirdep->ir_state & GOINGAWAY) {
	1881	FREE_LOCK(&lk);
	1882	panic("deallocate_dependencies: already gone");
	1883	}
	1884	indirdep->ir_state \|= GOINGAWAY;
	1885	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
	1886	free_allocindir(aip, inodedep);
	1887	if (bp->b_lblkno >= 0 \|\|
	1888	bp->b_blkno != indirdep->ir_savebp->b_lblkno) {
	1889	FREE_LOCK(&lk);
	1890	panic("deallocate_dependencies: not indir");
	1891	}
	1892	bcopy(bp->b_data, indirdep->ir_savebp->b_data,
	1893	bp->b_bcount);
	1894	WORKLIST_REMOVE(wk);
	1895	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
	1896	continue;
	1897
	1898	case D_PAGEDEP:
	1899	pagedep = WK_PAGEDEP(wk);
	1900	/*
	1901	* None of the directory additions will ever be
	1902	* visible, so they can simply be tossed.
	1903	*/
	1904	for (i = 0; i < DAHASHSZ; i++)
	1905	while ((dap =
	1906	LIST_FIRST(&pagedep->pd_diraddhd[i])))
	1907	free_diradd(dap);
	1908	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
	1909	free_diradd(dap);
	1910	/*
	1911	* Copy any directory remove dependencies to the list
	1912	* to be processed after the zero'ed inode is written.
	1913	* If the inode has already been written, then they
	1914	* can be dumped directly onto the work list.
	1915	*/
	1916	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
	1917	LIST_REMOVE(dirrem, dm_next);
	1918	dirrem->dm_dirinum = pagedep->pd_ino;
	1919	if (inodedep == NULL \|\|
	1920	(inodedep->id_state & ALLCOMPLETE) ==
	1921	ALLCOMPLETE)
	1922	add_to_worklist(&dirrem->dm_list);
	1923	else
	1924	WORKLIST_INSERT(&inodedep->id_bufwait,
	1925	&dirrem->dm_list);
	1926	}
	1927	WORKLIST_REMOVE(&pagedep->pd_list);
	1928	LIST_REMOVE(pagedep, pd_hash);
	1929	WORKITEM_FREE(pagedep, D_PAGEDEP);
	1930	continue;
	1931
	1932	case D_ALLOCINDIR:
	1933	free_allocindir(WK_ALLOCINDIR(wk), inodedep);
	1934	continue;
	1935
	1936	case D_ALLOCDIRECT:
	1937	case D_INODEDEP:
	1938	FREE_LOCK(&lk);
	1939	panic("deallocate_dependencies: Unexpected type %s",
	1940	TYPENAME(wk->wk_type));
	1941	/* NOTREACHED */
	1942
	1943	default:
	1944	FREE_LOCK(&lk);
	1945	panic("deallocate_dependencies: Unknown type %s",
	1946	TYPENAME(wk->wk_type));
	1947	/* NOTREACHED */
	1948	}
	1949	}
	1950	}
	1951
	1952	/*
	1953	* Free an allocdirect. Generate a new freefrag work request if appropriate.
	1954	* This routine must be called with splbio interrupts blocked.
	1955	*/
	1956	static void
	1957	free_allocdirect(adphead, adp, delay)
	1958	struct allocdirectlst *adphead;
	1959	struct allocdirect *adp;
	1960	int delay;
	1961	{
	1962
	1963	#ifdef DEBUG
	1964	if (lk.lkt_held == -1)
	1965	panic("free_allocdirect: lock not held");
	1966	#endif
	1967	if ((adp->ad_state & DEPCOMPLETE) == 0)
	1968	LIST_REMOVE(adp, ad_deps);
	1969	TAILQ_REMOVE(adphead, adp, ad_next);
	1970	if ((adp->ad_state & COMPLETE) == 0)
	1971	WORKLIST_REMOVE(&adp->ad_list);
	1972	if (adp->ad_freefrag != NULL) {
	1973	if (delay)
	1974	WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
	1975	&adp->ad_freefrag->ff_list);
	1976	else
	1977	add_to_worklist(&adp->ad_freefrag->ff_list);
	1978	}
	1979	WORKITEM_FREE(adp, D_ALLOCDIRECT);
	1980	}
	1981
	1982	/*
	1983	* Prepare an inode to be freed. The actual free operation is not
	1984	* done until the zero'ed inode has been written to disk.
	1985	*/
	1986	void
	1987	softdep_freefile(pvp, ino, mode)
	1988	struct vnode *pvp;
	1989	ino_t ino;
	1990	int mode;
	1991	{
	1992	struct inode *ip = VTOI(pvp);
	1993	struct inodedep *inodedep;
	1994	struct freefile *freefile;
	1995
	1996	/*
	1997	* This sets up the inode de-allocation dependency.
	1998	*/
	1999	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
	2000	M_FREEFILE, M_SOFTDEP_FLAGS);
	2001	freefile->fx_list.wk_type = D_FREEFILE;
	2002	freefile->fx_list.wk_state = 0;
	2003	freefile->fx_mode = mode;
	2004	freefile->fx_oldinum = ino;
	2005	freefile->fx_devvp = ip->i_devvp;
	2006	freefile->fx_fs = ip->i_fs;
	2007
	2008	/*
	2009	* If the inodedep does not exist, then the zero'ed inode has
	2010	* been written to disk. If the allocated inode has never been
	2011	* written to disk, then the on-disk inode is zero'ed. In either
	2012	* case we can free the file immediately.
	2013	*/
	2014	ACQUIRE_LOCK(&lk);
	2015	if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 \|\|
	2016	check_inode_unwritten(inodedep)) {
	2017	FREE_LOCK(&lk);
	2018	handle_workitem_freefile(freefile);
	2019	return;
	2020	}
	2021	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
	2022	FREE_LOCK(&lk);
	2023	}
	2024
	2025	/*
	2026	* Check to see if an inode has never been written to disk. If
	2027	* so free the inodedep and return success, otherwise return failure.
	2028	* This routine must be called with splbio interrupts blocked.
	2029	*
	2030	* If we still have a bitmap dependency, then the inode has never
	2031	* been written to disk. Drop the dependency as it is no longer
	2032	* necessary since the inode is being deallocated. We set the
	2033	* ALLCOMPLETE flags since the bitmap now properly shows that the
	2034	* inode is not allocated. Even if the inode is actively being
	2035	* written, it has been rolled back to its zero'ed state, so we
	2036	* are ensured that a zero inode is what is on the disk. For short
	2037	* lived files, this change will usually result in removing all the
	2038	* dependencies from the inode so that it can be freed immediately.
	2039	*/
	2040	static int
	2041	check_inode_unwritten(inodedep)
	2042	struct inodedep *inodedep;
	2043	{
	2044
	2045	if ((inodedep->id_state & DEPCOMPLETE) != 0 \|\|
	2046	LIST_FIRST(&inodedep->id_pendinghd) != NULL \|\|
	2047	LIST_FIRST(&inodedep->id_bufwait) != NULL \|\|
	2048	LIST_FIRST(&inodedep->id_inowait) != NULL \|\|
	2049	TAILQ_FIRST(&inodedep->id_inoupdt) != NULL \|\|
	2050	TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL \|\|
	2051	inodedep->id_nlinkdelta != 0)
	2052	return (0);
	2053	inodedep->id_state \|= ALLCOMPLETE;
	2054	LIST_REMOVE(inodedep, id_deps);
	2055	inodedep->id_buf = NULL;
	2056	if (inodedep->id_state & ONWORKLIST)
	2057	WORKLIST_REMOVE(&inodedep->id_list);
	2058	if (inodedep->id_savedino != NULL) {
	2059	FREE(inodedep->id_savedino, M_INODEDEP);
	2060	inodedep->id_savedino = NULL;
	2061	}
	2062	if (free_inodedep(inodedep) == 0) {
	2063	FREE_LOCK(&lk);
	2064	panic("check_inode_unwritten: busy inode");
	2065	}
	2066	return (1);
	2067	}
	2068
	2069	/*
	2070	* Try to free an inodedep structure. Return 1 if it could be freed.
	2071	*/
	2072	static int
	2073	free_inodedep(inodedep)
	2074	struct inodedep *inodedep;
	2075	{
	2076
	2077	if ((inodedep->id_state & ONWORKLIST) != 0 \|\|
	2078	(inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE \|\|
	2079	LIST_FIRST(&inodedep->id_pendinghd) != NULL \|\|
	2080	LIST_FIRST(&inodedep->id_bufwait) != NULL \|\|
	2081	LIST_FIRST(&inodedep->id_inowait) != NULL \|\|
	2082	TAILQ_FIRST(&inodedep->id_inoupdt) != NULL \|\|
	2083	TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL \|\|
	2084	inodedep->id_nlinkdelta != 0 \|\| inodedep->id_savedino != NULL)
	2085	return (0);
	2086	LIST_REMOVE(inodedep, id_hash);
	2087	WORKITEM_FREE(inodedep, D_INODEDEP);
	2088	num_inodedep -= 1;
	2089	return (1);
	2090	}
	2091
	2092	/*
	2093	* This workitem routine performs the block de-allocation.
	2094	* The workitem is added to the pending list after the updated
	2095	* inode block has been written to disk. As mentioned above,
	2096	* checks regarding the number of blocks de-allocated (compared
	2097	* to the number of blocks allocated for the file) are also
	2098	* performed in this function.
	2099	*/
	2100	static void
	2101	handle_workitem_freeblocks(freeblks)
	2102	struct freeblks *freeblks;
	2103	{
	2104	struct inode tip;
	2105	ufs_daddr_t bn;
	2106	struct fs *fs;
	2107	int i, level, bsize;
	2108	long nblocks, blocksreleased = 0;
	2109	int error, allerror = 0;
	2110	ufs_lbn_t baselbns[NIADDR], tmpval;
	2111
	2112	tip.i_number = freeblks->fb_previousinum;
	2113	tip.i_devvp = freeblks->fb_devvp;
	2114	tip.i_dev = freeblks->fb_devvp->v_rdev;
	2115	tip.i_fs = freeblks->fb_fs;
	2116	tip.i_size = freeblks->fb_oldsize;
	2117	tip.i_uid = freeblks->fb_uid;
	2118	fs = freeblks->fb_fs;
	2119	tmpval = 1;
	2120	baselbns[0] = NDADDR;
	2121	for (i = 1; i < NIADDR; i++) {
	2122	tmpval *= NINDIR(fs);
	2123	baselbns[i] = baselbns[i - 1] + tmpval;
	2124	}
	2125	nblocks = btodb(fs->fs_bsize);
	2126	blocksreleased = 0;
	2127	/*
	2128	* Indirect blocks first.
	2129	*/
	2130	for (level = (NIADDR - 1); level >= 0; level--) {
	2131	if ((bn = freeblks->fb_iblks[level]) == 0)
	2132	continue;
	2133	if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
	2134	baselbns[level], &blocksreleased)) == 0)
	2135	allerror = error;
	2136	ffs_blkfree(&tip, bn, fs->fs_bsize);
	2137	blocksreleased += nblocks;
	2138	}
	2139	/*
	2140	* All direct blocks or frags.
	2141	*/
	2142	for (i = (NDADDR - 1); i >= 0; i--) {
	2143	if ((bn = freeblks->fb_dblks[i]) == 0)
	2144	continue;
	2145	bsize = blksize(fs, &tip, i);
	2146	ffs_blkfree(&tip, bn, bsize);
	2147	blocksreleased += btodb(bsize);
	2148	}
	2149
	2150	#ifdef DIAGNOSTIC
	2151	if (freeblks->fb_chkcnt != blocksreleased)
	2152	printf("handle_workitem_freeblocks: block count\n");
	2153	if (allerror)
	2154	softdep_error("handle_workitem_freeblks", allerror);
	2155	#endif /* DIAGNOSTIC */
	2156	WORKITEM_FREE(freeblks, D_FREEBLKS);
	2157	}
	2158
	2159	/*
	2160	* Release blocks associated with the inode ip and stored in the indirect
	2161	* block dbn. If level is greater than SINGLE, the block is an indirect block
	2162	* and recursive calls to indirtrunc must be used to cleanse other indirect
	2163	* blocks.
	2164	*/
	2165	static int
	2166	indir_trunc(ip, dbn, level, lbn, countp)
	2167	struct inode *ip;
	2168	ufs_daddr_t dbn;
	2169	int level;
	2170	ufs_lbn_t lbn;
	2171	long *countp;
	2172	{
	2173	struct buf *bp;
	2174	ufs_daddr_t *bap;
	2175	ufs_daddr_t nb;
	2176	struct fs *fs;
	2177	struct worklist *wk;
	2178	struct indirdep *indirdep;
	2179	int i, lbnadd, nblocks;
	2180	int error, allerror = 0;
	2181
	2182	fs = ip->i_fs;
	2183	lbnadd = 1;
	2184	for (i = level; i > 0; i--)
	2185	lbnadd *= NINDIR(fs);
	2186	/*
	2187	* Get buffer of block pointers to be freed. This routine is not
	2188	* called until the zero'ed inode has been written, so it is safe
	2189	* to free blocks as they are encountered. Because the inode has
	2190	* been zero'ed, calls to bmap on these blocks will fail. So, we
	2191	* have to use the on-disk address and the block device for the
	2192	* filesystem to look them up. If the file was deleted before its
	2193	* indirect blocks were all written to disk, the routine that set
	2194	* us up (deallocate_dependencies) will have arranged to leave
	2195	* a complete copy of the indirect block in memory for our use.
	2196	* Otherwise we have to read the blocks in from the disk.
	2197	*/
	2198	ACQUIRE_LOCK(&lk);
	2199	if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
	2200	(wk = LIST_FIRST(&bp->b_dep)) != NULL) {
	2201	if (wk->wk_type != D_INDIRDEP \|\|
	2202	(indirdep = WK_INDIRDEP(wk))->ir_savebp != bp \|\|
	2203	(indirdep->ir_state & GOINGAWAY) == 0) {
	2204	FREE_LOCK(&lk);
	2205	panic("indir_trunc: lost indirdep");
	2206	}
	2207	WORKLIST_REMOVE(wk);
	2208	WORKITEM_FREE(indirdep, D_INDIRDEP);
	2209	if (LIST_FIRST(&bp->b_dep) != NULL) {
	2210	FREE_LOCK(&lk);
	2211	panic("indir_trunc: dangling dep");
	2212	}
	2213	FREE_LOCK(&lk);
	2214	} else {
	2215	FREE_LOCK(&lk);
	2216	error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
	2217	if (error)
	2218	return (error);
	2219	}
	2220	/*
	2221	* Recursively free indirect blocks.
	2222	*/
	2223	bap = (ufs_daddr_t *)bp->b_data;
	2224	nblocks = btodb(fs->fs_bsize);
	2225	for (i = NINDIR(fs) - 1; i >= 0; i--) {
	2226	if ((nb = bap[i]) == 0)
	2227	continue;
	2228	if (level != 0) {
	2229	if ((error = indir_trunc(ip, fsbtodb(fs, nb),
	2230	level - 1, lbn + (i * lbnadd), countp)) != 0)
	2231	allerror = error;
	2232	}
	2233	ffs_blkfree(ip, nb, fs->fs_bsize);
	2234	*countp += nblocks;
	2235	}
	2236	bp->b_flags \|= B_INVAL \| B_NOCACHE;
	2237	brelse(bp);
	2238	return (allerror);
	2239	}
	2240
	2241	/*
	2242	* Free an allocindir.
	2243	* This routine must be called with splbio interrupts blocked.
	2244	*/
	2245	static void
	2246	free_allocindir(aip, inodedep)
	2247	struct allocindir *aip;
	2248	struct inodedep *inodedep;
	2249	{
	2250	struct freefrag *freefrag;
	2251
	2252	#ifdef DEBUG
	2253	if (lk.lkt_held == -1)
	2254	panic("free_allocindir: lock not held");
	2255	#endif
	2256	if ((aip->ai_state & DEPCOMPLETE) == 0)
	2257	LIST_REMOVE(aip, ai_deps);
	2258	if (aip->ai_state & ONWORKLIST)
	2259	WORKLIST_REMOVE(&aip->ai_list);
	2260	LIST_REMOVE(aip, ai_next);
	2261	if ((freefrag = aip->ai_freefrag) != NULL) {
	2262	if (inodedep == NULL)
	2263	add_to_worklist(&freefrag->ff_list);
	2264	else
	2265	WORKLIST_INSERT(&inodedep->id_bufwait,
	2266	&freefrag->ff_list);
	2267	}
	2268	WORKITEM_FREE(aip, D_ALLOCINDIR);
	2269	}
	2270
	2271	/*
	2272	* Directory entry addition dependencies.
	2273	*
	2274	* When adding a new directory entry, the inode (with its incremented link
	2275	* count) must be written to disk before the directory entry's pointer to it.
	2276	* Also, if the inode is newly allocated, the corresponding freemap must be
	2277	* updated (on disk) before the directory entry's pointer. These requirements
	2278	* are met via undo/redo on the directory entry's pointer, which consists
	2279	* simply of the inode number.
	2280	*
	2281	* As directory entries are added and deleted, the free space within a
	2282	* directory block can become fragmented. The ufs file system will compact
	2283	* a fragmented directory block to make space for a new entry. When this
	2284	* occurs, the offsets of previously added entries change. Any "diradd"
	2285	* dependency structures corresponding to these entries must be updated with
	2286	* the new offsets.
	2287	*/
	2288
	2289	/*
	2290	* This routine is called after the in-memory inode's link
	2291	* count has been incremented, but before the directory entry's
	2292	* pointer to the inode has been set.
	2293	*/
	2294	void
	2295	softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
	2296	struct buf bp; / buffer containing directory block */
	2297	struct inode dp; / inode for directory */
	2298	off_t diroffset; /* offset of new entry in directory */
	2299	long newinum; /* inode referenced by new directory entry */
	2300	struct buf newdirbp; / non-NULL => contents of new mkdir */
	2301	{
	2302	int offset; /* offset of new entry within directory block */
	2303	ufs_lbn_t lbn; /* block in directory containing new entry */
	2304	struct fs *fs;
	2305	struct diradd *dap;
	2306	struct pagedep *pagedep;
	2307	struct inodedep *inodedep;
	2308	struct mkdir mkdir1, mkdir2;
	2309
	2310	/*
	2311	* Whiteouts have no dependencies.
	2312	*/
	2313	if (newinum == WINO) {
	2314	if (newdirbp != NULL)
	2315	bdwrite(newdirbp);
	2316	return;
	2317	}
	2318
	2319	fs = dp->i_fs;
	2320	lbn = lblkno(fs, diroffset);
	2321	offset = blkoff(fs, diroffset);
	2322	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
	2323	M_SOFTDEP_FLAGS);
	2324	bzero(dap, sizeof(struct diradd));
	2325	dap->da_list.wk_type = D_DIRADD;
	2326	dap->da_offset = offset;
	2327	dap->da_newinum = newinum;
	2328	dap->da_state = ATTACHED;
	2329	if (newdirbp == NULL) {
	2330	dap->da_state \|= DEPCOMPLETE;
	2331	ACQUIRE_LOCK(&lk);
	2332	} else {
	2333	dap->da_state \|= MKDIR_BODY \| MKDIR_PARENT;
	2334	MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
	2335	M_SOFTDEP_FLAGS);
	2336	mkdir1->md_list.wk_type = D_MKDIR;
	2337	mkdir1->md_state = MKDIR_BODY;
	2338	mkdir1->md_diradd = dap;
	2339	MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
	2340	M_SOFTDEP_FLAGS);
	2341	mkdir2->md_list.wk_type = D_MKDIR;
	2342	mkdir2->md_state = MKDIR_PARENT;
	2343	mkdir2->md_diradd = dap;
	2344	/*
	2345	* Dependency on "." and ".." being written to disk.
	2346	*/
	2347	mkdir1->md_buf = newdirbp;
	2348	ACQUIRE_LOCK(&lk);
	2349	LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
	2350	WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
	2351	FREE_LOCK(&lk);
	2352	bdwrite(newdirbp);
	2353	/*
	2354	* Dependency on link count increase for parent directory
	2355	*/
	2356	ACQUIRE_LOCK(&lk);
	2357	if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
	2358	\|\| (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
	2359	dap->da_state &= ~MKDIR_PARENT;
	2360	WORKITEM_FREE(mkdir2, D_MKDIR);
	2361	} else {
	2362	LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
	2363	WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
	2364	}
	2365	}
	2366	/*
	2367	* Link into parent directory pagedep to await its being written.
	2368	*/
	2369	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
	2370	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	2371	dap->da_pagedep = pagedep;
	2372	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
	2373	da_pdlist);
	2374	/*
	2375	* Link into its inodedep. Put it on the id_bufwait list if the inode
	2376	* is not yet written. If it is written, do the post-inode write
	2377	* processing to put it on the id_pendinghd list.
	2378	*/
	2379	(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
	2380	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
	2381	diradd_inode_written(dap, inodedep);
	2382	else
	2383	WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	2384	FREE_LOCK(&lk);
	2385	}
	2386
	2387	/*
	2388	* This procedure is called to change the offset of a directory
	2389	* entry when compacting a directory block which must be owned
	2390	* exclusively by the caller. Note that the actual entry movement
	2391	* must be done in this procedure to ensure that no I/O completions
	2392	* occur while the move is in progress.
	2393	*/
	2394	void
	2395	softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
	2396	struct inode dp; / inode for directory */
	2397	caddr_t base; /* address of dp->i_offset */
	2398	caddr_t oldloc; /* address of old directory location */
	2399	caddr_t newloc; /* address of new directory location */
	2400	int entrysize; /* size of directory entry */
	2401	{
	2402	int offset, oldoffset, newoffset;
	2403	struct pagedep *pagedep;
	2404	struct diradd *dap;
	2405	ufs_lbn_t lbn;
	2406
	2407	ACQUIRE_LOCK(&lk);
	2408	lbn = lblkno(dp->i_fs, dp->i_offset);
	2409	offset = blkoff(dp->i_fs, dp->i_offset);
	2410	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
	2411	goto done;
	2412	oldoffset = offset + (oldloc - base);
	2413	newoffset = offset + (newloc - base);
	2414
	2415	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
	2416	if (dap->da_offset != oldoffset)
	2417	continue;
	2418	dap->da_offset = newoffset;
	2419	if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
	2420	break;
	2421	LIST_REMOVE(dap, da_pdlist);
	2422	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
	2423	dap, da_pdlist);
	2424	break;
	2425	}
	2426	if (dap == NULL) {
	2427
	2428	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
	2429	if (dap->da_offset == oldoffset) {
	2430	dap->da_offset = newoffset;
	2431	break;
	2432	}
	2433	}
	2434	}
	2435	done:
	2436	bcopy(oldloc, newloc, entrysize);
	2437	FREE_LOCK(&lk);
	2438	}
	2439
	2440	/*
	2441	* Free a diradd dependency structure. This routine must be called
	2442	* with splbio interrupts blocked.
	2443	*/
	2444	static void
	2445	free_diradd(dap)
	2446	struct diradd *dap;
	2447	{
	2448	struct dirrem *dirrem;
	2449	struct pagedep *pagedep;
	2450	struct inodedep *inodedep;
	2451	struct mkdir mkdir, nextmd;
	2452
	2453	#ifdef DEBUG
	2454	if (lk.lkt_held == -1)
	2455	panic("free_diradd: lock not held");
	2456	#endif
	2457	WORKLIST_REMOVE(&dap->da_list);
	2458	LIST_REMOVE(dap, da_pdlist);
	2459	if ((dap->da_state & DIRCHG) == 0) {
	2460	pagedep = dap->da_pagedep;
	2461	} else {
	2462	dirrem = dap->da_previous;
	2463	pagedep = dirrem->dm_pagedep;
	2464	dirrem->dm_dirinum = pagedep->pd_ino;
	2465	add_to_worklist(&dirrem->dm_list);
	2466	}
	2467	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
	2468	0, &inodedep) != 0)
	2469	(void) free_inodedep(inodedep);
	2470	if ((dap->da_state & (MKDIR_PARENT \| MKDIR_BODY)) != 0) {
	2471	for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
	2472	nextmd = LIST_NEXT(mkdir, md_mkdirs);
	2473	if (mkdir->md_diradd != dap)
	2474	continue;
	2475	dap->da_state &= ~mkdir->md_state;
	2476	WORKLIST_REMOVE(&mkdir->md_list);
	2477	LIST_REMOVE(mkdir, md_mkdirs);
	2478	WORKITEM_FREE(mkdir, D_MKDIR);
	2479	}
	2480	if ((dap->da_state & (MKDIR_PARENT \| MKDIR_BODY)) != 0) {
	2481	FREE_LOCK(&lk);
	2482	panic("free_diradd: unfound ref");
	2483	}
	2484	}
	2485	WORKITEM_FREE(dap, D_DIRADD);
	2486	}
	2487
	2488	/*
	2489	* Directory entry removal dependencies.
	2490	*
	2491	* When removing a directory entry, the entry's inode pointer must be
	2492	* zero'ed on disk before the corresponding inode's link count is decremented
	2493	* (possibly freeing the inode for re-use). This dependency is handled by
	2494	* updating the directory entry but delaying the inode count reduction until
	2495	* after the directory block has been written to disk. After this point, the
	2496	* inode count can be decremented whenever it is convenient.
	2497	*/
	2498
	2499	/*
	2500	* This routine should be called immediately after removing
	2501	* a directory entry. The inode's link count should not be
	2502	* decremented by the calling procedure -- the soft updates
	2503	* code will do this task when it is safe.
	2504	*/
	2505	void
	2506	softdep_setup_remove(bp, dp, ip, isrmdir)
	2507	struct buf bp; / buffer containing directory block */
	2508	struct inode dp; / inode for the directory being modified */
	2509	struct inode ip; / inode for directory entry being removed */
	2510	int isrmdir; /* indicates if doing RMDIR */
	2511	{
	2512	struct dirrem dirrem, prevdirrem;
	2513
	2514	/*
	2515	* Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
	2516	*/
	2517	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
	2518
	2519	/*
	2520	* If the COMPLETE flag is clear, then there were no active
	2521	* entries and we want to roll back to a zeroed entry until
	2522	* the new inode is committed to disk. If the COMPLETE flag is
	2523	* set then we have deleted an entry that never made it to
	2524	* disk. If the entry we deleted resulted from a name change,
	2525	* then the old name still resides on disk. We cannot delete
	2526	* its inode (returned to us in prevdirrem) until the zeroed
	2527	* directory entry gets to disk. The new inode has never been
	2528	* referenced on the disk, so can be deleted immediately.
	2529	*/
	2530	if ((dirrem->dm_state & COMPLETE) == 0) {
	2531	LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
	2532	dm_next);
	2533	FREE_LOCK(&lk);
	2534	} else {
	2535	if (prevdirrem != NULL)
	2536	LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
	2537	prevdirrem, dm_next);
	2538	dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
	2539	FREE_LOCK(&lk);
	2540	handle_workitem_remove(dirrem);
	2541	}
	2542	}
	2543
	2544	/*
	2545	* Allocate a new dirrem if appropriate and return it along with
	2546	* its associated pagedep. Called without a lock, returns with lock.
	2547	*/
	2548	static long num_dirrem; /* number of dirrem allocated */
	2549	static struct dirrem *
	2550	newdirrem(bp, dp, ip, isrmdir, prevdirremp)
	2551	struct buf bp; / buffer containing directory block */
	2552	struct inode dp; / inode for the directory being modified */
	2553	struct inode ip; / inode for directory entry being removed */
	2554	int isrmdir; /* indicates if doing RMDIR */
	2555	struct dirrem *prevdirremp; / previously referenced inode, if any */
	2556	{
	2557	int offset;
	2558	ufs_lbn_t lbn;
	2559	struct diradd *dap;
	2560	struct dirrem *dirrem;
	2561	struct pagedep *pagedep;
	2562
	2563	/*
	2564	* Whiteouts have no deletion dependencies.
	2565	*/
	2566	if (ip == NULL)
	2567	panic("newdirrem: whiteout");
	2568	/*
	2569	* If we are over our limit, try to improve the situation.
	2570	* Limiting the number of dirrem structures will also limit
	2571	* the number of freefile and freeblks structures.
	2572	*/
	2573	if (num_dirrem > max_softdeps / 2 && speedup_syncer() == 0)
	2574	(void) request_cleanup(FLUSH_REMOVE, 0);
	2575	num_dirrem += 1;
	2576	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
	2577	M_DIRREM, M_SOFTDEP_FLAGS);
	2578	bzero(dirrem, sizeof(struct dirrem));
	2579	dirrem->dm_list.wk_type = D_DIRREM;
	2580	dirrem->dm_state = isrmdir ? RMDIR : 0;
	2581	dirrem->dm_mnt = ITOV(ip)->v_mount;
	2582	dirrem->dm_oldinum = ip->i_number;
	2583	*prevdirremp = NULL;
	2584
	2585	ACQUIRE_LOCK(&lk);
	2586	lbn = lblkno(dp->i_fs, dp->i_offset);
	2587	offset = blkoff(dp->i_fs, dp->i_offset);
	2588	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
	2589	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	2590	dirrem->dm_pagedep = pagedep;
	2591	/*
	2592	* Check for a diradd dependency for the same directory entry.
	2593	* If present, then both dependencies become obsolete and can
	2594	* be de-allocated. Check for an entry on both the pd_dirraddhd
	2595	* list and the pd_pendinghd list.
	2596	*/
	2597
	2598	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
	2599	if (dap->da_offset == offset)
	2600	break;
	2601	if (dap == NULL) {
	2602
	2603	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
	2604	if (dap->da_offset == offset)
	2605	break;
	2606	if (dap == NULL)
	2607	return (dirrem);
	2608	}
	2609	/*
	2610	* Must be ATTACHED at this point.
	2611	*/
	2612	if ((dap->da_state & ATTACHED) == 0) {
	2613	FREE_LOCK(&lk);
	2614	panic("newdirrem: not ATTACHED");
	2615	}
	2616	if (dap->da_newinum != ip->i_number) {
	2617	FREE_LOCK(&lk);
	2618	panic("newdirrem: inum %d should be %d",
	2619	ip->i_number, dap->da_newinum);
	2620	}
	2621	/*
	2622	* If we are deleting a changed name that never made it to disk,
	2623	* then return the dirrem describing the previous inode (which
	2624	* represents the inode currently referenced from this entry on disk).
	2625	*/
	2626	if ((dap->da_state & DIRCHG) != 0) {
	2627	*prevdirremp = dap->da_previous;
	2628	dap->da_state &= ~DIRCHG;
	2629	dap->da_pagedep = pagedep;
	2630	}
	2631	/*
	2632	* We are deleting an entry that never made it to disk.
	2633	* Mark it COMPLETE so we can delete its inode immediately.
	2634	*/
	2635	dirrem->dm_state \|= COMPLETE;
	2636	free_diradd(dap);
	2637	return (dirrem);
	2638	}
	2639
	2640	/*
	2641	* Directory entry change dependencies.
	2642	*
	2643	* Changing an existing directory entry requires that an add operation
	2644	* be completed first followed by a deletion. The semantics for the addition
	2645	* are identical to the description of adding a new entry above except
	2646	* that the rollback is to the old inode number rather than zero. Once
	2647	* the addition dependency is completed, the removal is done as described
	2648	* in the removal routine above.
	2649	*/
	2650
	2651	/*
	2652	* This routine should be called immediately after changing
	2653	* a directory entry. The inode's link count should not be
	2654	* decremented by the calling procedure -- the soft updates
	2655	* code will perform this task when it is safe.
	2656	*/
	2657	void
	2658	softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
	2659	struct buf bp; / buffer containing directory block */
	2660	struct inode dp; / inode for the directory being modified */
	2661	struct inode ip; / inode for directory entry being removed */
	2662	long newinum; /* new inode number for changed entry */
	2663	int isrmdir; /* indicates if doing RMDIR */
	2664	{
	2665	int offset;
	2666	struct diradd *dap = NULL;
	2667	struct dirrem dirrem, prevdirrem;
	2668	struct pagedep *pagedep;
	2669	struct inodedep *inodedep;
	2670
	2671	offset = blkoff(dp->i_fs, dp->i_offset);
	2672
	2673	/*
	2674	* Whiteouts do not need diradd dependencies.
	2675	*/
	2676	if (newinum != WINO) {
	2677	MALLOC(dap, struct diradd *, sizeof(struct diradd),
	2678	M_DIRADD, M_SOFTDEP_FLAGS);
	2679	bzero(dap, sizeof(struct diradd));
	2680	dap->da_list.wk_type = D_DIRADD;
	2681	dap->da_state = DIRCHG \| ATTACHED \| DEPCOMPLETE;
	2682	dap->da_offset = offset;
	2683	dap->da_newinum = newinum;
	2684	}
	2685
	2686	/*
	2687	* Allocate a new dirrem and ACQUIRE_LOCK.
	2688	*/
	2689	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
	2690	pagedep = dirrem->dm_pagedep;
	2691	/*
	2692	* The possible values for isrmdir:
	2693	* 0 - non-directory file rename
	2694	* 1 - directory rename within same directory
	2695	* inum - directory rename to new directory of given inode number
	2696	* When renaming to a new directory, we are both deleting and
	2697	* creating a new directory entry, so the link count on the new
	2698	* directory should not change. Thus we do not need the followup
	2699	* dirrem which is usually done in handle_workitem_remove. We set
	2700	* the DIRCHG flag to tell handle_workitem_remove to skip the
	2701	* followup dirrem.
	2702	*/
	2703	if (isrmdir > 1)
	2704	dirrem->dm_state \|= DIRCHG;
	2705
	2706	/*
	2707	* Whiteouts have no additional dependencies,
	2708	* so just put the dirrem on the correct list.
	2709	*/
	2710	if (newinum == WINO) {
	2711	if ((dirrem->dm_state & COMPLETE) == 0) {
	2712	LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
	2713	dm_next);
	2714	} else {
	2715	dirrem->dm_dirinum = pagedep->pd_ino;
	2716	add_to_worklist(&dirrem->dm_list);
	2717	}
	2718	FREE_LOCK(&lk);
	2719	return;
	2720	}
	2721
	2722	/*
	2723	* If the COMPLETE flag is clear, then there were no active
	2724	* entries and we want to roll back to the previous inode until
	2725	* the new inode is committed to disk. If the COMPLETE flag is
	2726	* set, then we have deleted an entry that never made it to disk.
	2727	* If the entry we deleted resulted from a name change, then the old
	2728	* inode reference still resides on disk. Any rollback that we do
	2729	* needs to be to that old inode (returned to us in prevdirrem). If
	2730	* the entry we deleted resulted from a create, then there is
	2731	* no entry on the disk, so we want to roll back to zero rather
	2732	* than the uncommitted inode. In either of the COMPLETE cases we
	2733	* want to immediately free the unwritten and unreferenced inode.
	2734	*/
	2735	if ((dirrem->dm_state & COMPLETE) == 0) {
	2736	dap->da_previous = dirrem;
	2737	} else {
	2738	if (prevdirrem != NULL) {
	2739	dap->da_previous = prevdirrem;
	2740	} else {
	2741	dap->da_state &= ~DIRCHG;
	2742	dap->da_pagedep = pagedep;
	2743	}
	2744	dirrem->dm_dirinum = pagedep->pd_ino;
	2745	add_to_worklist(&dirrem->dm_list);
	2746	}
	2747	/*
	2748	* Link into its inodedep. Put it on the id_bufwait list if the inode
	2749	* is not yet written. If it is written, do the post-inode write
	2750	* processing to put it on the id_pendinghd list.
	2751	*/
	2752	if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 \|\|
	2753	(inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
	2754	dap->da_state \|= COMPLETE;
	2755	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	2756	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
	2757	} else {
	2758	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
	2759	dap, da_pdlist);
	2760	WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	2761	}
	2762	FREE_LOCK(&lk);
	2763	}
	2764
	2765	/*
	2766	* Called whenever the link count on an inode is changed.
	2767	* It creates an inode dependency so that the new reference(s)
	2768	* to the inode cannot be committed to disk until the updated
	2769	* inode has been written.
	2770	*/
	2771	void
	2772	softdep_change_linkcnt(ip)
	2773	struct inode ip; / the inode with the increased link count */
	2774	{
	2775	struct inodedep *inodedep;
	2776
	2777	ACQUIRE_LOCK(&lk);
	2778	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
	2779	if (ip->i_nlink < ip->i_effnlink) {
	2780	FREE_LOCK(&lk);
	2781	panic("softdep_change_linkcnt: bad delta");
	2782	}
	2783	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	2784	FREE_LOCK(&lk);
	2785	}
	2786
	2787	/*
	2788	* This workitem decrements the inode's link count.
	2789	* If the link count reaches zero, the file is removed.
	2790	*/
	2791	static void
	2792	handle_workitem_remove(dirrem)
	2793	struct dirrem *dirrem;
	2794	{
	2795	struct proc p = CURPROC; / XXX */
	2796	struct inodedep *inodedep;
	2797	struct vnode *vp;
	2798	struct inode *ip;
	2799	ino_t oldinum;
	2800	int error;
	2801
	2802	if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
	2803	softdep_error("handle_workitem_remove: vget", error);
	2804	return;
	2805	}
	2806	ip = VTOI(vp);
	2807	ACQUIRE_LOCK(&lk);
	2808	if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0){
	2809	FREE_LOCK(&lk);
	2810	panic("handle_workitem_remove: lost inodedep");
	2811	}
	2812	/*
	2813	* Normal file deletion.
	2814	*/
	2815	if ((dirrem->dm_state & RMDIR) == 0) {
	2816	ip->i_nlink--;
	2817	ip->i_flag \|= IN_CHANGE;
	2818	if (ip->i_nlink < ip->i_effnlink) {
	2819	FREE_LOCK(&lk);
	2820	panic("handle_workitem_remove: bad file delta");
	2821	}
	2822	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	2823	FREE_LOCK(&lk);
	2824	vput(vp);
	2825	num_dirrem -= 1;
	2826	WORKITEM_FREE(dirrem, D_DIRREM);
	2827	return;
	2828	}
	2829	/*
	2830	* Directory deletion. Decrement reference count for both the
	2831	* just deleted parent directory entry and the reference for ".".
	2832	* Next truncate the directory to length zero. When the
	2833	* truncation completes, arrange to have the reference count on
	2834	* the parent decremented to account for the loss of "..".
	2835	*/
	2836	ip->i_nlink -= 2;
	2837	ip->i_flag \|= IN_CHANGE;
	2838	if (ip->i_nlink < ip->i_effnlink) {
	2839	FREE_LOCK(&lk);
	2840	panic("handle_workitem_remove: bad dir delta");
	2841	}
	2842	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	2843	FREE_LOCK(&lk);
	2844	if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0)
	2845	softdep_error("handle_workitem_remove: truncate", error);
	2846	/*
	2847	* Rename a directory to a new parent. Since, we are both deleting
	2848	* and creating a new directory entry, the link count on the new
	2849	* directory should not change. Thus we skip the followup dirrem.
	2850	*/
	2851	if (dirrem->dm_state & DIRCHG) {
	2852	vput(vp);
	2853	num_dirrem -= 1;
	2854	WORKITEM_FREE(dirrem, D_DIRREM);
	2855	return;
	2856	}
	2857	/*
	2858	* If the inodedep does not exist, then the zero'ed inode has
	2859	* been written to disk. If the allocated inode has never been
	2860	* written to disk, then the on-disk inode is zero'ed. In either
	2861	* case we can remove the file immediately.
	2862	*/
	2863	ACQUIRE_LOCK(&lk);
	2864	dirrem->dm_state = 0;
	2865	oldinum = dirrem->dm_oldinum;
	2866	dirrem->dm_oldinum = dirrem->dm_dirinum;
	2867	if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 \|\|
	2868	check_inode_unwritten(inodedep)) {
	2869	FREE_LOCK(&lk);
	2870	vput(vp);
	2871	handle_workitem_remove(dirrem);
	2872	return;
	2873	}
	2874	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
	2875	FREE_LOCK(&lk);
	2876	vput(vp);
	2877	}
	2878
	2879	/*
	2880	* Inode de-allocation dependencies.
	2881	*
	2882	* When an inode's link count is reduced to zero, it can be de-allocated. We
	2883	* found it convenient to postpone de-allocation until after the inode is
	2884	* written to disk with its new link count (zero). At this point, all of the
	2885	* on-disk inode's block pointers are nullified and, with careful dependency
	2886	* list ordering, all dependencies related to the inode will be satisfied and
	2887	* the corresponding dependency structures de-allocated. So, if/when the
	2888	* inode is reused, there will be no mixing of old dependencies with new
	2889	* ones. This artificial dependency is set up by the block de-allocation
	2890	* procedure above (softdep_setup_freeblocks) and completed by the
	2891	* following procedure.
	2892	*/
	2893	static void
	2894	handle_workitem_freefile(freefile)
	2895	struct freefile *freefile;
	2896	{
	2897	struct vnode vp;
	2898	struct inode tip;
	2899	struct inodedep *idp;
	2900	int error;
	2901
	2902	#ifdef DEBUG
	2903	ACQUIRE_LOCK(&lk);
	2904	error = inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp);
	2905	FREE_LOCK(&lk);
	2906	if (error)
	2907	panic("handle_workitem_freefile: inodedep survived");
	2908	#endif
	2909	tip.i_devvp = freefile->fx_devvp;
	2910	tip.i_dev = freefile->fx_devvp->v_rdev;
	2911	tip.i_fs = freefile->fx_fs;
	2912	vp.v_data = &tip;
	2913	if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
	2914	softdep_error("handle_workitem_freefile", error);
	2915	WORKITEM_FREE(freefile, D_FREEFILE);
	2916	}
	2917
	2918	/*
	2919	* Disk writes.
	2920	*
	2921	* The dependency structures constructed above are most actively used when file
	2922	* system blocks are written to disk. No constraints are placed on when a
	2923	* block can be written, but unsatisfied update dependencies are made safe by
	2924	* modifying (or replacing) the source memory for the duration of the disk
	2925	* write. When the disk write completes, the memory block is again brought
	2926	* up-to-date.
	2927	*
	2928	* In-core inode structure reclamation.
	2929	*
	2930	* Because there are a finite number of "in-core" inode structures, they are
	2931	* reused regularly. By transferring all inode-related dependencies to the
	2932	* in-memory inode block and indexing them separately (via "inodedep"s), we
	2933	* can allow "in-core" inode structures to be reused at any time and avoid
	2934	* any increase in contention.
	2935	*
	2936	* Called just before entering the device driver to initiate a new disk I/O.
	2937	* The buffer must be locked, thus, no I/O completion operations can occur
	2938	* while we are manipulating its associated dependencies.
	2939	*/
	2940	static void
	2941	softdep_disk_io_initiation(bp)
	2942	struct buf bp; / structure describing disk write to occur */
	2943	{
	2944	struct worklist wk, nextwk;
	2945	struct indirdep *indirdep;
	2946
	2947	/*
	2948	* We only care about write operations. There should never
	2949	* be dependencies for reads.
	2950	*/
	2951	if (bp->b_flags & B_READ)
	2952	panic("softdep_disk_io_initiation: read");
	2953	/*
	2954	* Do any necessary pre-I/O processing.
	2955	*/
	2956	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
	2957	nextwk = LIST_NEXT(wk, wk_list);
	2958	switch (wk->wk_type) {
	2959
	2960	case D_PAGEDEP:
	2961	initiate_write_filepage(WK_PAGEDEP(wk), bp);
	2962	continue;
	2963
	2964	case D_INODEDEP:
	2965	initiate_write_inodeblock(WK_INODEDEP(wk), bp);
	2966	continue;
	2967
	2968	case D_INDIRDEP:
	2969	indirdep = WK_INDIRDEP(wk);
	2970	if (indirdep->ir_state & GOINGAWAY)
	2971	panic("disk_io_initiation: indirdep gone");
	2972	/*
	2973	* If there are no remaining dependencies, this
	2974	* will be writing the real pointers, so the
	2975	* dependency can be freed.
	2976	*/
	2977	if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
	2978	indirdep->ir_savebp->b_flags \|= B_INVAL \| B_NOCACHE;
	2979	brelse(indirdep->ir_savebp);
	2980	/* inline expand WORKLIST_REMOVE(wk); */
	2981	wk->wk_state &= ~ONWORKLIST;
	2982	LIST_REMOVE(wk, wk_list);
	2983	WORKITEM_FREE(indirdep, D_INDIRDEP);
	2984	continue;
	2985	}
	2986	/*
	2987	* Replace up-to-date version with safe version.
	2988	*/
	2989	MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
	2990	M_INDIRDEP, M_SOFTDEP_FLAGS);
	2991	ACQUIRE_LOCK(&lk);
	2992	indirdep->ir_state &= ~ATTACHED;
	2993	indirdep->ir_state \|= UNDONE;
	2994	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
	2995	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
	2996	bp->b_bcount);
	2997	FREE_LOCK(&lk);
	2998	continue;
	2999
	3000	case D_MKDIR:
	3001	case D_BMSAFEMAP:
	3002	case D_ALLOCDIRECT:
	3003	case D_ALLOCINDIR:
	3004	continue;
	3005
	3006	default:
	3007	panic("handle_disk_io_initiation: Unexpected type %s",
	3008	TYPENAME(wk->wk_type));
	3009	/* NOTREACHED */
	3010	}
	3011	}
	3012	}
	3013
	3014	/*
	3015	* Called from within the procedure above to deal with unsatisfied
	3016	* allocation dependencies in a directory. The buffer must be locked,
	3017	* thus, no I/O completion operations can occur while we are
	3018	* manipulating its associated dependencies.
	3019	*/
	3020	static void
	3021	initiate_write_filepage(pagedep, bp)
	3022	struct pagedep *pagedep;
	3023	struct buf *bp;
	3024	{
	3025	struct diradd *dap;
	3026	struct direct *ep;
	3027	int i;
	3028
	3029	if (pagedep->pd_state & IOSTARTED) {
	3030	/*
	3031	* This can only happen if there is a driver that does not
	3032	* understand chaining. Here biodone will reissue the call
	3033	* to strategy for the incomplete buffers.
	3034	*/
	3035	printf("initiate_write_filepage: already started\n");
	3036	return;
	3037	}
	3038	pagedep->pd_state \|= IOSTARTED;
	3039	ACQUIRE_LOCK(&lk);
	3040	for (i = 0; i < DAHASHSZ; i++) {
	3041	LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
	3042	ep = (struct direct *)
	3043	((char *)bp->b_data + dap->da_offset);
	3044	if (ep->d_ino != dap->da_newinum) {
	3045	FREE_LOCK(&lk);
	3046	panic("%s: dir inum %d != new %d",
	3047	"initiate_write_filepage",
	3048	ep->d_ino, dap->da_newinum);
	3049	}
	3050	if (dap->da_state & DIRCHG)
	3051	ep->d_ino = dap->da_previous->dm_oldinum;
	3052	else
	3053	ep->d_ino = 0;
	3054	dap->da_state &= ~ATTACHED;
	3055	dap->da_state \|= UNDONE;
	3056	}
	3057	}
	3058	FREE_LOCK(&lk);
	3059	}
	3060
	3061	/*
	3062	* Called from within the procedure above to deal with unsatisfied
	3063	* allocation dependencies in an inodeblock. The buffer must be
	3064	* locked, thus, no I/O completion operations can occur while we
	3065	* are manipulating its associated dependencies.
	3066	*/
	3067	static void
	3068	initiate_write_inodeblock(inodedep, bp)
	3069	struct inodedep *inodedep;
	3070	struct buf bp; / The inode block */
	3071	{
	3072	struct allocdirect adp, lastadp;
	3073	struct dinode *dp;
	3074	struct fs *fs;
	3075	ufs_lbn_t prevlbn = 0;
	3076	int i, deplist;
	3077
	3078	if (inodedep->id_state & IOSTARTED)
	3079	panic("initiate_write_inodeblock: already started");
	3080	inodedep->id_state \|= IOSTARTED;
	3081	fs = inodedep->id_fs;
	3082	dp = (struct dinode *)bp->b_data +
	3083	ino_to_fsbo(fs, inodedep->id_ino);
	3084	/*
	3085	* If the bitmap is not yet written, then the allocated
	3086	* inode cannot be written to disk.
	3087	*/
	3088	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
	3089	if (inodedep->id_savedino != NULL)
	3090	panic("initiate_write_inodeblock: already doing I/O");
	3091	MALLOC(inodedep->id_savedino, struct dinode *,
	3092	sizeof(struct dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
	3093	inodedep->id_savedino = dp;
	3094	bzero((caddr_t)dp, sizeof(struct dinode));
	3095	return;
	3096	}
	3097	/*
	3098	* If no dependencies, then there is nothing to roll back.
	3099	*/
	3100	inodedep->id_savedsize = dp->di_size;
	3101	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
	3102	return;
	3103	/*
	3104	* Set the dependencies to busy.
	3105	*/
	3106	ACQUIRE_LOCK(&lk);
	3107	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	3108	adp = TAILQ_NEXT(adp, ad_next)) {
	3109	#ifdef DIAGNOSTIC
	3110	if (deplist != 0 && prevlbn >= adp->ad_lbn) {
	3111	FREE_LOCK(&lk);
	3112	panic("softdep_write_inodeblock: lbn order");
	3113	}
	3114	prevlbn = adp->ad_lbn;
	3115	if (adp->ad_lbn < NDADDR &&
	3116	dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
	3117	FREE_LOCK(&lk);
	3118	panic("%s: direct pointer #%ld mismatch %d != %d",
	3119	"softdep_write_inodeblock", adp->ad_lbn,
	3120	dp->di_db[adp->ad_lbn], adp->ad_newblkno);
	3121	}
	3122	if (adp->ad_lbn >= NDADDR &&
	3123	dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
	3124	FREE_LOCK(&lk);
	3125	panic("%s: indirect pointer #%ld mismatch %d != %d",
	3126	"softdep_write_inodeblock", adp->ad_lbn - NDADDR,
	3127	dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
	3128	}
	3129	deplist \|= 1 << adp->ad_lbn;
	3130	if ((adp->ad_state & ATTACHED) == 0) {
	3131	FREE_LOCK(&lk);
	3132	panic("softdep_write_inodeblock: Unknown state 0x%x",
	3133	adp->ad_state);
	3134	}
	3135	#endif /* DIAGNOSTIC */
	3136	adp->ad_state &= ~ATTACHED;
	3137	adp->ad_state \|= UNDONE;
	3138	}
	3139	/*
	3140	* The on-disk inode cannot claim to be any larger than the last
	3141	* fragment that has been written. Otherwise, the on-disk inode
	3142	* might have fragments that were not the last block in the file
	3143	* which would corrupt the filesystem.
	3144	*/
	3145	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	3146	lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
	3147	if (adp->ad_lbn >= NDADDR)
	3148	break;
	3149	dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
	3150	/* keep going until hitting a rollback to a frag */
	3151	if (adp->ad_oldsize == 0 \|\| adp->ad_oldsize == fs->fs_bsize)
	3152	continue;
	3153	dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
	3154	for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
	3155	#ifdef DIAGNOSTIC
	3156	if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
	3157	FREE_LOCK(&lk);
	3158	panic("softdep_write_inodeblock: lost dep1");
	3159	}
	3160	#endif /* DIAGNOSTIC */
	3161	dp->di_db[i] = 0;
	3162	}
	3163	for (i = 0; i < NIADDR; i++) {
	3164	#ifdef DIAGNOSTIC
	3165	if (dp->di_ib[i] != 0 &&
	3166	(deplist & ((1 << NDADDR) << i)) == 0) {
	3167	FREE_LOCK(&lk);
	3168	panic("softdep_write_inodeblock: lost dep2");
	3169	}
	3170	#endif /* DIAGNOSTIC */
	3171	dp->di_ib[i] = 0;
	3172	}
	3173	FREE_LOCK(&lk);
	3174	return;
	3175	}
	3176	/*
	3177	* If we have zero'ed out the last allocated block of the file,
	3178	* roll back the size to the last currently allocated block.
	3179	* We know that this last allocated block is a full-sized as
	3180	* we already checked for fragments in the loop above.
	3181	*/
	3182	if (lastadp != NULL &&
	3183	dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
	3184	for (i = lastadp->ad_lbn; i >= 0; i--)
	3185	if (dp->di_db[i] != 0)
	3186	break;
	3187	dp->di_size = (i + 1) * fs->fs_bsize;
	3188	}
	3189	/*
	3190	* The only dependencies are for indirect blocks.
	3191	*
	3192	* The file size for indirect block additions is not guaranteed.
	3193	* Such a guarantee would be non-trivial to achieve. The conventional
	3194	* synchronous write implementation also does not make this guarantee.
	3195	* Fsck should catch and fix discrepancies. Arguably, the file size
	3196	* can be over-estimated without destroying integrity when the file
	3197	* moves into the indirect blocks (i.e., is large). If we want to
	3198	* postpone fsck, we are stuck with this argument.
	3199	*/
	3200	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
	3201	dp->di_ib[adp->ad_lbn - NDADDR] = 0;
	3202	FREE_LOCK(&lk);
	3203	}
	3204
	3205	/*
	3206	* This routine is called during the completion interrupt
	3207	* service routine for a disk write (from the procedure called
	3208	* by the device driver to inform the file system caches of
	3209	* a request completion). It should be called early in this
	3210	* procedure, before the block is made available to other
	3211	* processes or other routines are called.
	3212	*/
	3213	static void
	3214	softdep_disk_write_complete(bp)
	3215	struct buf bp; / describes the completed disk write */
	3216	{
	3217	struct worklist *wk;
	3218	struct workhead reattach;
	3219	struct newblk *newblk;
	3220	struct allocindir *aip;
	3221	struct allocdirect *adp;
	3222	struct indirdep *indirdep;
	3223	struct inodedep *inodedep;
	3224	struct bmsafemap *bmsafemap;
	3225
	3226	#ifdef DEBUG
	3227	if (lk.lkt_held != -1)
	3228	panic("softdep_disk_write_complete: lock is held");
	3229	lk.lkt_held = -2;
	3230	#endif
	3231	LIST_INIT(&reattach);
	3232	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
	3233	WORKLIST_REMOVE(wk);
	3234	switch (wk->wk_type) {
	3235
	3236	case D_PAGEDEP:
	3237	if (handle_written_filepage(WK_PAGEDEP(wk), bp))
	3238	WORKLIST_INSERT(&reattach, wk);
	3239	continue;
	3240
	3241	case D_INODEDEP:
	3242	if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
	3243	WORKLIST_INSERT(&reattach, wk);
	3244	continue;
	3245
	3246	case D_BMSAFEMAP:
	3247	bmsafemap = WK_BMSAFEMAP(wk);
	3248	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
	3249	newblk->nb_state \|= DEPCOMPLETE;
	3250	newblk->nb_bmsafemap = NULL;
	3251	LIST_REMOVE(newblk, nb_deps);
	3252	}
	3253	while ((adp =
	3254	LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
	3255	adp->ad_state \|= DEPCOMPLETE;
	3256	adp->ad_buf = NULL;
	3257	LIST_REMOVE(adp, ad_deps);
	3258	handle_allocdirect_partdone(adp);
	3259	}
	3260	while ((aip =
	3261	LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
	3262	aip->ai_state \|= DEPCOMPLETE;
	3263	aip->ai_buf = NULL;
	3264	LIST_REMOVE(aip, ai_deps);
	3265	handle_allocindir_partdone(aip);
	3266	}
	3267	while ((inodedep =
	3268	LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
	3269	inodedep->id_state \|= DEPCOMPLETE;
	3270	LIST_REMOVE(inodedep, id_deps);
	3271	inodedep->id_buf = NULL;
	3272	}
	3273	WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
	3274	continue;
	3275
	3276	case D_MKDIR:
	3277	handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
	3278	continue;
	3279
	3280	case D_ALLOCDIRECT:
	3281	adp = WK_ALLOCDIRECT(wk);
	3282	adp->ad_state \|= COMPLETE;
	3283	handle_allocdirect_partdone(adp);
	3284	continue;
	3285
	3286	case D_ALLOCINDIR:
	3287	aip = WK_ALLOCINDIR(wk);
	3288	aip->ai_state \|= COMPLETE;
	3289	handle_allocindir_partdone(aip);
	3290	continue;
	3291
	3292	case D_INDIRDEP:
	3293	indirdep = WK_INDIRDEP(wk);
	3294	if (indirdep->ir_state & GOINGAWAY) {
	3295	lk.lkt_held = -1;
	3296	panic("disk_write_complete: indirdep gone");
	3297	}
	3298	bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
	3299	FREE(indirdep->ir_saveddata, M_INDIRDEP);
	3300	indirdep->ir_saveddata = 0;
	3301	indirdep->ir_state &= ~UNDONE;
	3302	indirdep->ir_state \|= ATTACHED;
	3303	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
	3304	handle_allocindir_partdone(aip);
	3305	if (aip == LIST_FIRST(&indirdep->ir_donehd)) {
	3306	lk.lkt_held = -1;
	3307	panic("disk_write_complete: not gone");
	3308	}
	3309	}
	3310	WORKLIST_INSERT(&reattach, wk);
	3311	if ((bp->b_flags & B_DELWRI) == 0)
	3312	stat_indir_blk_ptrs++;
	3313	bdirty(bp);
	3314	continue;
	3315
	3316	default:
	3317	lk.lkt_held = -1;
	3318	panic("handle_disk_write_complete: Unknown type %s",
	3319	TYPENAME(wk->wk_type));
	3320	/* NOTREACHED */
	3321	}
	3322	}
	3323	/*
	3324	* Reattach any requests that must be redone.
	3325	*/
	3326	while ((wk = LIST_FIRST(&reattach)) != NULL) {
	3327	WORKLIST_REMOVE(wk);
	3328	WORKLIST_INSERT(&bp->b_dep, wk);
	3329	}
	3330	#ifdef DEBUG
	3331	if (lk.lkt_held != -2)
	3332	panic("softdep_disk_write_complete: lock lost");
	3333	lk.lkt_held = -1;
	3334	#endif
	3335	}
	3336
	3337	/*
	3338	* Called from within softdep_disk_write_complete above. Note that
	3339	* this routine is always called from interrupt level with further
	3340	* splbio interrupts blocked.
	3341	*/
	3342	static void
	3343	handle_allocdirect_partdone(adp)
	3344	struct allocdirect adp; / the completed allocdirect */
	3345	{
	3346	struct allocdirect *listadp;
	3347	struct inodedep *inodedep;
	3348	long bsize;
	3349
	3350	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
	3351	return;
	3352	if (adp->ad_buf != NULL) {
	3353	lk.lkt_held = -1;
	3354	panic("handle_allocdirect_partdone: dangling dep");
	3355	}
	3356	/*
	3357	* The on-disk inode cannot claim to be any larger than the last
	3358	* fragment that has been written. Otherwise, the on-disk inode
	3359	* might have fragments that were not the last block in the file
	3360	* which would corrupt the filesystem. Thus, we cannot free any
	3361	* allocdirects after one whose ad_oldblkno claims a fragment as
	3362	* these blocks must be rolled back to zero before writing the inode.
	3363	* We check the currently active set of allocdirects in id_inoupdt.
	3364	*/
	3365	inodedep = adp->ad_inodedep;
	3366	bsize = inodedep->id_fs->fs_bsize;
	3367	TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) {
	3368	/* found our block */
	3369	if (listadp == adp)
	3370	break;
	3371	/* continue if ad_oldlbn is not a fragment */
	3372	if (listadp->ad_oldsize == 0 \|\|
	3373	listadp->ad_oldsize == bsize)
	3374	continue;
	3375	/* hit a fragment */
	3376	return;
	3377	}
	3378	/*
	3379	* If we have reached the end of the current list without
	3380	* finding the just finished dependency, then it must be
	3381	* on the future dependency list. Future dependencies cannot
	3382	* be freed until they are moved to the current list.
	3383	*/
	3384	if (listadp == NULL) {
	3385	#ifdef DEBUG
	3386	TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next)
	3387	/* found our block */
	3388	if (listadp == adp)
	3389	break;
	3390	if (listadp == NULL) {
	3391	lk.lkt_held = -1;
	3392	panic("handle_allocdirect_partdone: lost dep");
	3393	}
	3394	#endif /* DEBUG */
	3395	return;
	3396	}
	3397	/*
	3398	* If we have found the just finished dependency, then free
	3399	* it along with anything that follows it that is complete.
	3400	*/
	3401	for (; adp; adp = listadp) {
	3402	listadp = TAILQ_NEXT(adp, ad_next);
	3403	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
	3404	return;
	3405	free_allocdirect(&inodedep->id_inoupdt, adp, 1);
	3406	}
	3407	}
	3408
	3409	/*
	3410	* Called from within softdep_disk_write_complete above. Note that
	3411	* this routine is always called from interrupt level with further
	3412	* splbio interrupts blocked.
	3413	*/
	3414	static void
	3415	handle_allocindir_partdone(aip)
	3416	struct allocindir aip; / the completed allocindir */
	3417	{
	3418	struct indirdep *indirdep;
	3419
	3420	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
	3421	return;
	3422	if (aip->ai_buf != NULL) {
	3423	lk.lkt_held = -1;
	3424	panic("handle_allocindir_partdone: dangling dependency");
	3425	}
	3426	indirdep = aip->ai_indirdep;
	3427	if (indirdep->ir_state & UNDONE) {
	3428	LIST_REMOVE(aip, ai_next);
	3429	LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
	3430	return;
	3431	}
	3432	((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
	3433	aip->ai_newblkno;
	3434	LIST_REMOVE(aip, ai_next);
	3435	if (aip->ai_freefrag != NULL)
	3436	add_to_worklist(&aip->ai_freefrag->ff_list);
	3437	WORKITEM_FREE(aip, D_ALLOCINDIR);
	3438	}
	3439
	3440	/*
	3441	* Called from within softdep_disk_write_complete above to restore
	3442	* in-memory inode block contents to their most up-to-date state. Note
	3443	* that this routine is always called from interrupt level with further
	3444	* splbio interrupts blocked.
	3445	*/
	3446	static int
	3447	handle_written_inodeblock(inodedep, bp)
	3448	struct inodedep *inodedep;
	3449	struct buf bp; / buffer containing the inode block */
	3450	{
	3451	struct worklist wk, filefree;
	3452	struct allocdirect adp, nextadp;
	3453	struct dinode *dp;
	3454	int hadchanges;
	3455
	3456	if ((inodedep->id_state & IOSTARTED) == 0) {
	3457	lk.lkt_held = -1;
	3458	panic("handle_written_inodeblock: not started");
	3459	}
	3460	inodedep->id_state &= ~IOSTARTED;
	3461	inodedep->id_state \|= COMPLETE;
	3462	dp = (struct dinode *)bp->b_data +
	3463	ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
	3464	/*
	3465	* If we had to rollback the inode allocation because of
	3466	* bitmaps being incomplete, then simply restore it.
	3467	* Keep the block dirty so that it will not be reclaimed until
	3468	* all associated dependencies have been cleared and the
	3469	* corresponding updates written to disk.
	3470	*/
	3471	if (inodedep->id_savedino != NULL) {
	3472	dp = inodedep->id_savedino;
	3473	FREE(inodedep->id_savedino, M_INODEDEP);
	3474	inodedep->id_savedino = NULL;
	3475	if ((bp->b_flags & B_DELWRI) == 0)
	3476	stat_inode_bitmap++;
	3477	bdirty(bp);
	3478	return (1);
	3479	}
	3480	/*
	3481	* Roll forward anything that had to be rolled back before
	3482	* the inode could be updated.
	3483	*/
	3484	hadchanges = 0;
	3485	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
	3486	nextadp = TAILQ_NEXT(adp, ad_next);
	3487	if (adp->ad_state & ATTACHED) {
	3488	lk.lkt_held = -1;
	3489	panic("handle_written_inodeblock: new entry");
	3490	}
	3491	if (adp->ad_lbn < NDADDR) {
	3492	if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno) {
	3493	lk.lkt_held = -1;
	3494	panic("%s: %s #%ld mismatch %d != %d",
	3495	"handle_written_inodeblock",
	3496	"direct pointer", adp->ad_lbn,
	3497	dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
	3498	}
	3499	dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
	3500	} else {
	3501	if (dp->di_ib[adp->ad_lbn - NDADDR] != 0) {
	3502	lk.lkt_held = -1;
	3503	panic("%s: %s #%ld allocated as %d",
	3504	"handle_written_inodeblock",
	3505	"indirect pointer", adp->ad_lbn - NDADDR,
	3506	dp->di_ib[adp->ad_lbn - NDADDR]);
	3507	}
	3508	dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
	3509	}
	3510	adp->ad_state &= ~UNDONE;
	3511	adp->ad_state \|= ATTACHED;
	3512	hadchanges = 1;
	3513	}
	3514	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
	3515	stat_direct_blk_ptrs++;
	3516	/*
	3517	* Reset the file size to its most up-to-date value.
	3518	*/
	3519	if (inodedep->id_savedsize == -1) {
	3520	lk.lkt_held = -1;
	3521	panic("handle_written_inodeblock: bad size");
	3522	}
	3523	if (dp->di_size != inodedep->id_savedsize) {
	3524	dp->di_size = inodedep->id_savedsize;
	3525	hadchanges = 1;
	3526	}
	3527	inodedep->id_savedsize = -1;
	3528	/*
	3529	* If there were any rollbacks in the inode block, then it must be
	3530	* marked dirty so that its will eventually get written back in
	3531	* its correct form.
	3532	*/
	3533	if (hadchanges)
	3534	bdirty(bp);
	3535	/*
	3536	* Process any allocdirects that completed during the update.
	3537	*/
	3538	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
	3539	handle_allocdirect_partdone(adp);
	3540	/*
	3541	* Process deallocations that were held pending until the
	3542	* inode had been written to disk. Freeing of the inode
	3543	* is delayed until after all blocks have been freed to
	3544	* avoid creation of new <vfsid, inum, lbn> triples
	3545	* before the old ones have been deleted.
	3546	*/
	3547	filefree = NULL;
	3548	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
	3549	WORKLIST_REMOVE(wk);
	3550	switch (wk->wk_type) {
	3551
	3552	case D_FREEFILE:
	3553	/*
	3554	* We defer adding filefree to the worklist until
	3555	* all other additions have been made to ensure
	3556	* that it will be done after all the old blocks
	3557	* have been freed.
	3558	*/
	3559	if (filefree != NULL) {
	3560	lk.lkt_held = -1;
	3561	panic("handle_written_inodeblock: filefree");
	3562	}
	3563	filefree = wk;
	3564	continue;
	3565
	3566	case D_MKDIR:
	3567	handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
	3568	continue;
	3569
	3570	case D_DIRADD:
	3571	diradd_inode_written(WK_DIRADD(wk), inodedep);
	3572	continue;
	3573
	3574	case D_FREEBLKS:
	3575	case D_FREEFRAG:
	3576	case D_DIRREM:
	3577	add_to_worklist(wk);
	3578	continue;
	3579
	3580	default:
	3581	lk.lkt_held = -1;
	3582	panic("handle_written_inodeblock: Unknown type %s",
	3583	TYPENAME(wk->wk_type));
	3584	/* NOTREACHED */
	3585	}
	3586	}
	3587	if (filefree != NULL) {
	3588	if (free_inodedep(inodedep) == 0) {
	3589	lk.lkt_held = -1;
	3590	panic("handle_written_inodeblock: live inodedep");
	3591	}
	3592	add_to_worklist(filefree);
	3593	return (0);
	3594	}
	3595
	3596	/*
	3597	* If no outstanding dependencies, free it.
	3598	*/
	3599	if (free_inodedep(inodedep) \|\| TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
	3600	return (0);
	3601	return (hadchanges);
	3602	}
	3603
	3604	/*
	3605	* Process a diradd entry after its dependent inode has been written.
	3606	* This routine must be called with splbio interrupts blocked.
	3607	*/
	3608	static void
	3609	diradd_inode_written(dap, inodedep)
	3610	struct diradd *dap;
	3611	struct inodedep *inodedep;
	3612	{
	3613	struct pagedep *pagedep;
	3614
	3615	dap->da_state \|= COMPLETE;
	3616	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
	3617	if (dap->da_state & DIRCHG)
	3618	pagedep = dap->da_previous->dm_pagedep;
	3619	else
	3620	pagedep = dap->da_pagedep;
	3621	LIST_REMOVE(dap, da_pdlist);
	3622	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	3623	}
	3624	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
	3625	}
	3626
	3627	/*
	3628	* Handle the completion of a mkdir dependency.
	3629	*/
	3630	static void
	3631	handle_written_mkdir(mkdir, type)
	3632	struct mkdir *mkdir;
	3633	int type;
	3634	{
	3635	struct diradd *dap;
	3636	struct pagedep *pagedep;
	3637
	3638	if (mkdir->md_state != type) {
	3639	lk.lkt_held = -1;
	3640	panic("handle_written_mkdir: bad type");
	3641	}
	3642	dap = mkdir->md_diradd;
	3643	dap->da_state &= ~type;
	3644	if ((dap->da_state & (MKDIR_PARENT \| MKDIR_BODY)) == 0)
	3645	dap->da_state \|= DEPCOMPLETE;
	3646	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
	3647	if (dap->da_state & DIRCHG)
	3648	pagedep = dap->da_previous->dm_pagedep;
	3649	else
	3650	pagedep = dap->da_pagedep;
	3651	LIST_REMOVE(dap, da_pdlist);
	3652	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	3653	}
	3654	LIST_REMOVE(mkdir, md_mkdirs);
	3655	WORKITEM_FREE(mkdir, D_MKDIR);
	3656	}
	3657
	3658	/*
	3659	* Called from within softdep_disk_write_complete above.
	3660	* A write operation was just completed. Removed inodes can
	3661	* now be freed and associated block pointers may be committed.
	3662	* Note that this routine is always called from interrupt level
	3663	* with further splbio interrupts blocked.
	3664	*/
	3665	static int
	3666	handle_written_filepage(pagedep, bp)
	3667	struct pagedep *pagedep;
	3668	struct buf bp; / buffer containing the written page */
	3669	{
	3670	struct dirrem *dirrem;
	3671	struct diradd dap, nextdap;
	3672	struct direct *ep;
	3673	int i, chgs;
	3674
	3675	if ((pagedep->pd_state & IOSTARTED) == 0) {
	3676	lk.lkt_held = -1;
	3677	panic("handle_written_filepage: not started");
	3678	}
	3679	pagedep->pd_state &= ~IOSTARTED;
	3680	/*
	3681	* Process any directory removals that have been committed.
	3682	*/
	3683	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
	3684	LIST_REMOVE(dirrem, dm_next);
	3685	dirrem->dm_dirinum = pagedep->pd_ino;
	3686	add_to_worklist(&dirrem->dm_list);
	3687	}
	3688	/*
	3689	* Free any directory additions that have been committed.
	3690	*/
	3691	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
	3692	free_diradd(dap);
	3693	/*
	3694	* Uncommitted directory entries must be restored.
	3695	*/
	3696	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
	3697	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
	3698	dap = nextdap) {
	3699	nextdap = LIST_NEXT(dap, da_pdlist);
	3700	if (dap->da_state & ATTACHED) {
	3701	lk.lkt_held = -1;
	3702	panic("handle_written_filepage: attached");
	3703	}
	3704	ep = (struct direct *)
	3705	((char *)bp->b_data + dap->da_offset);
	3706	ep->d_ino = dap->da_newinum;
	3707	dap->da_state &= ~UNDONE;
	3708	dap->da_state \|= ATTACHED;
	3709	chgs = 1;
	3710	/*
	3711	* If the inode referenced by the directory has
	3712	* been written out, then the dependency can be
	3713	* moved to the pending list.
	3714	*/
	3715	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
	3716	LIST_REMOVE(dap, da_pdlist);
	3717	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
	3718	da_pdlist);
	3719	}
	3720	}
	3721	}
	3722	/*
	3723	* If there were any rollbacks in the directory, then it must be
	3724	* marked dirty so that its will eventually get written back in
	3725	* its correct form.
	3726	*/
	3727	if (chgs) {
	3728	if ((bp->b_flags & B_DELWRI) == 0)
	3729	stat_dir_entry++;
	3730	bdirty(bp);
	3731	}
	3732	/*
	3733	* If no dependencies remain, the pagedep will be freed.
	3734	* Otherwise it will remain to update the page before it
	3735	* is written back to disk.
	3736	*/
	3737	if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
	3738	for (i = 0; i < DAHASHSZ; i++)
	3739	if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
	3740	break;
	3741	if (i == DAHASHSZ) {
	3742	LIST_REMOVE(pagedep, pd_hash);
	3743	WORKITEM_FREE(pagedep, D_PAGEDEP);
	3744	return (0);
	3745	}
	3746	}
	3747	return (1);
	3748	}
	3749
	3750	/*
	3751	* Writing back in-core inode structures.
	3752	*
	3753	* The file system only accesses an inode's contents when it occupies an
	3754	* "in-core" inode structure. These "in-core" structures are separate from
	3755	* the page frames used to cache inode blocks. Only the latter are
	3756	* transferred to/from the disk. So, when the updated contents of the
	3757	* "in-core" inode structure are copied to the corresponding in-memory inode
	3758	* block, the dependencies are also transferred. The following procedure is
	3759	* called when copying a dirty "in-core" inode to a cached inode block.
	3760	*/
	3761
	3762	/*
	3763	* Called when an inode is loaded from disk. If the effective link count
	3764	* differed from the actual link count when it was last flushed, then we
	3765	* need to ensure that the correct effective link count is put back.
	3766	*/
	3767	void
	3768	softdep_load_inodeblock(ip)
	3769	struct inode ip; / the "in_core" copy of the inode */
	3770	{
	3771	struct inodedep *inodedep;
	3772
	3773	/*
	3774	* Check for alternate nlink count.
	3775	*/
	3776	ip->i_effnlink = ip->i_nlink;
	3777	ACQUIRE_LOCK(&lk);
	3778	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
	3779	FREE_LOCK(&lk);
	3780	return;
	3781	}
	3782	ip->i_effnlink -= inodedep->id_nlinkdelta;
	3783	FREE_LOCK(&lk);
	3784	}
	3785
	3786	/*
	3787	* This routine is called just before the "in-core" inode
	3788	* information is to be copied to the in-memory inode block.
	3789	* Recall that an inode block contains several inodes. If
	3790	* the force flag is set, then the dependencies will be
	3791	* cleared so that the update can always be made. Note that
	3792	* the buffer is locked when this routine is called, so we
	3793	* will never be in the middle of writing the inode block
	3794	* to disk.
	3795	*/
	3796	void
	3797	softdep_update_inodeblock(ip, bp, waitfor)
	3798	struct inode ip; / the "in_core" copy of the inode */
	3799	struct buf bp; / the buffer containing the inode block */
	3800	int waitfor; /* nonzero => update must be allowed */
	3801	{
	3802	struct inodedep *inodedep;
	3803	struct worklist *wk;
	3804	int error, gotit;
	3805
	3806	/*
	3807	* If the effective link count is not equal to the actual link
	3808	* count, then we must track the difference in an inodedep while
	3809	* the inode is (potentially) tossed out of the cache. Otherwise,
	3810	* if there is no existing inodedep, then there are no dependencies
	3811	* to track.
	3812	*/
	3813	ACQUIRE_LOCK(&lk);
	3814	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
	3815	FREE_LOCK(&lk);
	3816	if (ip->i_effnlink != ip->i_nlink)
	3817	panic("softdep_update_inodeblock: bad link count");
	3818	return;
	3819	}
	3820	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) {
	3821	FREE_LOCK(&lk);
	3822	panic("softdep_update_inodeblock: bad delta");
	3823	}
	3824	/*
	3825	* Changes have been initiated. Anything depending on these
	3826	* changes cannot occur until this inode has been written.
	3827	*/
	3828	inodedep->id_state &= ~COMPLETE;
	3829	if ((inodedep->id_state & ONWORKLIST) == 0)
	3830	WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
	3831	/*
	3832	* Any new dependencies associated with the incore inode must
	3833	* now be moved to the list associated with the buffer holding
	3834	* the in-memory copy of the inode. Once merged process any
	3835	* allocdirects that are completed by the merger.
	3836	*/
	3837	merge_inode_lists(inodedep);
	3838	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
	3839	handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
	3840	/*
	3841	* Now that the inode has been pushed into the buffer, the
	3842	* operations dependent on the inode being written to disk
	3843	* can be moved to the id_bufwait so that they will be
	3844	* processed when the buffer I/O completes.
	3845	*/
	3846	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
	3847	WORKLIST_REMOVE(wk);
	3848	WORKLIST_INSERT(&inodedep->id_bufwait, wk);
	3849	}
	3850	/*
	3851	* Newly allocated inodes cannot be written until the bitmap
	3852	* that allocates them have been written (indicated by
	3853	* DEPCOMPLETE being set in id_state). If we are doing a
	3854	* forced sync (e.g., an fsync on a file), we force the bitmap
	3855	* to be written so that the update can be done.
	3856	*/
	3857	if ((inodedep->id_state & DEPCOMPLETE) != 0 \|\| waitfor == 0) {
	3858	FREE_LOCK(&lk);
	3859	return;
	3860	}
	3861	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
	3862	FREE_LOCK(&lk);
	3863	if (gotit &&
	3864	(error = VOP_BWRITE(inodedep->id_buf->b_vp, inodedep->id_buf)) != 0)
	3865	softdep_error("softdep_update_inodeblock: bwrite", error);
	3866	if ((inodedep->id_state & DEPCOMPLETE) == 0)
	3867	panic("softdep_update_inodeblock: update failed");
	3868	}
	3869
	3870	/*
	3871	* Merge the new inode dependency list (id_newinoupdt) into the old
	3872	* inode dependency list (id_inoupdt). This routine must be called
	3873	* with splbio interrupts blocked.
	3874	*/
	3875	static void
	3876	merge_inode_lists(inodedep)
	3877	struct inodedep *inodedep;
	3878	{
	3879	struct allocdirect listadp, newadp;
	3880
	3881	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
	3882	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
	3883	if (listadp->ad_lbn < newadp->ad_lbn) {
	3884	listadp = TAILQ_NEXT(listadp, ad_next);
	3885	continue;
	3886	}
	3887	TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
	3888	TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
	3889	if (listadp->ad_lbn == newadp->ad_lbn) {
	3890	allocdirect_merge(&inodedep->id_inoupdt, newadp,
	3891	listadp);
	3892	listadp = newadp;
	3893	}
	3894	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
	3895	}
	3896	while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
	3897	TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
	3898	TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
	3899	}
	3900	}
	3901
	3902	/*
	3903	* If we are doing an fsync, then we must ensure that any directory
	3904	* entries for the inode have been written after the inode gets to disk.
	3905	*/
	3906	static int
	3907	softdep_fsync(vp)
	3908	struct vnode vp; / the "in_core" copy of the inode */
	3909	{
	3910	struct inodedep *inodedep;
	3911	struct pagedep *pagedep;
	3912	struct worklist *wk;
	3913	struct diradd *dap;
	3914	struct mount *mnt;
	3915	struct vnode *pvp;
	3916	struct inode *ip;
	3917	struct buf *bp;
	3918	struct fs *fs;
	3919	struct proc p = CURPROC; / XXX */
	3920	int error, flushparent;
	3921	ino_t parentino;
	3922	ufs_lbn_t lbn;
	3923
	3924	ip = VTOI(vp);
	3925	fs = ip->i_fs;
	3926	ACQUIRE_LOCK(&lk);
	3927	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
	3928	FREE_LOCK(&lk);
	3929	return (0);
	3930	}
	3931	if (LIST_FIRST(&inodedep->id_inowait) != NULL \|\|
	3932	LIST_FIRST(&inodedep->id_bufwait) != NULL \|\|
	3933	TAILQ_FIRST(&inodedep->id_inoupdt) != NULL \|\|
	3934	TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
	3935	FREE_LOCK(&lk);
	3936	panic("softdep_fsync: pending ops");
	3937	}
	3938	for (error = 0, flushparent = 0; ; ) {
	3939	if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
	3940	break;
	3941	if (wk->wk_type != D_DIRADD) {
	3942	FREE_LOCK(&lk);
	3943	panic("softdep_fsync: Unexpected type %s",
	3944	TYPENAME(wk->wk_type));
	3945	}
	3946	dap = WK_DIRADD(wk);
	3947	/*
	3948	* Flush our parent if this directory entry
	3949	* has a MKDIR_PARENT dependency.
	3950	*/
	3951	if (dap->da_state & DIRCHG)
	3952	pagedep = dap->da_previous->dm_pagedep;
	3953	else
	3954	pagedep = dap->da_pagedep;
	3955	mnt = pagedep->pd_mnt;
	3956	parentino = pagedep->pd_ino;
	3957	lbn = pagedep->pd_lbn;
	3958	if ((dap->da_state & (MKDIR_BODY \| COMPLETE)) != COMPLETE) {
	3959	FREE_LOCK(&lk);
	3960	panic("softdep_fsync: dirty");
	3961	}
	3962	flushparent = dap->da_state & MKDIR_PARENT;
	3963	/*
	3964	* If we are being fsync'ed as part of vgone'ing this vnode,
	3965	* then we will not be able to release and recover the
	3966	* vnode below, so we just have to give up on writing its
	3967	* directory entry out. It will eventually be written, just
	3968	* not now, but then the user was not asking to have it
	3969	* written, so we are not breaking any promises.
	3970	*/
	3971	if (vp->v_flag & VXLOCK)
	3972	break;
	3973	/*
	3974	* We prevent deadlock by always fetching inodes from the
	3975	* root, moving down the directory tree. Thus, when fetching
	3976	* our parent directory, we must unlock ourselves before
	3977	* requesting the lock on our parent. See the comment in
	3978	* ufs_lookup for details on possible races.
	3979	*/
	3980	FREE_LOCK(&lk);
	3981	VOP_UNLOCK(vp, 0, p);
	3982	error = VFS_VGET(mnt, parentino, &pvp);
	3983	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY, p);
	3984	if (error != 0)
	3985	return (error);
	3986	if (flushparent) {
	3987	if ((error = UFS_UPDATE(pvp, 1)) != 0) {
	3988	vput(pvp);
	3989	return (error);
	3990	}
	3991	}
	3992	/*
	3993	* Flush directory page containing the inode's name.
	3994	*/
	3995	error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
	3996	&bp);
	3997	if (error == 0)
	3998	error = VOP_BWRITE(bp->b_vp, bp);
	3999	vput(pvp);
	4000	if (error != 0)
	4001	return (error);
	4002	ACQUIRE_LOCK(&lk);
	4003	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
	4004	break;
	4005	}
	4006	FREE_LOCK(&lk);
	4007	return (0);
	4008	}
	4009
	4010	/*
	4011	* Flush all the dirty bitmaps associated with the block device
	4012	* before flushing the rest of the dirty blocks so as to reduce
	4013	* the number of dependencies that will have to be rolled back.
	4014	*/
	4015	void
	4016	softdep_fsync_mountdev(vp)
	4017	struct vnode *vp;
	4018	{
	4019	struct buf bp, nbp;
	4020	struct worklist *wk;
	4021
	4022	if (!vn_isdisk(vp, NULL))
	4023	panic("softdep_fsync_mountdev: vnode not a disk");
	4024	ACQUIRE_LOCK(&lk);
	4025	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
	4026	nbp = TAILQ_NEXT(bp, b_vnbufs);
	4027	/*
	4028	* If it is already scheduled, skip to the next buffer.
	4029	*/
	4030	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT))
	4031	continue;
	4032	if ((bp->b_flags & B_DELWRI) == 0) {
	4033	FREE_LOCK(&lk);
	4034	panic("softdep_fsync_mountdev: not dirty");
	4035	}
	4036	/*
	4037	* We are only interested in bitmaps with outstanding
	4038	* dependencies.
	4039	*/
	4040	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL \|\|
	4041	wk->wk_type != D_BMSAFEMAP \|\|
	4042	(bp->b_xflags & BX_BKGRDINPROG)) {
	4043	BUF_UNLOCK(bp);
	4044	continue;
	4045	}
	4046	bremfree(bp);
	4047	FREE_LOCK(&lk);
	4048	(void) bawrite(bp);
	4049	ACQUIRE_LOCK(&lk);
	4050	/*
	4051	* Since we may have slept during the I/O, we need
	4052	* to start from a known point.
	4053	*/
	4054	nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
	4055	}
	4056	drain_output(vp, 1);
	4057	FREE_LOCK(&lk);
	4058	}
	4059
	4060	/*
	4061	* This routine is called when we are trying to synchronously flush a
	4062	* file. This routine must eliminate any filesystem metadata dependencies
	4063	* so that the syncing routine can succeed by pushing the dirty blocks
	4064	* associated with the file. If any I/O errors occur, they are returned.
	4065	*/
	4066	int
	4067	softdep_sync_metadata(ap)
	4068	struct vop_fsync_args /* {
	4069	struct vnode *a_vp;
	4070	struct ucred *a_cred;
	4071	int a_waitfor;
	4072	struct proc *a_p;
	4073	} / ap;
	4074	{
	4075	struct vnode *vp = ap->a_vp;
	4076	struct pagedep *pagedep;
	4077	struct allocdirect *adp;
	4078	struct allocindir *aip;
	4079	struct buf bp, nbp;
	4080	struct worklist *wk;
	4081	int i, error, waitfor;
	4082
	4083	/*
	4084	* Check whether this vnode is involved in a filesystem
	4085	* that is doing soft dependency processing.
	4086	*/
	4087	if (!vn_isdisk(vp, NULL)) {
	4088	if (!DOINGSOFTDEP(vp))
	4089	return (0);
	4090	} else
	4091	if (vp->v_specmountpoint == NULL \|\|
	4092	(vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0)
	4093	return (0);
	4094	/*
	4095	* Ensure that any direct block dependencies have been cleared.
	4096	*/
	4097	ACQUIRE_LOCK(&lk);
	4098	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
	4099	FREE_LOCK(&lk);
	4100	return (error);
	4101	}
	4102	/*
	4103	* For most files, the only metadata dependencies are the
	4104	* cylinder group maps that allocate their inode or blocks.
	4105	* The block allocation dependencies can be found by traversing
	4106	* the dependency lists for any buffers that remain on their
	4107	* dirty buffer list. The inode allocation dependency will
	4108	* be resolved when the inode is updated with MNT_WAIT.
	4109	* This work is done in two passes. The first pass grabs most
	4110	* of the buffers and begins asynchronously writing them. The
	4111	* only way to wait for these asynchronous writes is to sleep
	4112	* on the filesystem vnode which may stay busy for a long time
	4113	* if the filesystem is active. So, instead, we make a second
	4114	* pass over the dependencies blocking on each write. In the
	4115	* usual case we will be blocking against a write that we
	4116	* initiated, so when it is done the dependency will have been
	4117	* resolved. Thus the second pass is expected to end quickly.
	4118	*/
	4119	waitfor = MNT_NOWAIT;
	4120	top:
	4121	/*
	4122	* We must wait for any I/O in progress to finish so that
	4123	* all potential buffers on the dirty list will be visible.
	4124	*/
	4125	drain_output(vp, 1);
	4126	if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
	4127	FREE_LOCK(&lk);
	4128	return (0);
	4129	}
	4130	bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
	4131	loop:
	4132	/*
	4133	* As we hold the buffer locked, none of its dependencies
	4134	* will disappear.
	4135	*/
	4136	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
	4137	switch (wk->wk_type) {
	4138
	4139	case D_ALLOCDIRECT:
	4140	adp = WK_ALLOCDIRECT(wk);
	4141	if (adp->ad_state & DEPCOMPLETE)
	4142	break;
	4143	nbp = adp->ad_buf;
	4144	if (getdirtybuf(&nbp, waitfor) == 0)
	4145	break;
	4146	FREE_LOCK(&lk);
	4147	if (waitfor == MNT_NOWAIT) {
	4148	bawrite(nbp);
	4149	} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
	4150	bawrite(bp);
	4151	return (error);
	4152	}
	4153	ACQUIRE_LOCK(&lk);
	4154	break;
	4155
	4156	case D_ALLOCINDIR:
	4157	aip = WK_ALLOCINDIR(wk);
	4158	if (aip->ai_state & DEPCOMPLETE)
	4159	break;
	4160	nbp = aip->ai_buf;
	4161	if (getdirtybuf(&nbp, waitfor) == 0)
	4162	break;
	4163	FREE_LOCK(&lk);
	4164	if (waitfor == MNT_NOWAIT) {
	4165	bawrite(nbp);
	4166	} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
	4167	bawrite(bp);
	4168	return (error);
	4169	}
	4170	ACQUIRE_LOCK(&lk);
	4171	break;
	4172
	4173	case D_INDIRDEP:
	4174	restart:
	4175
	4176	LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
	4177	if (aip->ai_state & DEPCOMPLETE)
	4178	continue;
	4179	nbp = aip->ai_buf;
	4180	if (getdirtybuf(&nbp, MNT_WAIT) == 0)
	4181	goto restart;
	4182	FREE_LOCK(&lk);
	4183	if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
	4184	bawrite(bp);
	4185	return (error);
	4186	}
	4187	ACQUIRE_LOCK(&lk);
	4188	goto restart;
	4189	}
	4190	break;
	4191
	4192	case D_INODEDEP:
	4193	if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
	4194	WK_INODEDEP(wk)->id_ino)) != 0) {
	4195	FREE_LOCK(&lk);
	4196	bawrite(bp);
	4197	return (error);
	4198	}
	4199	break;
	4200
	4201	case D_PAGEDEP:
	4202	/*
	4203	* We are trying to sync a directory that may
	4204	* have dependencies on both its own metadata
	4205	* and/or dependencies on the inodes of any
	4206	* recently allocated files. We walk its diradd
	4207	* lists pushing out the associated inode.
	4208	*/
	4209	pagedep = WK_PAGEDEP(wk);
	4210	for (i = 0; i < DAHASHSZ; i++) {
	4211	if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
	4212	continue;
	4213	if ((error =
	4214	flush_pagedep_deps(vp, pagedep->pd_mnt,
	4215	&pagedep->pd_diraddhd[i]))) {
	4216	FREE_LOCK(&lk);
	4217	bawrite(bp);
	4218	return (error);
	4219	}
	4220	}
	4221	break;
	4222
	4223	case D_MKDIR:
	4224	/*
	4225	* This case should never happen if the vnode has
	4226	* been properly sync'ed. However, if this function
	4227	* is used at a place where the vnode has not yet
	4228	* been sync'ed, this dependency can show up. So,
	4229	* rather than panic, just flush it.
	4230	*/
	4231	nbp = WK_MKDIR(wk)->md_buf;
	4232	if (getdirtybuf(&nbp, waitfor) == 0)
	4233	break;
	4234	FREE_LOCK(&lk);
	4235	if (waitfor == MNT_NOWAIT) {
	4236	bawrite(nbp);
	4237	} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
	4238	bawrite(bp);
	4239	return (error);
	4240	}
	4241	ACQUIRE_LOCK(&lk);
	4242	break;
	4243
	4244	case D_BMSAFEMAP:
	4245	/*
	4246	* This case should never happen if the vnode has
	4247	* been properly sync'ed. However, if this function
	4248	* is used at a place where the vnode has not yet
	4249	* been sync'ed, this dependency can show up. So,
	4250	* rather than panic, just flush it.
	4251	*/
	4252	nbp = WK_BMSAFEMAP(wk)->sm_buf;
	4253	if (getdirtybuf(&nbp, waitfor) == 0)
	4254	break;
	4255	FREE_LOCK(&lk);
	4256	if (waitfor == MNT_NOWAIT) {
	4257	bawrite(nbp);
	4258	} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
	4259	bawrite(bp);
	4260	return (error);
	4261	}
	4262	ACQUIRE_LOCK(&lk);
	4263	break;
	4264
	4265	default:
	4266	FREE_LOCK(&lk);
	4267	panic("softdep_sync_metadata: Unknown type %s",
	4268	TYPENAME(wk->wk_type));
	4269	/* NOTREACHED */
	4270	}
	4271	}
	4272	(void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT);
	4273	nbp = TAILQ_NEXT(bp, b_vnbufs);
	4274	FREE_LOCK(&lk);
	4275	bawrite(bp);
	4276	ACQUIRE_LOCK(&lk);
	4277	if (nbp != NULL) {
	4278	bp = nbp;
	4279	goto loop;
	4280	}
	4281	/*
	4282	* The brief unlock is to allow any pent up dependency
	4283	* processing to be done. Then proceed with the second pass.
	4284	*/
	4285	if (waitfor == MNT_NOWAIT) {
	4286	waitfor = MNT_WAIT;
	4287	FREE_LOCK(&lk);
	4288	ACQUIRE_LOCK(&lk);
	4289	goto top;
	4290	}
	4291
	4292	/*
	4293	* If we have managed to get rid of all the dirty buffers,
	4294	* then we are done. For certain directories and block
	4295	* devices, we may need to do further work.
	4296	*/
	4297	if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) {
	4298	FREE_LOCK(&lk);
	4299	return (0);
	4300	}
	4301
	4302	FREE_LOCK(&lk);
	4303	/*
	4304	* If we are trying to sync a block device, some of its buffers may
	4305	* contain metadata that cannot be written until the contents of some
	4306	* partially written files have been written to disk. The only easy
	4307	* way to accomplish this is to sync the entire filesystem (luckily
	4308	* this happens rarely).
	4309	*
	4310	* We must wait for any I/O in progress to finish so that
	4311	* all potential buffers on the dirty list will be visible.
	4312	*/
	4313	drain_output(vp, 1);
	4314	if (vn_isdisk(vp, NULL) &&
	4315	vp->v_specmountpoint && !VOP_ISLOCKED(vp, NULL) &&
	4316	(error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred,
	4317	ap->a_p)) != 0)
	4318	return (error);
	4319	return (0);
	4320	}
	4321
	4322	/*
	4323	* Flush the dependencies associated with an inodedep.
	4324	* Called with splbio blocked.
	4325	*/
	4326	static int
	4327	flush_inodedep_deps(fs, ino)
	4328	struct fs *fs;
	4329	ino_t ino;
	4330	{
	4331	struct inodedep *inodedep;
	4332	struct allocdirect *adp;
	4333	int error, waitfor;
	4334	struct buf *bp;
	4335
	4336	/*
	4337	* This work is done in two passes. The first pass grabs most
	4338	* of the buffers and begins asynchronously writing them. The
	4339	* only way to wait for these asynchronous writes is to sleep
	4340	* on the filesystem vnode which may stay busy for a long time
	4341	* if the filesystem is active. So, instead, we make a second
	4342	* pass over the dependencies blocking on each write. In the
	4343	* usual case we will be blocking against a write that we
	4344	* initiated, so when it is done the dependency will have been
	4345	* resolved. Thus the second pass is expected to end quickly.
	4346	* We give a brief window at the top of the loop to allow
	4347	* any pending I/O to complete.
	4348	*/
	4349	for (waitfor = MNT_NOWAIT; ; ) {
	4350	FREE_LOCK(&lk);
	4351	ACQUIRE_LOCK(&lk);
	4352	if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
	4353	return (0);
	4354	TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) {
	4355	if (adp->ad_state & DEPCOMPLETE)
	4356	continue;
	4357	bp = adp->ad_buf;
	4358	if (getdirtybuf(&bp, waitfor) == 0) {
	4359	if (waitfor == MNT_NOWAIT)
	4360	continue;
	4361	break;
	4362	}
	4363	FREE_LOCK(&lk);
	4364	if (waitfor == MNT_NOWAIT) {
	4365	bawrite(bp);
	4366	} else if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) {
	4367	ACQUIRE_LOCK(&lk);
	4368	return (error);
	4369	}
	4370	ACQUIRE_LOCK(&lk);
	4371	break;
	4372	}
	4373	if (adp != NULL)
	4374	continue;
	4375	TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) {
	4376	if (adp->ad_state & DEPCOMPLETE)
	4377	continue;
	4378	bp = adp->ad_buf;
	4379	if (getdirtybuf(&bp, waitfor) == 0) {
	4380	if (waitfor == MNT_NOWAIT)
	4381	continue;
	4382	break;
	4383	}
	4384	FREE_LOCK(&lk);
	4385	if (waitfor == MNT_NOWAIT) {
	4386	bawrite(bp);
	4387	} else if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) {
	4388	ACQUIRE_LOCK(&lk);
	4389	return (error);
	4390	}
	4391	ACQUIRE_LOCK(&lk);
	4392	break;
	4393	}
	4394	if (adp != NULL)
	4395	continue;
	4396	/*
	4397	* If pass2, we are done, otherwise do pass 2.
	4398	*/
	4399	if (waitfor == MNT_WAIT)
	4400	break;
	4401	waitfor = MNT_WAIT;
	4402	}
	4403	/*
	4404	* Try freeing inodedep in case all dependencies have been removed.
	4405	*/
	4406	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
	4407	(void) free_inodedep(inodedep);
	4408	return (0);
	4409	}
	4410
	4411	/*
	4412	* Eliminate a pagedep dependency by flushing out all its diradd dependencies.
	4413	* Called with splbio blocked.
	4414	*/
	4415	static int
	4416	flush_pagedep_deps(pvp, mp, diraddhdp)
	4417	struct vnode *pvp;
	4418	struct mount *mp;
	4419	struct diraddhd *diraddhdp;
	4420	{
	4421	struct proc p = CURPROC; / XXX */
	4422	struct inodedep *inodedep;
	4423	struct ufsmount *ump;
	4424	struct diradd *dap;
	4425	struct vnode *vp;
	4426	int gotit, error = 0;
	4427	struct buf *bp;
	4428	ino_t inum;
	4429
	4430	ump = VFSTOUFS(mp);
	4431	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
	4432	/*
	4433	* Flush ourselves if this directory entry
	4434	* has a MKDIR_PARENT dependency.
	4435	*/
	4436	if (dap->da_state & MKDIR_PARENT) {
	4437	FREE_LOCK(&lk);
	4438	if ((error = UFS_UPDATE(pvp, 1)) != 0)
	4439	break;
	4440	ACQUIRE_LOCK(&lk);
	4441	/*
	4442	* If that cleared dependencies, go on to next.
	4443	*/
	4444	if (dap != LIST_FIRST(diraddhdp))
	4445	continue;
	4446	if (dap->da_state & MKDIR_PARENT) {
	4447	FREE_LOCK(&lk);
	4448	panic("flush_pagedep_deps: MKDIR_PARENT");
	4449	}
	4450	}
	4451	/*
	4452	* A newly allocated directory must have its "." and
	4453	* ".." entries written out before its name can be
	4454	* committed in its parent. We do not want or need
	4455	* the full semantics of a synchronous VOP_FSYNC as
	4456	* that may end up here again, once for each directory
	4457	* level in the filesystem. Instead, we push the blocks
	4458	* and wait for them to clear. We have to fsync twice
	4459	* because the first call may choose to defer blocks
	4460	* that still have dependencies, but deferral will
	4461	* happen at most once.
	4462	*/
	4463	inum = dap->da_newinum;
	4464	if (dap->da_state & MKDIR_BODY) {
	4465	FREE_LOCK(&lk);
	4466	if ((error = VFS_VGET(mp, inum, &vp)) != 0)
	4467	break;
	4468	if ((error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)) \|\|
	4469	(error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) {
	4470	vput(vp);
	4471	break;
	4472	}
	4473	drain_output(vp, 0);
	4474	vput(vp);
	4475	ACQUIRE_LOCK(&lk);
	4476	/*
	4477	* If that cleared dependencies, go on to next.
	4478	*/
	4479	if (dap != LIST_FIRST(diraddhdp))
	4480	continue;
	4481	if (dap->da_state & MKDIR_BODY) {
	4482	FREE_LOCK(&lk);
	4483	panic("flush_pagedep_deps: MKDIR_BODY");
	4484	}
	4485	}
	4486	/*
	4487	* Flush the inode on which the directory entry depends.
	4488	* Having accounted for MKDIR_PARENT and MKDIR_BODY above,
	4489	* the only remaining dependency is that the updated inode
	4490	* count must get pushed to disk. The inode has already
	4491	* been pushed into its inode buffer (via VOP_UPDATE) at
	4492	* the time of the reference count change. So we need only
	4493	* locate that buffer, ensure that there will be no rollback
	4494	* caused by a bitmap dependency, then write the inode buffer.
	4495	*/
	4496	if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
	4497	FREE_LOCK(&lk);
	4498	panic("flush_pagedep_deps: lost inode");
	4499	}
	4500	/*
	4501	* If the inode still has bitmap dependencies,
	4502	* push them to disk.
	4503	*/
	4504	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
	4505	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
	4506	FREE_LOCK(&lk);
	4507	if (gotit &&
	4508	(error = VOP_BWRITE(inodedep->id_buf->b_vp,
	4509	inodedep->id_buf)) != 0)
	4510	break;
	4511	ACQUIRE_LOCK(&lk);
	4512	if (dap != LIST_FIRST(diraddhdp))
	4513	continue;
	4514	}
	4515	/*
	4516	* If the inode is still sitting in a buffer waiting
	4517	* to be written, push it to disk.
	4518	*/
	4519	FREE_LOCK(&lk);
	4520	if ((error = bread(ump->um_devvp,
	4521	fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
	4522	(int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0)
	4523	break;
	4524	if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0)
	4525	break;
	4526	ACQUIRE_LOCK(&lk);
	4527	/*
	4528	* If we have failed to get rid of all the dependencies
	4529	* then something is seriously wrong.
	4530	*/
	4531	if (dap == LIST_FIRST(diraddhdp)) {
	4532	FREE_LOCK(&lk);
	4533	panic("flush_pagedep_deps: flush failed");
	4534	}
	4535	}
	4536	if (error)
	4537	ACQUIRE_LOCK(&lk);
	4538	return (error);
	4539	}
	4540
	4541	/*
	4542	* A large burst of file addition or deletion activity can drive the
	4543	* memory load excessively high. First attempt to slow things down
	4544	* using the techniques below. If that fails, this routine requests
	4545	* the offending operations to fall back to running synchronously
	4546	* until the memory load returns to a reasonable level.
	4547	*/
	4548	int
	4549	softdep_slowdown(vp)
	4550	struct vnode *vp;
	4551	{
	4552	int max_softdeps_hard;
	4553
	4554	max_softdeps_hard = max_softdeps * 11 / 10;
	4555	if (num_dirrem < max_softdeps_hard / 2 &&
	4556	num_inodedep < max_softdeps_hard)
	4557	return (0);
	4558	stat_sync_limit_hit += 1;
	4559	return (1);
	4560	}
	4561
	4562	/*
	4563	* If memory utilization has gotten too high, deliberately slow things
	4564	* down and speed up the I/O processing.
	4565	*/
	4566	static int
	4567	request_cleanup(resource, islocked)
	4568	int resource;
	4569	int islocked;
	4570	{
	4571	struct proc *p = CURPROC;
	4572
	4573	/*
	4574	* We never hold up the filesystem syncer process.
	4575	*/
	4576	if (p == filesys_syncer)
	4577	return (0);
	4578	/*
	4579	* First check to see if the work list has gotten backlogged.
	4580	* If it has, co-opt this process to help clean up two entries.
	4581	* Because this process may hold inodes locked, we cannot
	4582	* handle any remove requests that might block on a locked
	4583	* inode as that could lead to deadlock.
	4584	*/
	4585	if (num_on_worklist > max_softdeps / 10) {
	4586	if (islocked)
	4587	FREE_LOCK(&lk);
	4588	process_worklist_item(NULL, LK_NOWAIT);
	4589	process_worklist_item(NULL, LK_NOWAIT);
	4590	stat_worklist_push += 2;
	4591	if (islocked)
	4592	ACQUIRE_LOCK(&lk);
	4593	return(1);
	4594	}
	4595
	4596	/*
	4597	* If we are resource constrained on inode dependencies, try
	4598	* flushing some dirty inodes. Otherwise, we are constrained
	4599	* by file deletions, so try accelerating flushes of directories
	4600	* with removal dependencies. We would like to do the cleanup
	4601	* here, but we probably hold an inode locked at this point and
	4602	* that might deadlock against one that we try to clean. So,
	4603	* the best that we can do is request the syncer daemon to do
	4604	* the cleanup for us.
	4605	*/
	4606	switch (resource) {
	4607
	4608	case FLUSH_INODES:
	4609	stat_ino_limit_push += 1;
	4610	req_clear_inodedeps += 1;
	4611	stat_countp = &stat_ino_limit_hit;
	4612	break;
	4613
	4614	case FLUSH_REMOVE:
	4615	stat_blk_limit_push += 1;
	4616	req_clear_remove += 1;
	4617	stat_countp = &stat_blk_limit_hit;
	4618	break;
	4619
	4620	default:
	4621	if (islocked)
	4622	FREE_LOCK(&lk);
	4623	panic("request_cleanup: unknown type");
	4624	}
	4625	/*
	4626	* Hopefully the syncer daemon will catch up and awaken us.
	4627	* We wait at most tickdelay before proceeding in any case.
	4628	*/
	4629	if (islocked == 0)
	4630	ACQUIRE_LOCK(&lk);
	4631	proc_waiting += 1;
	4632	if (handle.callout == NULL)
	4633	handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
	4634	interlocked_sleep(&lk, SLEEP, (caddr_t)&proc_waiting, PPAUSE,
	4635	"softupdate", 0);
	4636	proc_waiting -= 1;
	4637	if (islocked == 0)
	4638	FREE_LOCK(&lk);
	4639	return (1);
	4640	}
	4641
	4642	/*
	4643	* Awaken processes pausing in request_cleanup and clear proc_waiting
	4644	* to indicate that there is no longer a timer running.
	4645	*/
	4646	void
	4647	pause_timer(arg)
	4648	void *arg;
	4649	{
	4650
	4651	*stat_countp += 1;
	4652	wakeup_one(&proc_waiting);
	4653	if (proc_waiting > 0)
	4654	handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
	4655	else
	4656	handle.callout = NULL;
	4657	}
	4658
	4659	/*
	4660	* Flush out a directory with at least one removal dependency in an effort to
	4661	* reduce the number of dirrem, freefile, and freeblks dependency structures.
	4662	*/
	4663	static void
	4664	clear_remove(p)
	4665	struct proc *p;
	4666	{
	4667	struct pagedep_hashhead *pagedephd;
	4668	struct pagedep *pagedep;
	4669	static int next = 0;
	4670	struct mount *mp;
	4671	struct vnode *vp;
	4672	int error, cnt;
	4673	ino_t ino;
	4674
	4675	ACQUIRE_LOCK(&lk);
	4676	for (cnt = 0; cnt < pagedep_hash; cnt++) {
	4677	pagedephd = &pagedep_hashtbl[next++];
	4678	if (next >= pagedep_hash)
	4679	next = 0;
	4680	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
	4681	if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
	4682	continue;
	4683	mp = pagedep->pd_mnt;
	4684	ino = pagedep->pd_ino;
	4685	FREE_LOCK(&lk);
	4686	if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
	4687	softdep_error("clear_remove: vget", error);
	4688	return;
	4689	}
	4690	if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
	4691	softdep_error("clear_remove: fsync", error);
	4692	drain_output(vp, 0);
	4693	vput(vp);
	4694	return;
	4695	}
	4696	}
	4697	FREE_LOCK(&lk);
	4698	}
	4699
	4700	/*
	4701	* Clear out a block of dirty inodes in an effort to reduce
	4702	* the number of inodedep dependency structures.
	4703	*/
	4704	static void
	4705	clear_inodedeps(p)
	4706	struct proc *p;
	4707	{
	4708	struct inodedep_hashhead *inodedephd;
	4709	struct inodedep *inodedep;
	4710	static int next = 0;
	4711	struct mount *mp;
	4712	struct vnode *vp;
	4713	struct fs *fs;
	4714	int error, cnt;
	4715	ino_t firstino, lastino, ino;
	4716
	4717	ACQUIRE_LOCK(&lk);
	4718	/*
	4719	* Pick a random inode dependency to be cleared.
	4720	* We will then gather up all the inodes in its block
	4721	* that have dependencies and flush them out.
	4722	*/
	4723	for (cnt = 0; cnt < inodedep_hash; cnt++) {
	4724	inodedephd = &inodedep_hashtbl[next++];
	4725	if (next >= inodedep_hash)
	4726	next = 0;
	4727	if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
	4728	break;
	4729	}
	4730	if (inodedep == NULL)
	4731	return;
	4732	/*
	4733	* Ugly code to find mount point given pointer to superblock.
	4734	*/
	4735	fs = inodedep->id_fs;
	4736	TAILQ_FOREACH(mp, &mountlist, mnt_list)
	4737	if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
	4738	break;
	4739	/*
	4740	* Find the last inode in the block with dependencies.
	4741	*/
	4742	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
	4743	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
	4744	if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
	4745	break;
	4746	/*
	4747	* Asynchronously push all but the last inode with dependencies.
	4748	* Synchronously push the last inode with dependencies to ensure
	4749	* that the inode block gets written to free up the inodedeps.
	4750	*/
	4751	for (ino = firstino; ino <= lastino; ino++) {
	4752	if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
	4753	continue;
	4754	FREE_LOCK(&lk);
	4755	if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
	4756	softdep_error("clear_inodedeps: vget", error);
	4757	return;
	4758	}
	4759	if (ino == lastino) {
	4760	if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p)))
	4761	softdep_error("clear_inodedeps: fsync1", error);
	4762	} else {
	4763	if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
	4764	softdep_error("clear_inodedeps: fsync2", error);
	4765	drain_output(vp, 0);
	4766	}
	4767	vput(vp);
	4768	ACQUIRE_LOCK(&lk);
	4769	}
	4770	FREE_LOCK(&lk);
	4771	}
	4772
	4773	/*
	4774	* Function to determine if the buffer has outstanding dependencies
	4775	* that will cause a roll-back if the buffer is written. If wantcount
	4776	* is set, return number of dependencies, otherwise just yes or no.
	4777	*/
	4778	static int
	4779	softdep_count_dependencies(bp, wantcount)
	4780	struct buf *bp;
	4781	int wantcount;
	4782	{
	4783	struct worklist *wk;
	4784	struct inodedep *inodedep;
	4785	struct indirdep *indirdep;
	4786	struct allocindir *aip;
	4787	struct pagedep *pagedep;
	4788	struct diradd *dap;
	4789	int i, retval;
	4790
	4791	retval = 0;
	4792	ACQUIRE_LOCK(&lk);
	4793	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
	4794	switch (wk->wk_type) {
	4795
	4796	case D_INODEDEP:
	4797	inodedep = WK_INODEDEP(wk);
	4798	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
	4799	/* bitmap allocation dependency */
	4800	retval += 1;
	4801	if (!wantcount)
	4802	goto out;
	4803	}
	4804	if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
	4805	/* direct block pointer dependency */
	4806	retval += 1;
	4807	if (!wantcount)
	4808	goto out;
	4809	}
	4810	continue;
	4811
	4812	case D_INDIRDEP:
	4813	indirdep = WK_INDIRDEP(wk);
	4814
	4815	LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
	4816	/* indirect block pointer dependency */
	4817	retval += 1;
	4818	if (!wantcount)
	4819	goto out;
	4820	}
	4821	continue;
	4822
	4823	case D_PAGEDEP:
	4824	pagedep = WK_PAGEDEP(wk);
	4825	for (i = 0; i < DAHASHSZ; i++) {
	4826
	4827	LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
	4828	/* directory entry dependency */
	4829	retval += 1;
	4830	if (!wantcount)
	4831	goto out;
	4832	}
	4833	}
	4834	continue;
	4835
	4836	case D_BMSAFEMAP:
	4837	case D_ALLOCDIRECT:
	4838	case D_ALLOCINDIR:
	4839	case D_MKDIR:
	4840	/* never a dependency on these blocks */
	4841	continue;
	4842
	4843	default:
	4844	FREE_LOCK(&lk);
	4845	panic("softdep_check_for_rollback: Unexpected type %s",
	4846	TYPENAME(wk->wk_type));
	4847	/* NOTREACHED */
	4848	}
	4849	}
	4850	out:
	4851	FREE_LOCK(&lk);
	4852	return retval;
	4853	}
	4854
	4855	/*
	4856	* Acquire exclusive access to a buffer.
	4857	* Must be called with splbio blocked.
	4858	* Return 1 if buffer was acquired.
	4859	*/
	4860	static int
	4861	getdirtybuf(bpp, waitfor)
	4862	struct buf **bpp;
	4863	int waitfor;
	4864	{
	4865	struct buf *bp;
	4866	int error;
	4867
	4868	for (;;) {
	4869	if ((bp = *bpp) == NULL)
	4870	return (0);
	4871	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT) == 0) {
	4872	if ((bp->b_xflags & BX_BKGRDINPROG) == 0)
	4873	break;
	4874	BUF_UNLOCK(bp);
	4875	if (waitfor != MNT_WAIT)
	4876	return (0);
	4877	bp->b_xflags \|= BX_BKGRDWAIT;
	4878	interlocked_sleep(&lk, SLEEP, &bp->b_xflags, PRIBIO,
	4879	"getbuf", 0);
	4880	continue;
	4881	}
	4882	if (waitfor != MNT_WAIT)
	4883	return (0);
	4884	error = interlocked_sleep(&lk, LOCKBUF, bp,
	4885	LK_EXCLUSIVE \| LK_SLEEPFAIL, 0, 0);
	4886	if (error != ENOLCK) {
	4887	FREE_LOCK(&lk);
	4888	panic("getdirtybuf: inconsistent lock");
	4889	}
	4890	}
	4891	if ((bp->b_flags & B_DELWRI) == 0) {
	4892	BUF_UNLOCK(bp);
	4893	return (0);
	4894	}
	4895	bremfree(bp);
	4896	return (1);
	4897	}
	4898
	4899	/*
	4900	* Wait for pending output on a vnode to complete.
	4901	* Must be called with vnode locked.
	4902	*/
	4903	static void
	4904	drain_output(vp, islocked)
	4905	struct vnode *vp;
	4906	int islocked;
	4907	{
	4908
	4909	if (!islocked)
	4910	ACQUIRE_LOCK(&lk);
	4911	while (vp->v_numoutput) {
	4912	vp->v_flag \|= VBWAIT;
	4913	interlocked_sleep(&lk, SLEEP, (caddr_t)&vp->v_numoutput,
	4914	PRIBIO + 1, "drainvp", 0);
	4915	}
	4916	if (!islocked)
	4917	FREE_LOCK(&lk);
	4918	}
	4919
	4920	/*
	4921	* Called whenever a buffer that is being invalidated or reallocated
	4922	* contains dependencies. This should only happen if an I/O error has
	4923	* occurred. The routine is called with the buffer locked.
	4924	*/
	4925	static void
	4926	softdep_deallocate_dependencies(bp)
	4927	struct buf *bp;
	4928	{
	4929
	4930	if ((bp->b_flags & B_ERROR) == 0)
	4931	panic("softdep_deallocate_dependencies: dangling deps");
	4932	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
	4933	panic("softdep_deallocate_dependencies: unrecovered I/O error");
	4934	}
	4935
	/*
	 * Report an error hit while accessing the filesystem, e.g. from an
	 * asynchronous write.
	 *
	 * func  - text printed in front of the message; callers in this file
	 *         pass either a "function: operation" tag or a mount point name.
	 * error - error number to report.
	 *
	 * The error is only logged; no recovery is attempted.
	 */
	void
	softdep_error(func, error)
		char *func;
		int error;
	{

		/* XXX logging alone is not adequate handling. */
		printf("%s: got error %d while accessing filesystem\n",
		    func, error);
	}