gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
	3	*
	4	* The soft updates code is derived from the appendix of a University
	5	* of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
	6	* "Soft Updates: A Solution to the Metadata Update Problem in File
	7	* Systems", CSE-TR-254-95, August 1995).
	8	*
	9	* Further information about soft updates can be obtained from:
	10	*
	11	* Marshall Kirk McKusick http://www.mckusick.com/softdep/
	12	* 1614 Oxford Street mckusick@mckusick.com
	13	* Berkeley, CA 94709-1608 +1-510-843-9542
	14	* USA
	15	*
	16	* Redistribution and use in source and binary forms, with or without
	17	* modification, are permitted provided that the following conditions
	18	* are met:
	19	*
	20	* 1. Redistributions of source code must retain the above copyright
	21	* notice, this list of conditions and the following disclaimer.
	22	* 2. Redistributions in binary form must reproduce the above copyright
	23	* notice, this list of conditions and the following disclaimer in the
	24	* documentation and/or other materials provided with the distribution.
	25	*
	26	* THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
	27	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	28	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	29	* DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
	30	* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	31	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	32	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	33	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	34	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	35	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	36	* SUCH DAMAGE.
	37	*
	38	* from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
	39	* $FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.57.2.11 2002/02/05 18:46:53 dillon Exp $
	40	* $DragonFly: src/sys/vfs/ufs/ffs_softdep.c,v 1.4 2003/06/25 03:56:11 dillon Exp $
	41	*/
	42
	43	/*
	44	* For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
	45	*/
	46	#ifndef DIAGNOSTIC
	47	#define DIAGNOSTIC
	48	#endif
	49	#ifndef DEBUG
	50	#define DEBUG
	51	#endif
	52
	53	#include <sys/param.h>
	54	#include <sys/kernel.h>
	55	#include <sys/systm.h>
	56	#include <sys/buf.h>
	57	#include <sys/malloc.h>
	58	#include <sys/mount.h>
	59	#include <sys/proc.h>
	60	#include <sys/syslog.h>
	61	#include <sys/vnode.h>
	62	#include <sys/conf.h>
	63	#include <sys/buf2.h>
	64	#include <ufs/ufs/dir.h>
	65	#include <ufs/ufs/quota.h>
	66	#include <ufs/ufs/inode.h>
	67	#include <ufs/ufs/ufsmount.h>
	68	#include <ufs/ffs/fs.h>
	69	#include <ufs/ffs/softdep.h>
	70	#include <ufs/ffs/ffs_extern.h>
	71	#include <ufs/ufs/ufs_extern.h>
	72
	73	/*
	74	* These definitions need to be adapted to the system to which
	75	* this file is being ported.
	76	*/
	77	/*
	78	* malloc types defined for the softdep system.
	79	*/
	80	MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
	81	MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
	82	MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
	83	MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
	84	MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
	85	MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
	86	MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
	87	MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
	88	MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
	89	MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
	90	MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
	91	MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
	92	MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
	93
	/* Flags handed to MALLOC() when allocating soft-updates structures. */
	#define M_SOFTDEP_FLAGS	(M_WAITOK | M_USE_RESERVE)
	95
	96	#define D_PAGEDEP 0
	97	#define D_INODEDEP 1
	98	#define D_NEWBLK 2
	99	#define D_BMSAFEMAP 3
	100	#define D_ALLOCDIRECT 4
	101	#define D_INDIRDEP 5
	102	#define D_ALLOCINDIR 6
	103	#define D_FREEFRAG 7
	104	#define D_FREEBLKS 8
	105	#define D_FREEFILE 9
	106	#define D_DIRADD 10
	107	#define D_MKDIR 11
	108	#define D_DIRREM 12
	109	#define D_LAST D_DIRREM
	110
	111	/*
	112	* translate from workitem type to memory type
	113	* MUST match the defines above, such that memtype[D_XXX] == M_XXX
	114	*/
	115	static struct malloc_type *memtype[] = {
	116	M_PAGEDEP,
	117	M_INODEDEP,
	118	M_NEWBLK,
	119	M_BMSAFEMAP,
	120	M_ALLOCDIRECT,
	121	M_INDIRDEP,
	122	M_ALLOCINDIR,
	123	M_FREEFRAG,
	124	M_FREEBLKS,
	125	M_FREEFILE,
	126	M_DIRADD,
	127	M_MKDIR,
	128	M_DIRREM
	129	};
	130
	131	#define DtoM(type) (memtype[type])
	132
	133	/*
	134	* Names of malloc types.
	135	*/
	/*
	 * Map a workitem type to the short description of its malloc type.
	 * D_LAST is itself a valid index into memtype[], so the range check
	 * is inclusive; anything outside 0..D_LAST yields "???".
	 */
	#define TYPENAME(type) \
		((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
	138	/*
	139	* End system adaptation definitions.
	140	*/
	141
	142	/*
	143	* Internal function prototypes.
	144	*/
	145	static void softdep_error __P((char *, int));
	146	static void drain_output __P((struct vnode *, int));
	147	static int getdirtybuf __P((struct buf **, int));
	148	static void clear_remove __P((struct thread *));
	149	static void clear_inodedeps __P((struct thread *));
	150	static int flush_pagedep_deps __P((struct vnode , struct mount ,
	151	struct diraddhd *));
	152	static int flush_inodedep_deps __P((struct fs *, ino_t));
	153	static int handle_written_filepage __P((struct pagedep , struct buf ));
	154	static void diradd_inode_written __P((struct diradd , struct inodedep ));
	155	static int handle_written_inodeblock __P((struct inodedep , struct buf ));
	156	static void handle_allocdirect_partdone __P((struct allocdirect *));
	157	static void handle_allocindir_partdone __P((struct allocindir *));
	158	static void initiate_write_filepage __P((struct pagedep , struct buf ));
	159	static void handle_written_mkdir __P((struct mkdir *, int));
	160	static void initiate_write_inodeblock __P((struct inodedep , struct buf ));
	161	static void handle_workitem_freefile __P((struct freefile *));
	162	static void handle_workitem_remove __P((struct dirrem *));
	163	static struct dirrem newdirrem __P((struct buf , struct inode *,
	164	struct inode , int, struct dirrem *));
	165	static void free_diradd __P((struct diradd *));
	166	static void free_allocindir __P((struct allocindir , struct inodedep ));
	167	static int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t,
	168	long *));
	169	static void deallocate_dependencies __P((struct buf , struct inodedep ));
	170	static void free_allocdirect __P((struct allocdirectlst *,
	171	struct allocdirect *, int));
	172	static int check_inode_unwritten __P((struct inodedep *));
	173	static int free_inodedep __P((struct inodedep *));
	174	static void handle_workitem_freeblocks __P((struct freeblks *));
	175	static void merge_inode_lists __P((struct inodedep *));
	176	static void setup_allocindir_phase2 __P((struct buf , struct inode ,
	177	struct allocindir *));
	178	static struct allocindir newallocindir __P((struct inode , int, ufs_daddr_t,
	179	ufs_daddr_t));
	180	static void handle_workitem_freefrag __P((struct freefrag *));
	181	static struct freefrag newfreefrag __P((struct inode , ufs_daddr_t, long));
	182	static void allocdirect_merge __P((struct allocdirectlst *,
	183	struct allocdirect , struct allocdirect ));
	184	static struct bmsafemap bmsafemap_lookup __P((struct buf ));
	185	static int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
	186	struct newblk **));
	187	static int inodedep_lookup __P((struct fs , ino_t, int, struct inodedep *));
	188	static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
	189	struct pagedep **));
	190	static void pause_timer __P((void *));
	191	static int request_cleanup __P((int, int));
	192	static int process_worklist_item __P((struct mount *, int));
	193	static void add_to_worklist __P((struct worklist *));
	194
	195	/*
	196	* Exported softdep operations.
	197	*/
	198	static void softdep_disk_io_initiation __P((struct buf *));
	199	static void softdep_disk_write_complete __P((struct buf *));
	200	static void softdep_deallocate_dependencies __P((struct buf *));
	201	static int softdep_fsync __P((struct vnode *));
	202	static int softdep_process_worklist __P((struct mount *));
	203	static void softdep_move_dependencies __P((struct buf , struct buf ));
	204	static int softdep_count_dependencies __P((struct buf *bp, int));
	205
	206	struct bio_ops bioops = {
	207	softdep_disk_io_initiation, /* io_start */
	208	softdep_disk_write_complete, /* io_complete */
	209	softdep_deallocate_dependencies, /* io_deallocate */
	210	softdep_fsync, /* io_fsync */
	211	softdep_process_worklist, /* io_sync */
	212	softdep_move_dependencies, /* io_movedeps */
	213	softdep_count_dependencies, /* io_countdeps */
	214	};
	215
	216	/*
	217	* Locking primitives.
	218	*
	219	* For a uniprocessor, all we need to do is protect against disk
	220	* interrupts. For a multiprocessor, this lock would have to be
	221	* a mutex. A single mutex is used throughout this file, though
	222	* finer grain locking could be used if contention warranted it.
	223	*
	224	* For a multiprocessor, the sleep call would accept a lock and
	225	* release it after the sleep processing was complete. In a uniprocessor
	226	* implementation there is no such interlock, so we simply mark
	227	* the places where it needs to be done with the `interlocked' form
	228	* of the lock calls. Since the uniprocessor sleep already interlocks
	229	* the spl, there is nothing that really needs to be done.
	230	*/
	231	#ifndef /* NOT */ DEBUG
	232	static struct lockit {
	233	int lkt_spl;
	234	} lk = { 0 };
	235	#define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio()
	236	#define FREE_LOCK(lk) splx((lk)->lkt_spl)
	237
	238	#else /* DEBUG */
	239	#define NOHOLDER ((struct thread *)-1)
	240	#define SPECIAL_FLAG ((struct thread *)-2)
	241	static struct lockit {
	242	int lkt_spl;
	243	struct thread *lkt_held;
	244	} lk = { 0, NOHOLDER };
	245	static int lockcnt;
	246
	247	static void acquire_lock __P((struct lockit *));
	248	static void free_lock __P((struct lockit *));
	249	void softdep_panic __P((char *));
	250
	251	#define ACQUIRE_LOCK(lk) acquire_lock(lk)
	252	#define FREE_LOCK(lk) free_lock(lk)
	253
	254	static void
	255	acquire_lock(lk)
	256	struct lockit *lk;
	257	{
	258	thread_t holder;
	259
	260	if (lk->lkt_held != NOHOLDER) {
	261	holder = lk->lkt_held;
	262	FREE_LOCK(lk);
	263	if (holder == curthread)
	264	panic("softdep_lock: locking against myself");
	265	else
	266	panic("softdep_lock: lock held by %p", holder);
	267	}
	268	lk->lkt_spl = splbio();
	269	lk->lkt_held = curthread;
	270	lockcnt++;
	271	}
	272
	273	static void
	274	free_lock(lk)
	275	struct lockit *lk;
	276	{
	277
	278	if (lk->lkt_held == NOHOLDER)
	279	panic("softdep_unlock: lock not held");
	280	lk->lkt_held = NOHOLDER;
	281	splx(lk->lkt_spl);
	282	}
	283
	284	/*
	285	* Function to release soft updates lock and panic.
	286	*/
	287	void
	288	softdep_panic(msg)
	289	char *msg;
	290	{
	291
	292	if (lk.lkt_held != NOHOLDER)
	293	FREE_LOCK(&lk);
	294	panic(msg);
	295	}
	296	#endif /* DEBUG */
	297
	298	static int interlocked_sleep __P((struct lockit , int, void , int,
	299	const char *, int));
	300
	301	/*
	302	* When going to sleep, we must save our SPL so that it does
	303	* not get lost if some other process uses the lock while we
	304	* are sleeping. We restore it after we have slept. This routine
	305	* wraps the interlocking with functions that sleep. The list
	306	* below enumerates the available set of operations.
	307	*/
	308	#define UNKNOWN 0
	309	#define SLEEP 1
	310	#define LOCKBUF 2
	311
	312	static int
	313	interlocked_sleep(lk, op, ident, flags, wmesg, timo)
	314	struct lockit *lk;
	315	int op;
	316	void *ident;
	317	int flags;
	318	const char *wmesg;
	319	int timo;
	320	{
	321	thread_t holder;
	322	int s, retval;
	323
	324	s = lk->lkt_spl;
	325	# ifdef DEBUG
	326	if (lk->lkt_held == NOHOLDER)
	327	panic("interlocked_sleep: lock not held");
	328	lk->lkt_held = NOHOLDER;
	329	# endif /* DEBUG */
	330	switch (op) {
	331	case SLEEP:
	332	retval = tsleep(ident, flags, wmesg, timo);
	333	break;
	334	case LOCKBUF:
	335	retval = BUF_LOCK((struct buf *)ident, flags);
	336	break;
	337	default:
	338	panic("interlocked_sleep: unknown operation");
	339	}
	340	# ifdef DEBUG
	341	if (lk->lkt_held != NOHOLDER) {
	342	holder = lk->lkt_held;
	343	FREE_LOCK(lk);
	344	if (holder == curthread)
	345	panic("interlocked_sleep: locking against self");
	346	else
	347	panic("interlocked_sleep: lock held by %p", holder);
	348	}
	349	lk->lkt_held = curthread;
	350	lockcnt++;
	351	# endif /* DEBUG */
	352	lk->lkt_spl = s;
	353	return (retval);
	354	}
	355
	356	/*
	357	* Place holder for real semaphores.
	358	*/
	359	struct sema {
	360	int value;
	361	thread_t holder;
	362	char *name;
	363	int prio;
	364	int timo;
	365	};
	366	static void sema_init __P((struct sema , char , int, int));
	367	static int sema_get __P((struct sema , struct lockit ));
	368	static void sema_release __P((struct sema *));
	369
	370	static void
	371	sema_init(semap, name, prio, timo)
	372	struct sema *semap;
	373	char *name;
	374	int prio, timo;
	375	{
	376
	377	semap->holder = NOHOLDER;
	378	semap->value = 0;
	379	semap->name = name;
	380	semap->prio = prio;
	381	semap->timo = timo;
	382	}
	383
	384	static int
	385	sema_get(semap, interlock)
	386	struct sema *semap;
	387	struct lockit *interlock;
	388	{
	389
	390	if (semap->value++ > 0) {
	391	if (interlock != NULL) {
	392	interlocked_sleep(interlock, SLEEP, (caddr_t)semap,
	393	semap->prio, semap->name, semap->timo);
	394	FREE_LOCK(interlock);
	395	} else {
	396	tsleep((caddr_t)semap, semap->prio, semap->name,
	397	semap->timo);
	398	}
	399	return (0);
	400	}
	401	semap->holder = curthread;
	402	if (interlock != NULL)
	403	FREE_LOCK(interlock);
	404	return (1);
	405	}
	406
	407	static void
	408	sema_release(semap)
	409	struct sema *semap;
	410	{
	411
	412	if (semap->value <= 0 \|\| semap->holder != curthread) {
	413	if (lk.lkt_held != NOHOLDER)
	414	FREE_LOCK(&lk);
	415	panic("sema_release: not held");
	416	}
	417	if (--semap->value > 0) {
	418	semap->value = 0;
	419	wakeup(semap);
	420	}
	421	semap->holder = NOHOLDER;
	422	}
	423
	424	/*
	425	* Worklist queue management.
	426	* These routines require that the lock be held.
	427	*/
	428	#ifndef /* NOT */ DEBUG
	/* Unchecked worklist primitives used when DEBUG is not defined. */
	#define WORKLIST_INSERT(head, item) do {	\
		(item)->wk_state |= ONWORKLIST;		\
		LIST_INSERT_HEAD(head, item, wk_list);	\
	} while (0)
	#define WORKLIST_REMOVE(item) do {		\
		(item)->wk_state &= ~ONWORKLIST;	\
		LIST_REMOVE(item, wk_list);		\
	} while (0)
	#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))
	438
	439	#else /* DEBUG */
	440	static void worklist_insert __P((struct workhead , struct worklist ));
	441	static void worklist_remove __P((struct worklist *));
	442	static void workitem_free __P((struct worklist *, int));
	443
	444	#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
	445	#define WORKLIST_REMOVE(item) worklist_remove(item)
	446	#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
	447
	448	static void
	449	worklist_insert(head, item)
	450	struct workhead *head;
	451	struct worklist *item;
	452	{
	453
	454	if (lk.lkt_held == NOHOLDER)
	455	panic("worklist_insert: lock not held");
	456	if (item->wk_state & ONWORKLIST) {
	457	FREE_LOCK(&lk);
	458	panic("worklist_insert: already on list");
	459	}
	460	item->wk_state \|= ONWORKLIST;
	461	LIST_INSERT_HEAD(head, item, wk_list);
	462	}
	463
	464	static void
	465	worklist_remove(item)
	466	struct worklist *item;
	467	{
	468
	469	if (lk.lkt_held == NOHOLDER)
	470	panic("worklist_remove: lock not held");
	471	if ((item->wk_state & ONWORKLIST) == 0) {
	472	FREE_LOCK(&lk);
	473	panic("worklist_remove: not on list");
	474	}
	475	item->wk_state &= ~ONWORKLIST;
	476	LIST_REMOVE(item, wk_list);
	477	}
	478
	479	static void
	480	workitem_free(item, type)
	481	struct worklist *item;
	482	int type;
	483	{
	484
	485	if (item->wk_state & ONWORKLIST) {
	486	if (lk.lkt_held != NOHOLDER)
	487	FREE_LOCK(&lk);
	488	panic("workitem_free: still on list");
	489	}
	490	if (item->wk_type != type) {
	491	if (lk.lkt_held != NOHOLDER)
	492	FREE_LOCK(&lk);
	493	panic("workitem_free: type mismatch");
	494	}
	495	FREE(item, DtoM(type));
	496	}
	497	#endif /* DEBUG */
	498
	499	/*
	500	* Workitem queue management
	501	*/
	502	static struct workhead softdep_workitem_pending;
	503	static int num_on_worklist; /* number of worklist items to be processed */
	504	static int softdep_worklist_busy; /* 1 => trying to do unmount */
	505	static int softdep_worklist_req; /* serialized waiters */
	506	static int max_softdeps; /* maximum number of structs before slowdown */
	507	static int tickdelay = 2; /* number of ticks to pause during slowdown */
	508	static int stat_countp; / statistic to count in proc_waiting timeout */
	509	static int proc_waiting; /* tracks whether we have a timeout posted */
	510	static struct callout_handle handle; /* handle on posted proc_waiting timeout */
	511	static struct thread filesys_syncer; / proc of filesystem syncer process */
	512	static int req_clear_inodedeps; /* syncer process flush some inodedeps */
	513	#define FLUSH_INODES 1
	514	static int req_clear_remove; /* syncer process flush some freeblks */
	515	#define FLUSH_REMOVE 2
	516	/*
	517	* runtime statistics
	518	*/
	519	static int stat_worklist_push; /* number of worklist cleanups */
	520	static int stat_blk_limit_push; /* number of times block limit neared */
	521	static int stat_ino_limit_push; /* number of times inode limit neared */
	522	static int stat_blk_limit_hit; /* number of times block slowdown imposed */
	523	static int stat_ino_limit_hit; /* number of times inode slowdown imposed */
	524	static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
	525	static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
	526	static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
	527	static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
	528	static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
	529	#ifdef DEBUG
	530	#include <vm/vm.h>
	531	#include <sys/sysctl.h>
	532	SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
	533	SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
	534	SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
	535	SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
	536	SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
	537	SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
	538	SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
	539	SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
	540	SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
	541	SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
	542	SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
	543	SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
	544	#endif /* DEBUG */
	545
	546	/*
	547	* Add an item to the end of the work queue.
	548	* This routine requires that the lock be held.
	549	* This is the only routine that adds items to the list.
	550	* The following routine is the only one that removes items
	551	* and does so in order from first to last.
	552	*/
	553	static void
	554	add_to_worklist(wk)
	555	struct worklist *wk;
	556	{
	557	static struct worklist *worklist_tail;
	558
	559	if (wk->wk_state & ONWORKLIST) {
	560	if (lk.lkt_held != NOHOLDER)
	561	FREE_LOCK(&lk);
	562	panic("add_to_worklist: already on list");
	563	}
	564	wk->wk_state \|= ONWORKLIST;
	565	if (LIST_FIRST(&softdep_workitem_pending) == NULL)
	566	LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
	567	else
	568	LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
	569	worklist_tail = wk;
	570	num_on_worklist += 1;
	571	}
	572
	573	/*
	574	* Process that runs once per second to handle items in the background queue.
	575	*
	576	* Note that we ensure that everything is done in the order in which they
	577	* appear in the queue. The code below depends on this property to ensure
	578	* that blocks of a file are freed before the inode itself is freed. This
	579	* ordering ensures that no new <vfsid, inum, lbn> triples will be generated
	580	* until all the old ones have been purged from the dependency lists.
	581	*/
	582	static int
	583	softdep_process_worklist(matchmnt)
	584	struct mount *matchmnt;
	585	{
	586	thread_t td = curthread;
	587	int matchcnt, loopcount;
	588	long starttime;
	589
	590	/*
	591	* Record the process identifier of our caller so that we can give
	592	* this process preferential treatment in request_cleanup below.
	593	*/
	594	filesys_syncer = td;
	595	matchcnt = 0;
	596
	597	/*
	598	* There is no danger of having multiple processes run this
	599	* code, but we have to single-thread it when softdep_flushfiles()
	600	* is in operation to get an accurate count of the number of items
	601	* related to its mount point that are in the list.
	602	*/
	603	if (matchmnt == NULL) {
	604	if (softdep_worklist_busy < 0)
	605	return(-1);
	606	softdep_worklist_busy += 1;
	607	}
	608
	609	/*
	610	* If requested, try removing inode or removal dependencies.
	611	*/
	612	if (req_clear_inodedeps) {
	613	clear_inodedeps(td);
	614	req_clear_inodedeps -= 1;
	615	wakeup_one(&proc_waiting);
	616	}
	617	if (req_clear_remove) {
	618	clear_remove(td);
	619	req_clear_remove -= 1;
	620	wakeup_one(&proc_waiting);
	621	}
	622	loopcount = 1;
	623	starttime = time_second;
	624	while (num_on_worklist > 0) {
	625	matchcnt += process_worklist_item(matchmnt, 0);
	626
	627	/*
	628	* If a umount operation wants to run the worklist
	629	* accurately, abort.
	630	*/
	631	if (softdep_worklist_req && matchmnt == NULL) {
	632	matchcnt = -1;
	633	break;
	634	}
	635
	636	/*
	637	* If requested, try removing inode or removal dependencies.
	638	*/
	639	if (req_clear_inodedeps) {
	640	clear_inodedeps(td);
	641	req_clear_inodedeps -= 1;
	642	wakeup_one(&proc_waiting);
	643	}
	644	if (req_clear_remove) {
	645	clear_remove(td);
	646	req_clear_remove -= 1;
	647	wakeup_one(&proc_waiting);
	648	}
	649	/*
	650	* We do not generally want to stop for buffer space, but if
	651	* we are really being a buffer hog, we will stop and wait.
	652	*/
	653	if (loopcount++ % 128 == 0)
	654	bwillwrite();
	655	/*
	656	* Never allow processing to run for more than one
	657	* second. Otherwise the other syncer tasks may get
	658	* excessively backlogged.
	659	*/
	660	if (starttime != time_second && matchmnt == NULL) {
	661	matchcnt = -1;
	662	break;
	663	}
	664	}
	665	if (matchmnt == NULL) {
	666	--softdep_worklist_busy;
	667	if (softdep_worklist_req && softdep_worklist_busy == 0)
	668	wakeup(&softdep_worklist_req);
	669	}
	670	return (matchcnt);
	671	}
	672
	673	/*
	674	* Process one item on the worklist.
	675	*/
	676	static int
	677	process_worklist_item(matchmnt, flags)
	678	struct mount *matchmnt;
	679	int flags;
	680	{
	681	struct worklist *wk;
	682	struct dirrem *dirrem;
	683	struct fs *matchfs;
	684	struct vnode *vp;
	685	int matchcnt = 0;
	686
	687	matchfs = NULL;
	688	if (matchmnt != NULL)
	689	matchfs = VFSTOUFS(matchmnt)->um_fs;
	690	ACQUIRE_LOCK(&lk);
	691	/*
	692	* Normally we just process each item on the worklist in order.
	693	* However, if we are in a situation where we cannot lock any
	694	* inodes, we have to skip over any dirrem requests whose
	695	* vnodes are resident and locked.
	696	*/
	697	LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
	698	if ((flags & LK_NOWAIT) == 0 \|\| wk->wk_type != D_DIRREM)
	699	break;
	700	dirrem = WK_DIRREM(wk);
	701	vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev,
	702	dirrem->dm_oldinum);
	703	if (vp == NULL \|\| !VOP_ISLOCKED(vp, curthread))
	704	break;
	705	}
	706	if (wk == 0) {
	707	FREE_LOCK(&lk);
	708	return (0);
	709	}
	710	WORKLIST_REMOVE(wk);
	711	num_on_worklist -= 1;
	712	FREE_LOCK(&lk);
	713	switch (wk->wk_type) {
	714
	715	case D_DIRREM:
	716	/* removal of a directory entry */
	717	if (WK_DIRREM(wk)->dm_mnt == matchmnt)
	718	matchcnt += 1;
	719	handle_workitem_remove(WK_DIRREM(wk));
	720	break;
	721
	722	case D_FREEBLKS:
	723	/* releasing blocks and/or fragments from a file */
	724	if (WK_FREEBLKS(wk)->fb_fs == matchfs)
	725	matchcnt += 1;
	726	handle_workitem_freeblocks(WK_FREEBLKS(wk));
	727	break;
	728
	729	case D_FREEFRAG:
	730	/* releasing a fragment when replaced as a file grows */
	731	if (WK_FREEFRAG(wk)->ff_fs == matchfs)
	732	matchcnt += 1;
	733	handle_workitem_freefrag(WK_FREEFRAG(wk));
	734	break;
	735
	736	case D_FREEFILE:
	737	/* releasing an inode when its link count drops to 0 */
	738	if (WK_FREEFILE(wk)->fx_fs == matchfs)
	739	matchcnt += 1;
	740	handle_workitem_freefile(WK_FREEFILE(wk));
	741	break;
	742
	743	default:
	744	panic("%s_process_worklist: Unknown type %s",
	745	"softdep", TYPENAME(wk->wk_type));
	746	/* NOTREACHED */
	747	}
	748	return (matchcnt);
	749	}
	750
	751	/*
	752	* Move dependencies from one buffer to another.
	753	*/
	754	static void
	755	softdep_move_dependencies(oldbp, newbp)
	756	struct buf *oldbp;
	757	struct buf *newbp;
	758	{
	759	struct worklist wk, wktail;
	760
	761	if (LIST_FIRST(&newbp->b_dep) != NULL)
	762	panic("softdep_move_dependencies: need merge code");
	763	wktail = 0;
	764	ACQUIRE_LOCK(&lk);
	765	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
	766	LIST_REMOVE(wk, wk_list);
	767	if (wktail == 0)
	768	LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
	769	else
	770	LIST_INSERT_AFTER(wktail, wk, wk_list);
	771	wktail = wk;
	772	}
	773	FREE_LOCK(&lk);
	774	}
	775
	776	/*
	777	* Purge the work list of all items associated with a particular mount point.
	778	*/
	779	int
	780	softdep_flushfiles(struct mount oldmnt, int flags, struct thread td)
	781	{
	782	struct vnode *devvp;
	783	struct ucred *cred;
	784	int error, loopcnt;
	785
	786	KKASSERT(td->td_proc);
	787	cred = td->td_proc->p_ucred;
	788
	789	/*
	790	* Await our turn to clear out the queue, then serialize access.
	791	*/
	792	while (softdep_worklist_busy != 0) {
	793	softdep_worklist_req += 1;
	794	tsleep(&softdep_worklist_req, PRIBIO, "softflush", 0);
	795	softdep_worklist_req -= 1;
	796	}
	797	softdep_worklist_busy = -1;
	798
	799	if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0) {
	800	softdep_worklist_busy = 0;
	801	if (softdep_worklist_req)
	802	wakeup(&softdep_worklist_req);
	803	return (error);
	804	}
	805	/*
	806	* Alternately flush the block device associated with the mount
	807	* point and process any dependencies that the flushing
	808	* creates. In theory, this loop can happen at most twice,
	809	* but we give it a few extra just to be sure.
	810	*/
	811	devvp = VFSTOUFS(oldmnt)->um_devvp;
	812	for (loopcnt = 10; loopcnt > 0; ) {
	813	if (softdep_process_worklist(oldmnt) == 0) {
	814	loopcnt--;
	815	/*
	816	* Do another flush in case any vnodes were brought in
	817	* as part of the cleanup operations.
	818	*/
	819	if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
	820	break;
	821	/*
	822	* If we still found nothing to do, we are really done.
	823	*/
	824	if (softdep_process_worklist(oldmnt) == 0)
	825	break;
	826	}
	827	vn_lock(devvp, LK_EXCLUSIVE \| LK_RETRY, td);
	828	error = VOP_FSYNC(devvp, cred, MNT_WAIT, td);
	829	VOP_UNLOCK(devvp, 0, td);
	830	if (error)
	831	break;
	832	}
	833	softdep_worklist_busy = 0;
	834	if (softdep_worklist_req)
	835	wakeup(&softdep_worklist_req);
	836
	837	/*
	838	* If we are unmounting then it is an error to fail. If we
	839	* are simply trying to downgrade to read-only, then filesystem
	840	* activity can keep us busy forever, so we just fail with EBUSY.
	841	*/
	842	if (loopcnt == 0) {
	843	if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
	844	panic("softdep_flushfiles: looping");
	845	error = EBUSY;
	846	}
	847	return (error);
	848	}
	849
	850	/*
	851	* Structure hashing.
	852	*
	853	* There are three types of structures that can be looked up:
	854	* 1) pagedep structures identified by mount point, inode number,
	855	* and logical block.
	856	* 2) inodedep structures identified by mount point and inode number.
	857	* 3) newblk structures identified by mount point and
	858	* physical block number.
	859	*
	860	* The "pagedep" and "inodedep" dependency structures are hashed
	861	* separately from the file blocks and inodes to which they correspond.
	862	* This separation helps when the in-memory copy of an inode or
	863	* file block must be replaced. It also obviates the need to access
	864	* an inode or file page when simply updating (or de-allocating)
	865	* dependency structures. Lookup of newblk structures is needed to
	866	* find newly allocated blocks when trying to associate them with
	867	* their allocdirect or allocindir structure.
	868	*
	869	* The lookup routines optionally create and hash a new instance when
	870	* an existing entry is not found.
	871	*/
	872	#define DEPALLOC 0x0001 /* allocate structure if lookup fails */
	873	#define NODELAY 0x0002 /* cannot do background work */
	874
	875	/*
	876	* Structures and routines associated with pagedep caching.
	877	*/
	878	LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
	879	u_long pagedep_hash; /* size of hash table - 1 */
	880	#define PAGEDEP_HASH(mp, inum, lbn) \
	881	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
	882	pagedep_hash])
	883	static struct sema pagedep_in_progress;
	884
	885	/*
	886	* Look up a pagedep. Return 1 if found, 0 if not found.
	887	* If not found, allocate if DEPALLOC flag is passed.
	888	* Found or allocated entry is returned in pagedeppp.
	889	* This routine must be called with splbio interrupts blocked.
	890	*/
	891	static int
	892	pagedep_lookup(ip, lbn, flags, pagedeppp)
	893	struct inode *ip;
	894	ufs_lbn_t lbn;
	895	int flags;
	896	struct pagedep **pagedeppp;
	897	{
	898	struct pagedep *pagedep;
	899	struct pagedep_hashhead *pagedephd;
	900	struct mount *mp;
	901	int i;
	902
	903	#ifdef DEBUG
	904	if (lk.lkt_held == NOHOLDER)
	905	panic("pagedep_lookup: lock not held");
	906	#endif
	907	mp = ITOV(ip)->v_mount;
	908	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
	909	top:
	910	LIST_FOREACH(pagedep, pagedephd, pd_hash)
	911	if (ip->i_number == pagedep->pd_ino &&
	912	lbn == pagedep->pd_lbn &&
	913	mp == pagedep->pd_mnt)
	914	break;
	915	if (pagedep) {
	916	*pagedeppp = pagedep;
	917	return (1);
	918	}
	919	if ((flags & DEPALLOC) == 0) {
	920	*pagedeppp = NULL;
	921	return (0);
	922	}
	923	if (sema_get(&pagedep_in_progress, &lk) == 0) {
	924	ACQUIRE_LOCK(&lk);
	925	goto top;
	926	}
	927	MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
	928	M_SOFTDEP_FLAGS);
	929	bzero(pagedep, sizeof(struct pagedep));
	930	pagedep->pd_list.wk_type = D_PAGEDEP;
	931	pagedep->pd_mnt = mp;
	932	pagedep->pd_ino = ip->i_number;
	933	pagedep->pd_lbn = lbn;
	934	LIST_INIT(&pagedep->pd_dirremhd);
	935	LIST_INIT(&pagedep->pd_pendinghd);
	936	for (i = 0; i < DAHASHSZ; i++)
	937	LIST_INIT(&pagedep->pd_diraddhd[i]);
	938	ACQUIRE_LOCK(&lk);
	939	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
	940	sema_release(&pagedep_in_progress);
	941	*pagedeppp = pagedep;
	942	return (0);
	943	}
	944
	945	/*
	946	* Structures and routines associated with inodedep caching.
	947	*/
	948	LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
	949	static u_long inodedep_hash; /* size of hash table - 1 */
	950	static long num_inodedep; /* number of inodedep allocated */
	951	#define INODEDEP_HASH(fs, inum) \
	952	(&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
	953	static struct sema inodedep_in_progress;
	954
	955	/*
	956	* Look up a inodedep. Return 1 if found, 0 if not found.
	957	* If not found, allocate if DEPALLOC flag is passed.
	958	* Found or allocated entry is returned in inodedeppp.
	959	* This routine must be called with splbio interrupts blocked.
	960	*/
	961	static int
	962	inodedep_lookup(fs, inum, flags, inodedeppp)
	963	struct fs *fs;
	964	ino_t inum;
	965	int flags;
	966	struct inodedep **inodedeppp;
	967	{
	968	struct inodedep *inodedep;
	969	struct inodedep_hashhead *inodedephd;
	970	int firsttry;
	971
	972	#ifdef DEBUG
	973	if (lk.lkt_held == NOHOLDER)
	974	panic("inodedep_lookup: lock not held");
	975	#endif
	976	firsttry = 1;
	977	inodedephd = INODEDEP_HASH(fs, inum);
	978	top:
	979	LIST_FOREACH(inodedep, inodedephd, id_hash)
	980	if (inum == inodedep->id_ino && fs == inodedep->id_fs)
	981	break;
	982	if (inodedep) {
	983	*inodedeppp = inodedep;
	984	return (1);
	985	}
	986	if ((flags & DEPALLOC) == 0) {
	987	*inodedeppp = NULL;
	988	return (0);
	989	}
	990	/*
	991	* If we are over our limit, try to improve the situation.
	992	*/
	993	if (num_inodedep > max_softdeps && firsttry &&
	994	speedup_syncer() == 0 && (flags & NODELAY) == 0 &&
	995	request_cleanup(FLUSH_INODES, 1)) {
	996	firsttry = 0;
	997	goto top;
	998	}
	999	if (sema_get(&inodedep_in_progress, &lk) == 0) {
	1000	ACQUIRE_LOCK(&lk);
	1001	goto top;
	1002	}
	1003	num_inodedep += 1;
	1004	MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
	1005	M_INODEDEP, M_SOFTDEP_FLAGS);
	1006	inodedep->id_list.wk_type = D_INODEDEP;
	1007	inodedep->id_fs = fs;
	1008	inodedep->id_ino = inum;
	1009	inodedep->id_state = ALLCOMPLETE;
	1010	inodedep->id_nlinkdelta = 0;
	1011	inodedep->id_savedino = NULL;
	1012	inodedep->id_savedsize = -1;
	1013	inodedep->id_buf = NULL;
	1014	LIST_INIT(&inodedep->id_pendinghd);
	1015	LIST_INIT(&inodedep->id_inowait);
	1016	LIST_INIT(&inodedep->id_bufwait);
	1017	TAILQ_INIT(&inodedep->id_inoupdt);
	1018	TAILQ_INIT(&inodedep->id_newinoupdt);
	1019	ACQUIRE_LOCK(&lk);
	1020	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
	1021	sema_release(&inodedep_in_progress);
	1022	*inodedeppp = inodedep;
	1023	return (0);
	1024	}
	1025
	1026	/*
	1027	* Structures and routines associated with newblk caching.
	1028	*/
	1029	LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
	1030	u_long newblk_hash; /* size of hash table - 1 */
	1031	#define NEWBLK_HASH(fs, inum) \
	1032	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
	1033	static struct sema newblk_in_progress;
	1034
	1035	/*
	1036	* Look up a newblk. Return 1 if found, 0 if not found.
	1037	* If not found, allocate if DEPALLOC flag is passed.
	1038	* Found or allocated entry is returned in newblkpp.
	1039	*/
	1040	static int
	1041	newblk_lookup(fs, newblkno, flags, newblkpp)
	1042	struct fs *fs;
	1043	ufs_daddr_t newblkno;
	1044	int flags;
	1045	struct newblk **newblkpp;
	1046	{
	1047	struct newblk *newblk;
	1048	struct newblk_hashhead *newblkhd;
	1049
	1050	newblkhd = NEWBLK_HASH(fs, newblkno);
	1051	top:
	1052	LIST_FOREACH(newblk, newblkhd, nb_hash)
	1053	if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
	1054	break;
	1055	if (newblk) {
	1056	*newblkpp = newblk;
	1057	return (1);
	1058	}
	1059	if ((flags & DEPALLOC) == 0) {
	1060	*newblkpp = NULL;
	1061	return (0);
	1062	}
	1063	if (sema_get(&newblk_in_progress, 0) == 0)
	1064	goto top;
	1065	MALLOC(newblk, struct newblk *, sizeof(struct newblk),
	1066	M_NEWBLK, M_SOFTDEP_FLAGS);
	1067	newblk->nb_state = 0;
	1068	newblk->nb_fs = fs;
	1069	newblk->nb_newblkno = newblkno;
	1070	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
	1071	sema_release(&newblk_in_progress);
	1072	*newblkpp = newblk;
	1073	return (0);
	1074	}
	1075
	1076	/*
	1077	* Executed during filesystem system initialization before
	1078	* mounting any file systems.
	1079	*/
	1080	void
	1081	softdep_initialize()
	1082	{
	1083
	1084	LIST_INIT(&mkdirlisthd);
	1085	LIST_INIT(&softdep_workitem_pending);
	1086	max_softdeps = min(desiredvnodes * 8,
	1087	M_INODEDEP->ks_limit / (2 * sizeof(struct inodedep)));
	1088	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
	1089	&pagedep_hash);
	1090	sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
	1091	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
	1092	sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
	1093	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
	1094	sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
	1095	}
	1096
	1097	/*
	1098	* Called at mount time to notify the dependency code that a
	1099	* filesystem wishes to use it.
	1100	*/
	1101	int
	1102	softdep_mount(devvp, mp, fs, cred)
	1103	struct vnode *devvp;
	1104	struct mount *mp;
	1105	struct fs *fs;
	1106	struct ucred *cred;
	1107	{
	1108	struct csum cstotal;
	1109	struct cg *cgp;
	1110	struct buf *bp;
	1111	int error, cyl;
	1112
	1113	mp->mnt_flag &= ~MNT_ASYNC;
	1114	mp->mnt_flag \|= MNT_SOFTDEP;
	1115	/*
	1116	* When doing soft updates, the counters in the
	1117	* superblock may have gotten out of sync, so we have
	1118	* to scan the cylinder groups and recalculate them.
	1119	*/
	1120	if (fs->fs_clean != 0)
	1121	return (0);
	1122	bzero(&cstotal, sizeof cstotal);
	1123	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
	1124	if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
	1125	fs->fs_cgsize, cred, &bp)) != 0) {
	1126	brelse(bp);
	1127	return (error);
	1128	}
	1129	cgp = (struct cg *)bp->b_data;
	1130	cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
	1131	cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
	1132	cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
	1133	cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
	1134	fs->fs_cs(fs, cyl) = cgp->cg_cs;
	1135	brelse(bp);
	1136	}
	1137	#ifdef DEBUG
	1138	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
	1139	printf("ffs_mountfs: superblock updated for soft updates\n");
	1140	#endif
	1141	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
	1142	return (0);
	1143	}
	1144
	1145	/*
	1146	* Protecting the freemaps (or bitmaps).
	1147	*
	1148	* To eliminate the need to execute fsck before mounting a file system
	1149	* after a power failure, one must (conservatively) guarantee that the
	1150	* on-disk copy of the bitmaps never indicate that a live inode or block is
	1151	* free. So, when a block or inode is allocated, the bitmap should be
	1152	* updated (on disk) before any new pointers. When a block or inode is
	1153	* freed, the bitmap should not be updated until all pointers have been
	1154	* reset. The latter dependency is handled by the delayed de-allocation
	1155	* approach described below for block and inode de-allocation. The former
	1156	* dependency is handled by calling the following procedure when a block or
	1157	* inode is allocated. When an inode is allocated an "inodedep" is created
	1158	* with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
	1159	* Each "inodedep" is also inserted into the hash indexing structure so
	1160	* that any additional link additions can be made dependent on the inode
	1161	* allocation.
	1162	*
	1163	* The ufs file system maintains a number of free block counts (e.g., per
	1164	* cylinder group, per cylinder and per <cylinder, rotational position> pair)
	1165	* in addition to the bitmaps. These counts are used to improve efficiency
	1166	* during allocation and therefore must be consistent with the bitmaps.
	1167	* There is no convenient way to guarantee post-crash consistency of these
	1168	* counts with simple update ordering, for two main reasons: (1) The counts
	1169	* and bitmaps for a single cylinder group block are not in the same disk
	1170	* sector. If a disk write is interrupted (e.g., by power failure), one may
	1171	* be written and the other not. (2) Some of the counts are located in the
	1172	* superblock rather than the cylinder group block. So, we focus our soft
	1173	* updates implementation on protecting the bitmaps. When mounting a
	1174	* filesystem, we recompute the auxiliary counts from the bitmaps.
	1175	*/
	1176
	1177	/*
	1178	* Called just after updating the cylinder group block to allocate an inode.
	1179	*/
	1180	void
	1181	softdep_setup_inomapdep(bp, ip, newinum)
	1182	struct buf bp; / buffer for cylgroup block with inode map */
	1183	struct inode ip; / inode related to allocation */
	1184	ino_t newinum; /* new inode number being allocated */
	1185	{
	1186	struct inodedep *inodedep;
	1187	struct bmsafemap *bmsafemap;
	1188
	1189	/*
	1190	* Create a dependency for the newly allocated inode.
	1191	* Panic if it already exists as something is seriously wrong.
	1192	* Otherwise add it to the dependency list for the buffer holding
	1193	* the cylinder group map from which it was allocated.
	1194	*/
	1195	ACQUIRE_LOCK(&lk);
	1196	if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC\|NODELAY, &inodedep))) {
	1197	FREE_LOCK(&lk);
	1198	panic("softdep_setup_inomapdep: found inode");
	1199	}
	1200	inodedep->id_buf = bp;
	1201	inodedep->id_state &= ~DEPCOMPLETE;
	1202	bmsafemap = bmsafemap_lookup(bp);
	1203	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
	1204	FREE_LOCK(&lk);
	1205	}
	1206
	1207	/*
	1208	* Called just after updating the cylinder group block to
	1209	* allocate block or fragment.
	1210	*/
	1211	void
	1212	softdep_setup_blkmapdep(bp, fs, newblkno)
	1213	struct buf bp; / buffer for cylgroup block with block map */
	1214	struct fs fs; / filesystem doing allocation */
	1215	ufs_daddr_t newblkno; /* number of newly allocated block */
	1216	{
	1217	struct newblk *newblk;
	1218	struct bmsafemap *bmsafemap;
	1219
	1220	/*
	1221	* Create a dependency for the newly allocated block.
	1222	* Add it to the dependency list for the buffer holding
	1223	* the cylinder group map from which it was allocated.
	1224	*/
	1225	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
	1226	panic("softdep_setup_blkmapdep: found block");
	1227	ACQUIRE_LOCK(&lk);
	1228	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
	1229	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
	1230	FREE_LOCK(&lk);
	1231	}
	1232
	1233	/*
	1234	* Find the bmsafemap associated with a cylinder group buffer.
	1235	* If none exists, create one. The buffer must be locked when
	1236	* this routine is called and this routine must be called with
	1237	* splbio interrupts blocked.
	1238	*/
	1239	static struct bmsafemap *
	1240	bmsafemap_lookup(bp)
	1241	struct buf *bp;
	1242	{
	1243	struct bmsafemap *bmsafemap;
	1244	struct worklist *wk;
	1245
	1246	#ifdef DEBUG
	1247	if (lk.lkt_held == NOHOLDER)
	1248	panic("bmsafemap_lookup: lock not held");
	1249	#endif
	1250	LIST_FOREACH(wk, &bp->b_dep, wk_list)
	1251	if (wk->wk_type == D_BMSAFEMAP)
	1252	return (WK_BMSAFEMAP(wk));
	1253	FREE_LOCK(&lk);
	1254	MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
	1255	M_BMSAFEMAP, M_SOFTDEP_FLAGS);
	1256	bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
	1257	bmsafemap->sm_list.wk_state = 0;
	1258	bmsafemap->sm_buf = bp;
	1259	LIST_INIT(&bmsafemap->sm_allocdirecthd);
	1260	LIST_INIT(&bmsafemap->sm_allocindirhd);
	1261	LIST_INIT(&bmsafemap->sm_inodedephd);
	1262	LIST_INIT(&bmsafemap->sm_newblkhd);
	1263	ACQUIRE_LOCK(&lk);
	1264	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
	1265	return (bmsafemap);
	1266	}
	1267
	1268	/*
	1269	* Direct block allocation dependencies.
	1270	*
	1271	* When a new block is allocated, the corresponding disk locations must be
	1272	* initialized (with zeros or new data) before the on-disk inode points to
	1273	* them. Also, the freemap from which the block was allocated must be
	1274	* updated (on disk) before the inode's pointer. These two dependencies are
	1275	* independent of each other and are needed for all file blocks and indirect
	1276	* blocks that are pointed to directly by the inode. Just before the
	1277	* "in-core" version of the inode is updated with a newly allocated block
	1278	* number, a procedure (below) is called to setup allocation dependency
	1279	* structures. These structures are removed when the corresponding
	1280	* dependencies are satisfied or when the block allocation becomes obsolete
	1281	* (i.e., the file is deleted, the block is de-allocated, or the block is a
	1282	* fragment that gets upgraded). All of these cases are handled in
	1283	* procedures described later.
	1284	*
	1285	* When a file extension causes a fragment to be upgraded, either to a larger
	1286	* fragment or to a full block, the on-disk location may change (if the
	1287	* previous fragment could not simply be extended). In this case, the old
	1288	* fragment must be de-allocated, but not until after the inode's pointer has
	1289	* been updated. In most cases, this is handled by later procedures, which
	1290	* will construct a "freefrag" structure to be added to the workitem queue
	1291	* when the inode update is complete (or obsolete). The main exception to
	1292	* this is when an allocation occurs while a pending allocation dependency
	1293	* (for the same block pointer) remains. This case is handled in the main
	1294	* allocation dependency setup procedure by immediately freeing the
	1295	* unreferenced fragments.
	1296	*/
	1297	void
	1298	softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	1299	struct inode ip; / inode to which block is being added */
	1300	ufs_lbn_t lbn; /* block pointer within inode */
	1301	ufs_daddr_t newblkno; /* disk block number being added */
	1302	ufs_daddr_t oldblkno; /* previous block number, 0 unless frag */
	1303	long newsize; /* size of new block */
	1304	long oldsize; /* size of new block */
	1305	struct buf bp; / bp for allocated block */
	1306	{
	1307	struct allocdirect adp, oldadp;
	1308	struct allocdirectlst *adphead;
	1309	struct bmsafemap *bmsafemap;
	1310	struct inodedep *inodedep;
	1311	struct pagedep *pagedep;
	1312	struct newblk *newblk;
	1313
	1314	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
	1315	M_ALLOCDIRECT, M_SOFTDEP_FLAGS);
	1316	bzero(adp, sizeof(struct allocdirect));
	1317	adp->ad_list.wk_type = D_ALLOCDIRECT;
	1318	adp->ad_lbn = lbn;
	1319	adp->ad_newblkno = newblkno;
	1320	adp->ad_oldblkno = oldblkno;
	1321	adp->ad_newsize = newsize;
	1322	adp->ad_oldsize = oldsize;
	1323	adp->ad_state = ATTACHED;
	1324	if (newblkno == oldblkno)
	1325	adp->ad_freefrag = NULL;
	1326	else
	1327	adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
	1328
	1329	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
	1330	panic("softdep_setup_allocdirect: lost block");
	1331
	1332	ACQUIRE_LOCK(&lk);
	1333	inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC \| NODELAY, &inodedep);
	1334	adp->ad_inodedep = inodedep;
	1335
	1336	if (newblk->nb_state == DEPCOMPLETE) {
	1337	adp->ad_state \|= DEPCOMPLETE;
	1338	adp->ad_buf = NULL;
	1339	} else {
	1340	bmsafemap = newblk->nb_bmsafemap;
	1341	adp->ad_buf = bmsafemap->sm_buf;
	1342	LIST_REMOVE(newblk, nb_deps);
	1343	LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
	1344	}
	1345	LIST_REMOVE(newblk, nb_hash);
	1346	FREE(newblk, M_NEWBLK);
	1347
	1348	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
	1349	if (lbn >= NDADDR) {
	1350	/* allocating an indirect block */
	1351	if (oldblkno != 0) {
	1352	FREE_LOCK(&lk);
	1353	panic("softdep_setup_allocdirect: non-zero indir");
	1354	}
	1355	} else {
	1356	/*
	1357	* Allocating a direct block.
	1358	*
	1359	* If we are allocating a directory block, then we must
	1360	* allocate an associated pagedep to track additions and
	1361	* deletions.
	1362	*/
	1363	if ((ip->i_mode & IFMT) == IFDIR &&
	1364	pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
	1365	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	1366	}
	1367	/*
	1368	* The list of allocdirects must be kept in sorted and ascending
	1369	* order so that the rollback routines can quickly determine the
	1370	* first uncommitted block (the size of the file stored on disk
	1371	* ends at the end of the lowest committed fragment, or if there
	1372	* are no fragments, at the end of the highest committed block).
	1373	* Since files generally grow, the typical case is that the new
	1374	* block is to be added at the end of the list. We speed this
	1375	* special case by checking against the last allocdirect in the
	1376	* list before laboriously traversing the list looking for the
	1377	* insertion point.
	1378	*/
	1379	adphead = &inodedep->id_newinoupdt;
	1380	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	1381	if (oldadp == NULL \|\| oldadp->ad_lbn <= lbn) {
	1382	/* insert at end of list */
	1383	TAILQ_INSERT_TAIL(adphead, adp, ad_next);
	1384	if (oldadp != NULL && oldadp->ad_lbn == lbn)
	1385	allocdirect_merge(adphead, adp, oldadp);
	1386	FREE_LOCK(&lk);
	1387	return;
	1388	}
	1389	TAILQ_FOREACH(oldadp, adphead, ad_next) {
	1390	if (oldadp->ad_lbn >= lbn)
	1391	break;
	1392	}
	1393	if (oldadp == NULL) {
	1394	FREE_LOCK(&lk);
	1395	panic("softdep_setup_allocdirect: lost entry");
	1396	}
	1397	/* insert in middle of list */
	1398	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
	1399	if (oldadp->ad_lbn == lbn)
	1400	allocdirect_merge(adphead, adp, oldadp);
	1401	FREE_LOCK(&lk);
	1402	}
	1403
	1404	/*
	1405	* Replace an old allocdirect dependency with a newer one.
	1406	* This routine must be called with splbio interrupts blocked.
	1407	*/
	1408	static void
	1409	allocdirect_merge(adphead, newadp, oldadp)
	1410	struct allocdirectlst adphead; / head of list holding allocdirects */
	1411	struct allocdirect newadp; / allocdirect being added */
	1412	struct allocdirect oldadp; / existing allocdirect being checked */
	1413	{
	1414	struct freefrag *freefrag;
	1415
	1416	#ifdef DEBUG
	1417	if (lk.lkt_held == NOHOLDER)
	1418	panic("allocdirect_merge: lock not held");
	1419	#endif
	1420	if (newadp->ad_oldblkno != oldadp->ad_newblkno \|\|
	1421	newadp->ad_oldsize != oldadp->ad_newsize \|\|
	1422	newadp->ad_lbn >= NDADDR) {
	1423	FREE_LOCK(&lk);
	1424	panic("allocdirect_check: old %d != new %d \|\| lbn %ld >= %d",
	1425	newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
	1426	NDADDR);
	1427	}
	1428	newadp->ad_oldblkno = oldadp->ad_oldblkno;
	1429	newadp->ad_oldsize = oldadp->ad_oldsize;
	1430	/*
	1431	* If the old dependency had a fragment to free or had never
	1432	* previously had a block allocated, then the new dependency
	1433	* can immediately post its freefrag and adopt the old freefrag.
	1434	* This action is done by swapping the freefrag dependencies.
	1435	* The new dependency gains the old one's freefrag, and the
	1436	* old one gets the new one and then immediately puts it on
	1437	* the worklist when it is freed by free_allocdirect. It is
	1438	* not possible to do this swap when the old dependency had a
	1439	* non-zero size but no previous fragment to free. This condition
	1440	* arises when the new block is an extension of the old block.
	1441	* Here, the first part of the fragment allocated to the new
	1442	* dependency is part of the block currently claimed on disk by
	1443	* the old dependency, so cannot legitimately be freed until the
	1444	* conditions for the new dependency are fulfilled.
	1445	*/
	1446	if (oldadp->ad_freefrag != NULL \|\| oldadp->ad_oldblkno == 0) {
	1447	freefrag = newadp->ad_freefrag;
	1448	newadp->ad_freefrag = oldadp->ad_freefrag;
	1449	oldadp->ad_freefrag = freefrag;
	1450	}
	1451	free_allocdirect(adphead, oldadp, 0);
	1452	}
	1453
	1454	/*
	1455	* Allocate a new freefrag structure if needed.
	1456	*/
	1457	static struct freefrag *
	1458	newfreefrag(ip, blkno, size)
	1459	struct inode *ip;
	1460	ufs_daddr_t blkno;
	1461	long size;
	1462	{
	1463	struct freefrag *freefrag;
	1464	struct fs *fs;
	1465
	1466	if (blkno == 0)
	1467	return (NULL);
	1468	fs = ip->i_fs;
	1469	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
	1470	panic("newfreefrag: frag size");
	1471	MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
	1472	M_FREEFRAG, M_SOFTDEP_FLAGS);
	1473	freefrag->ff_list.wk_type = D_FREEFRAG;
	1474	freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */
	1475	freefrag->ff_inum = ip->i_number;
	1476	freefrag->ff_fs = fs;
	1477	freefrag->ff_devvp = ip->i_devvp;
	1478	freefrag->ff_blkno = blkno;
	1479	freefrag->ff_fragsize = size;
	1480	return (freefrag);
	1481	}
	1482
	1483	/*
	1484	* This workitem de-allocates fragments that were replaced during
	1485	* file block allocation.
	1486	*/
	1487	static void
	1488	handle_workitem_freefrag(freefrag)
	1489	struct freefrag *freefrag;
	1490	{
	1491	struct inode tip;
	1492
	1493	tip.i_fs = freefrag->ff_fs;
	1494	tip.i_devvp = freefrag->ff_devvp;
	1495	tip.i_dev = freefrag->ff_devvp->v_rdev;
	1496	tip.i_number = freefrag->ff_inum;
	1497	tip.i_uid = freefrag->ff_state & ~ONWORKLIST; /* XXX - set above */
	1498	ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
	1499	FREE(freefrag, M_FREEFRAG);
	1500	}
	1501
	1502	/*
	1503	* Indirect block allocation dependencies.
	1504	*
	1505	* The same dependencies that exist for a direct block also exist when
	1506	* a new block is allocated and pointed to by an entry in a block of
	1507	* indirect pointers. The undo/redo states described above are also
	1508	* used here. Because an indirect block contains many pointers that
	1509	* may have dependencies, a second copy of the entire in-memory indirect
	1510	* block is kept. The buffer cache copy is always completely up-to-date.
	1511	* The second copy, which is used only as a source for disk writes,
	1512	* contains only the safe pointers (i.e., those that have no remaining
	1513	* update dependencies). The second copy is freed when all pointers
	1514	* are safe. The cache is not allowed to replace indirect blocks with
	1515	* pending update dependencies. If a buffer containing an indirect
	1516	* block with dependencies is written, these routines will mark it
	1517	* dirty again. It can only be successfully written once all the
	1518	* dependencies are removed. The ffs_fsync routine in conjunction with
	1519	* softdep_sync_metadata work together to get all the dependencies
	1520	* removed so that a file can be successfully written to disk. Three
	1521	* procedures are used when setting up indirect block pointer
	1522	* dependencies. The division is necessary because of the organization
	1523	* of the "balloc" routine and because of the distinction between file
	1524	* pages and file metadata blocks.
	1525	*/
	1526
	1527	/*
	1528	* Allocate a new allocindir structure.
	1529	*/
	1530	static struct allocindir *
	1531	newallocindir(ip, ptrno, newblkno, oldblkno)
	1532	struct inode ip; / inode for file being extended */
	1533	int ptrno; /* offset of pointer in indirect block */
	1534	ufs_daddr_t newblkno; /* disk block number being added */
	1535	ufs_daddr_t oldblkno; /* previous block number, 0 if none */
	1536	{
	1537	struct allocindir *aip;
	1538
	1539	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
	1540	M_ALLOCINDIR, M_SOFTDEP_FLAGS);
	1541	bzero(aip, sizeof(struct allocindir));
	1542	aip->ai_list.wk_type = D_ALLOCINDIR;
	1543	aip->ai_state = ATTACHED;
	1544	aip->ai_offset = ptrno;
	1545	aip->ai_newblkno = newblkno;
	1546	aip->ai_oldblkno = oldblkno;
	1547	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
	1548	return (aip);
	1549	}
	1550
	1551	/*
	1552	* Called just before setting an indirect block pointer
	1553	* to a newly allocated file page.
	1554	*/
	1555	void
	1556	softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
	1557	struct inode ip; / inode for file being extended */
	1558	ufs_lbn_t lbn; /* allocated block number within file */
	1559	struct buf bp; / buffer with indirect blk referencing page */
	1560	int ptrno; /* offset of pointer in indirect block */
	1561	ufs_daddr_t newblkno; /* disk block number being added */
	1562	ufs_daddr_t oldblkno; /* previous block number, 0 if none */
	1563	struct buf nbp; / buffer holding allocated page */
	1564	{
	1565	struct allocindir *aip;
	1566	struct pagedep *pagedep;
	1567
	1568	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
	1569	ACQUIRE_LOCK(&lk);
	1570	/*
	1571	* If we are allocating a directory page, then we must
	1572	* allocate an associated pagedep to track additions and
	1573	* deletions.
	1574	*/
	1575	if ((ip->i_mode & IFMT) == IFDIR &&
	1576	pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
	1577	WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
	1578	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	1579	FREE_LOCK(&lk);
	1580	setup_allocindir_phase2(bp, ip, aip);
	1581	}
	1582
	1583	/*
	1584	* Called just before setting an indirect block pointer to a
	1585	* newly allocated indirect block.
	1586	*/
	1587	void
	1588	softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
	1589	struct buf nbp; / newly allocated indirect block */
	1590	struct inode ip; / inode for file being extended */
	1591	struct buf bp; / indirect block referencing allocated block */
	1592	int ptrno; /* offset of pointer in indirect block */
	1593	ufs_daddr_t newblkno; /* disk block number being added */
	1594	{
	1595	struct allocindir *aip;
	1596
	1597	aip = newallocindir(ip, ptrno, newblkno, 0);
	1598	ACQUIRE_LOCK(&lk);
	1599	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	1600	FREE_LOCK(&lk);
	1601	setup_allocindir_phase2(bp, ip, aip);
	1602	}
	1603
	1604	/*
	1605	* Called to finish the allocation of the "aip" allocated
	1606	* by one of the two routines above.
	1607	*/
	1608	static void
	1609	setup_allocindir_phase2(bp, ip, aip)
	1610	struct buf bp; / in-memory copy of the indirect block */
	1611	struct inode ip; / inode for file being extended */
	1612	struct allocindir aip; / allocindir allocated by the above routines */
	1613	{
	1614	struct worklist *wk;
	1615	struct indirdep indirdep, newindirdep;
	1616	struct bmsafemap *bmsafemap;
	1617	struct allocindir *oldaip;
	1618	struct freefrag *freefrag;
	1619	struct newblk *newblk;
	1620
	1621	if (bp->b_lblkno >= 0)
	1622	panic("setup_allocindir_phase2: not indir blk");
	1623	for (indirdep = NULL, newindirdep = NULL; ; ) {
	1624	ACQUIRE_LOCK(&lk);
	1625	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
	1626	if (wk->wk_type != D_INDIRDEP)
	1627	continue;
	1628	indirdep = WK_INDIRDEP(wk);
	1629	break;
	1630	}
	1631	if (indirdep == NULL && newindirdep) {
	1632	indirdep = newindirdep;
	1633	WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
	1634	newindirdep = NULL;
	1635	}
	1636	FREE_LOCK(&lk);
	1637	if (indirdep) {
	1638	if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
	1639	&newblk) == 0)
	1640	panic("setup_allocindir: lost block");
	1641	ACQUIRE_LOCK(&lk);
	1642	if (newblk->nb_state == DEPCOMPLETE) {
	1643	aip->ai_state \|= DEPCOMPLETE;
	1644	aip->ai_buf = NULL;
	1645	} else {
	1646	bmsafemap = newblk->nb_bmsafemap;
	1647	aip->ai_buf = bmsafemap->sm_buf;
	1648	LIST_REMOVE(newblk, nb_deps);
	1649	LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
	1650	aip, ai_deps);
	1651	}
	1652	LIST_REMOVE(newblk, nb_hash);
	1653	FREE(newblk, M_NEWBLK);
	1654	aip->ai_indirdep = indirdep;
	1655	/*
	1656	* Check to see if there is an existing dependency
	1657	* for this block. If there is, merge the old
	1658	* dependency into the new one.
	1659	*/
	1660	if (aip->ai_oldblkno == 0)
	1661	oldaip = NULL;
	1662	else
	1663
	1664	LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
	1665	if (oldaip->ai_offset == aip->ai_offset)
	1666	break;
	1667	if (oldaip != NULL) {
	1668	if (oldaip->ai_newblkno != aip->ai_oldblkno) {
	1669	FREE_LOCK(&lk);
	1670	panic("setup_allocindir_phase2: blkno");
	1671	}
	1672	aip->ai_oldblkno = oldaip->ai_oldblkno;
	1673	freefrag = oldaip->ai_freefrag;
	1674	oldaip->ai_freefrag = aip->ai_freefrag;
	1675	aip->ai_freefrag = freefrag;
	1676	free_allocindir(oldaip, NULL);
	1677	}
	1678	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
	1679	((ufs_daddr_t *)indirdep->ir_savebp->b_data)
	1680	[aip->ai_offset] = aip->ai_oldblkno;
	1681	FREE_LOCK(&lk);
	1682	}
	1683	if (newindirdep) {
	1684	if (indirdep->ir_savebp != NULL)
	1685	brelse(newindirdep->ir_savebp);
	1686	WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
	1687	}
	1688	if (indirdep)
	1689	break;
	1690	MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
	1691	M_INDIRDEP, M_SOFTDEP_FLAGS);
	1692	newindirdep->ir_list.wk_type = D_INDIRDEP;
	1693	newindirdep->ir_state = ATTACHED;
	1694	LIST_INIT(&newindirdep->ir_deplisthd);
	1695	LIST_INIT(&newindirdep->ir_donehd);
	1696	if (bp->b_blkno == bp->b_lblkno) {
	1697	VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
	1698	NULL, NULL);
	1699	}
	1700	newindirdep->ir_savebp =
	1701	getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
	1702	BUF_KERNPROC(newindirdep->ir_savebp);
	1703	bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
	1704	}
	1705	}
	1706
	1707	/*
	1708	* Block de-allocation dependencies.
	1709	*
	1710	* When blocks are de-allocated, the on-disk pointers must be nullified before
	1711	* the blocks are made available for use by other files. (The true
	1712	* requirement is that old pointers must be nullified before new on-disk
	1713	* pointers are set. We chose this slightly more stringent requirement to
	1714	* reduce complexity.) Our implementation handles this dependency by updating
	1715	* the inode (or indirect block) appropriately but delaying the actual block
	1716	* de-allocation (i.e., freemap and free space count manipulation) until
	1717	* after the updated versions reach stable storage. After the disk is
	1718	* updated, the blocks can be safely de-allocated whenever it is convenient.
	1719	* This implementation handles only the common case of reducing a file's
	1720	* length to zero. Other cases are handled by the conventional synchronous
	1721	* write approach.
	1722	*
	1723	* The ffs implementation with which we worked double-checks
	1724	* the state of the block pointers and file size as it reduces
	1725	* a file's length. Some of this code is replicated here in our
	1726	* soft updates implementation. The freeblks->fb_chkcnt field is
	1727	* used to transfer a part of this information to the procedure
	1728	* that eventually de-allocates the blocks.
	1729	*
	1730	* This routine should be called from the routine that shortens
	1731	* a file's length, before the inode's size or block pointers
	1732	* are modified. It will save the block pointer information for
	1733	* later release and zero the inode so that the calling routine
	1734	* can release it.
	1735	*/
	1736	void
	1737	softdep_setup_freeblocks(ip, length)
	1738	struct inode ip; / The inode whose length is to be reduced */
	1739	off_t length; /* The new length for the file */
	1740	{
	1741	struct freeblks *freeblks;
	1742	struct inodedep *inodedep;
	1743	struct allocdirect *adp;
	1744	struct vnode *vp;
	1745	struct buf *bp;
	1746	struct fs *fs;
	1747	int i, error, delay;
	1748
	1749	fs = ip->i_fs;
	1750	if (length != 0)
	1751	panic("softde_setup_freeblocks: non-zero length");
	1752	MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
	1753	M_FREEBLKS, M_SOFTDEP_FLAGS);
	1754	bzero(freeblks, sizeof(struct freeblks));
	1755	freeblks->fb_list.wk_type = D_FREEBLKS;
	1756	freeblks->fb_uid = ip->i_uid;
	1757	freeblks->fb_previousinum = ip->i_number;
	1758	freeblks->fb_devvp = ip->i_devvp;
	1759	freeblks->fb_fs = fs;
	1760	freeblks->fb_oldsize = ip->i_size;
	1761	freeblks->fb_newsize = length;
	1762	freeblks->fb_chkcnt = ip->i_blocks;
	1763	for (i = 0; i < NDADDR; i++) {
	1764	freeblks->fb_dblks[i] = ip->i_db[i];
	1765	ip->i_db[i] = 0;
	1766	}
	1767	for (i = 0; i < NIADDR; i++) {
	1768	freeblks->fb_iblks[i] = ip->i_ib[i];
	1769	ip->i_ib[i] = 0;
	1770	}
	1771	ip->i_blocks = 0;
	1772	ip->i_size = 0;
	1773	/*
	1774	* Push the zero'ed inode to its disk buffer so that we are free
	1775	* to delete its dependencies below. Once the dependencies are gone
	1776	* the buffer can be safely released.
	1777	*/
	1778	if ((error = bread(ip->i_devvp,
	1779	fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
	1780	(int)fs->fs_bsize, NOCRED, &bp)) != 0)
	1781	softdep_error("softdep_setup_freeblocks", error);
	1782	((struct dinode )bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
	1783	ip->i_din;
	1784	/*
	1785	* Find and eliminate any inode dependencies.
	1786	*/
	1787	ACQUIRE_LOCK(&lk);
	1788	(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
	1789	if ((inodedep->id_state & IOSTARTED) != 0) {
	1790	FREE_LOCK(&lk);
	1791	panic("softdep_setup_freeblocks: inode busy");
	1792	}
	1793	/*
	1794	* Add the freeblks structure to the list of operations that
	1795	* must await the zero'ed inode being written to disk. If we
	1796	* still have a bitmap dependency (delay == 0), then the inode
	1797	* has never been written to disk, so we can process the
	1798	* freeblks below once we have deleted the dependencies.
	1799	*/
	1800	delay = (inodedep->id_state & DEPCOMPLETE);
	1801	if (delay)
	1802	WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
	1803	/*
	1804	* Because the file length has been truncated to zero, any
	1805	* pending block allocation dependency structures associated
	1806	* with this inode are obsolete and can simply be de-allocated.
	1807	* We must first merge the two dependency lists to get rid of
	1808	* any duplicate freefrag structures, then purge the merged list.
	1809	*/
	1810	merge_inode_lists(inodedep);
	1811	while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
	1812	free_allocdirect(&inodedep->id_inoupdt, adp, 1);
	1813	FREE_LOCK(&lk);
	1814	bdwrite(bp);
	1815	/*
	1816	* We must wait for any I/O in progress to finish so that
	1817	* all potential buffers on the dirty list will be visible.
	1818	* Once they are all there, walk the list and get rid of
	1819	* any dependencies.
	1820	*/
	1821	vp = ITOV(ip);
	1822	ACQUIRE_LOCK(&lk);
	1823	drain_output(vp, 1);
	1824	while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) {
	1825	bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
	1826	(void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
	1827	deallocate_dependencies(bp, inodedep);
	1828	bp->b_flags \|= B_INVAL \| B_NOCACHE;
	1829	FREE_LOCK(&lk);
	1830	brelse(bp);
	1831	ACQUIRE_LOCK(&lk);
	1832	}
	1833	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
	1834	(void)free_inodedep(inodedep);
	1835	FREE_LOCK(&lk);
	1836	/*
	1837	* If the inode has never been written to disk (delay == 0),
	1838	* then we can process the freeblks now that we have deleted
	1839	* the dependencies.
	1840	*/
	1841	if (!delay)
	1842	handle_workitem_freeblocks(freeblks);
	1843	}
	1844
	1845	/*
	1846	* Reclaim any dependency structures from a buffer that is about to
	1847	* be reallocated to a new vnode. The buffer must be locked, thus,
	1848	* no I/O completion operations can occur while we are manipulating
	1849	* its associated dependencies. The mutex is held so that other I/O's
	1850	* associated with related dependencies do not occur.
	1851	*/
	1852	static void
	1853	deallocate_dependencies(bp, inodedep)
	1854	struct buf *bp;
	1855	struct inodedep *inodedep;
	1856	{
	1857	struct worklist *wk;
	1858	struct indirdep *indirdep;
	1859	struct allocindir *aip;
	1860	struct pagedep *pagedep;
	1861	struct dirrem *dirrem;
	1862	struct diradd *dap;
	1863	int i;
	1864
	1865	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
	1866	switch (wk->wk_type) {
	1867
	1868	case D_INDIRDEP:
	1869	indirdep = WK_INDIRDEP(wk);
	1870	/*
	1871	* None of the indirect pointers will ever be visible,
	1872	* so they can simply be tossed. GOINGAWAY ensures
	1873	* that allocated pointers will be saved in the buffer
	1874	* cache until they are freed. Note that they will
	1875	* only be able to be found by their physical address
	1876	* since the inode mapping the logical address will
	1877	* be gone. The save buffer used for the safe copy
	1878	* was allocated in setup_allocindir_phase2 using
	1879	* the physical address so it could be used for this
	1880	* purpose. Hence we swap the safe copy with the real
	1881	* copy, allowing the safe copy to be freed and holding
	1882	* on to the real copy for later use in indir_trunc.
	1883	*/
	1884	if (indirdep->ir_state & GOINGAWAY) {
	1885	FREE_LOCK(&lk);
	1886	panic("deallocate_dependencies: already gone");
	1887	}
	1888	indirdep->ir_state \|= GOINGAWAY;
	1889	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
	1890	free_allocindir(aip, inodedep);
	1891	if (bp->b_lblkno >= 0 \|\|
	1892	bp->b_blkno != indirdep->ir_savebp->b_lblkno) {
	1893	FREE_LOCK(&lk);
	1894	panic("deallocate_dependencies: not indir");
	1895	}
	1896	bcopy(bp->b_data, indirdep->ir_savebp->b_data,
	1897	bp->b_bcount);
	1898	WORKLIST_REMOVE(wk);
	1899	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
	1900	continue;
	1901
	1902	case D_PAGEDEP:
	1903	pagedep = WK_PAGEDEP(wk);
	1904	/*
	1905	* None of the directory additions will ever be
	1906	* visible, so they can simply be tossed.
	1907	*/
	1908	for (i = 0; i < DAHASHSZ; i++)
	1909	while ((dap =
	1910	LIST_FIRST(&pagedep->pd_diraddhd[i])))
	1911	free_diradd(dap);
	1912	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
	1913	free_diradd(dap);
	1914	/*
	1915	* Copy any directory remove dependencies to the list
	1916	* to be processed after the zero'ed inode is written.
	1917	* If the inode has already been written, then they
	1918	* can be dumped directly onto the work list.
	1919	*/
	1920	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
	1921	LIST_REMOVE(dirrem, dm_next);
	1922	dirrem->dm_dirinum = pagedep->pd_ino;
	1923	if (inodedep == NULL \|\|
	1924	(inodedep->id_state & ALLCOMPLETE) ==
	1925	ALLCOMPLETE)
	1926	add_to_worklist(&dirrem->dm_list);
	1927	else
	1928	WORKLIST_INSERT(&inodedep->id_bufwait,
	1929	&dirrem->dm_list);
	1930	}
	1931	WORKLIST_REMOVE(&pagedep->pd_list);
	1932	LIST_REMOVE(pagedep, pd_hash);
	1933	WORKITEM_FREE(pagedep, D_PAGEDEP);
	1934	continue;
	1935
	1936	case D_ALLOCINDIR:
	1937	free_allocindir(WK_ALLOCINDIR(wk), inodedep);
	1938	continue;
	1939
	1940	case D_ALLOCDIRECT:
	1941	case D_INODEDEP:
	1942	FREE_LOCK(&lk);
	1943	panic("deallocate_dependencies: Unexpected type %s",
	1944	TYPENAME(wk->wk_type));
	1945	/* NOTREACHED */
	1946
	1947	default:
	1948	FREE_LOCK(&lk);
	1949	panic("deallocate_dependencies: Unknown type %s",
	1950	TYPENAME(wk->wk_type));
	1951	/* NOTREACHED */
	1952	}
	1953	}
	1954	}
	1955
	1956	/*
	1957	* Free an allocdirect. Generate a new freefrag work request if appropriate.
	1958	* This routine must be called with splbio interrupts blocked.
	1959	*/
	1960	static void
	1961	free_allocdirect(adphead, adp, delay)
	1962	struct allocdirectlst *adphead;
	1963	struct allocdirect *adp;
	1964	int delay;
	1965	{
	1966
	1967	#ifdef DEBUG
	1968	if (lk.lkt_held == NOHOLDER)
	1969	panic("free_allocdirect: lock not held");
	1970	#endif
	1971	if ((adp->ad_state & DEPCOMPLETE) == 0)
	1972	LIST_REMOVE(adp, ad_deps);
	1973	TAILQ_REMOVE(adphead, adp, ad_next);
	1974	if ((adp->ad_state & COMPLETE) == 0)
	1975	WORKLIST_REMOVE(&adp->ad_list);
	1976	if (adp->ad_freefrag != NULL) {
	1977	if (delay)
	1978	WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
	1979	&adp->ad_freefrag->ff_list);
	1980	else
	1981	add_to_worklist(&adp->ad_freefrag->ff_list);
	1982	}
	1983	WORKITEM_FREE(adp, D_ALLOCDIRECT);
	1984	}
	1985
	1986	/*
	1987	* Prepare an inode to be freed. The actual free operation is not
	1988	* done until the zero'ed inode has been written to disk.
	1989	*/
	1990	void
	1991	softdep_freefile(pvp, ino, mode)
	1992	struct vnode *pvp;
	1993	ino_t ino;
	1994	int mode;
	1995	{
	1996	struct inode *ip = VTOI(pvp);
	1997	struct inodedep *inodedep;
	1998	struct freefile *freefile;
	1999
	2000	/*
	2001	* This sets up the inode de-allocation dependency.
	2002	*/
	2003	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
	2004	M_FREEFILE, M_SOFTDEP_FLAGS);
	2005	freefile->fx_list.wk_type = D_FREEFILE;
	2006	freefile->fx_list.wk_state = 0;
	2007	freefile->fx_mode = mode;
	2008	freefile->fx_oldinum = ino;
	2009	freefile->fx_devvp = ip->i_devvp;
	2010	freefile->fx_fs = ip->i_fs;
	2011
	2012	/*
	2013	* If the inodedep does not exist, then the zero'ed inode has
	2014	* been written to disk. If the allocated inode has never been
	2015	* written to disk, then the on-disk inode is zero'ed. In either
	2016	* case we can free the file immediately.
	2017	*/
	2018	ACQUIRE_LOCK(&lk);
	2019	if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 \|\|
	2020	check_inode_unwritten(inodedep)) {
	2021	FREE_LOCK(&lk);
	2022	handle_workitem_freefile(freefile);
	2023	return;
	2024	}
	2025	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
	2026	FREE_LOCK(&lk);
	2027	}
	2028
	2029	/*
	2030	* Check to see if an inode has never been written to disk. If
	2031	* so free the inodedep and return success, otherwise return failure.
	2032	* This routine must be called with splbio interrupts blocked.
	2033	*
	2034	* If we still have a bitmap dependency, then the inode has never
	2035	* been written to disk. Drop the dependency as it is no longer
	2036	* necessary since the inode is being deallocated. We set the
	2037	* ALLCOMPLETE flags since the bitmap now properly shows that the
	2038	* inode is not allocated. Even if the inode is actively being
	2039	* written, it has been rolled back to its zero'ed state, so we
	2040	* are ensured that a zero inode is what is on the disk. For short
	2041	* lived files, this change will usually result in removing all the
	2042	* dependencies from the inode so that it can be freed immediately.
	2043	*/
	2044	static int
	2045	check_inode_unwritten(inodedep)
	2046	struct inodedep *inodedep;
	2047	{
	2048
	2049	if ((inodedep->id_state & DEPCOMPLETE) != 0 \|\|
	2050	LIST_FIRST(&inodedep->id_pendinghd) != NULL \|\|
	2051	LIST_FIRST(&inodedep->id_bufwait) != NULL \|\|
	2052	LIST_FIRST(&inodedep->id_inowait) != NULL \|\|
	2053	TAILQ_FIRST(&inodedep->id_inoupdt) != NULL \|\|
	2054	TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL \|\|
	2055	inodedep->id_nlinkdelta != 0)
	2056	return (0);
	2057	inodedep->id_state \|= ALLCOMPLETE;
	2058	LIST_REMOVE(inodedep, id_deps);
	2059	inodedep->id_buf = NULL;
	2060	if (inodedep->id_state & ONWORKLIST)
	2061	WORKLIST_REMOVE(&inodedep->id_list);
	2062	if (inodedep->id_savedino != NULL) {
	2063	FREE(inodedep->id_savedino, M_INODEDEP);
	2064	inodedep->id_savedino = NULL;
	2065	}
	2066	if (free_inodedep(inodedep) == 0) {
	2067	FREE_LOCK(&lk);
	2068	panic("check_inode_unwritten: busy inode");
	2069	}
	2070	return (1);
	2071	}
	2072
	2073	/*
	2074	* Try to free an inodedep structure. Return 1 if it could be freed.
	2075	*/
	2076	static int
	2077	free_inodedep(inodedep)
	2078	struct inodedep *inodedep;
	2079	{
	2080
	2081	if ((inodedep->id_state & ONWORKLIST) != 0 \|\|
	2082	(inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE \|\|
	2083	LIST_FIRST(&inodedep->id_pendinghd) != NULL \|\|
	2084	LIST_FIRST(&inodedep->id_bufwait) != NULL \|\|
	2085	LIST_FIRST(&inodedep->id_inowait) != NULL \|\|
	2086	TAILQ_FIRST(&inodedep->id_inoupdt) != NULL \|\|
	2087	TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL \|\|
	2088	inodedep->id_nlinkdelta != 0 \|\| inodedep->id_savedino != NULL)
	2089	return (0);
	2090	LIST_REMOVE(inodedep, id_hash);
	2091	WORKITEM_FREE(inodedep, D_INODEDEP);
	2092	num_inodedep -= 1;
	2093	return (1);
	2094	}
	2095
	2096	/*
	2097	* This workitem routine performs the block de-allocation.
	2098	* The workitem is added to the pending list after the updated
	2099	* inode block has been written to disk. As mentioned above,
	2100	* checks regarding the number of blocks de-allocated (compared
	2101	* to the number of blocks allocated for the file) are also
	2102	* performed in this function.
	2103	*/
	2104	static void
	2105	handle_workitem_freeblocks(freeblks)
	2106	struct freeblks *freeblks;
	2107	{
	2108	struct inode tip;
	2109	ufs_daddr_t bn;
	2110	struct fs *fs;
	2111	int i, level, bsize;
	2112	long nblocks, blocksreleased = 0;
	2113	int error, allerror = 0;
	2114	ufs_lbn_t baselbns[NIADDR], tmpval;
	2115
	2116	tip.i_number = freeblks->fb_previousinum;
	2117	tip.i_devvp = freeblks->fb_devvp;
	2118	tip.i_dev = freeblks->fb_devvp->v_rdev;
	2119	tip.i_fs = freeblks->fb_fs;
	2120	tip.i_size = freeblks->fb_oldsize;
	2121	tip.i_uid = freeblks->fb_uid;
	2122	fs = freeblks->fb_fs;
	2123	tmpval = 1;
	2124	baselbns[0] = NDADDR;
	2125	for (i = 1; i < NIADDR; i++) {
	2126	tmpval *= NINDIR(fs);
	2127	baselbns[i] = baselbns[i - 1] + tmpval;
	2128	}
	2129	nblocks = btodb(fs->fs_bsize);
	2130	blocksreleased = 0;
	2131	/*
	2132	* Indirect blocks first.
	2133	*/
	2134	for (level = (NIADDR - 1); level >= 0; level--) {
	2135	if ((bn = freeblks->fb_iblks[level]) == 0)
	2136	continue;
	2137	if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
	2138	baselbns[level], &blocksreleased)) == 0)
	2139	allerror = error;
	2140	ffs_blkfree(&tip, bn, fs->fs_bsize);
	2141	blocksreleased += nblocks;
	2142	}
	2143	/*
	2144	* All direct blocks or frags.
	2145	*/
	2146	for (i = (NDADDR - 1); i >= 0; i--) {
	2147	if ((bn = freeblks->fb_dblks[i]) == 0)
	2148	continue;
	2149	bsize = blksize(fs, &tip, i);
	2150	ffs_blkfree(&tip, bn, bsize);
	2151	blocksreleased += btodb(bsize);
	2152	}
	2153
	2154	#ifdef DIAGNOSTIC
	2155	if (freeblks->fb_chkcnt != blocksreleased)
	2156	printf("handle_workitem_freeblocks: block count\n");
	2157	if (allerror)
	2158	softdep_error("handle_workitem_freeblks", allerror);
	2159	#endif /* DIAGNOSTIC */
	2160	WORKITEM_FREE(freeblks, D_FREEBLKS);
	2161	}
	2162
	2163	/*
	2164	* Release blocks associated with the inode ip and stored in the indirect
	2165	* block dbn. If level is greater than SINGLE, the block is an indirect block
	2166	* and recursive calls to indirtrunc must be used to cleanse other indirect
	2167	* blocks.
	2168	*/
	2169	static int
	2170	indir_trunc(ip, dbn, level, lbn, countp)
	2171	struct inode *ip;
	2172	ufs_daddr_t dbn;
	2173	int level;
	2174	ufs_lbn_t lbn;
	2175	long *countp;
	2176	{
	2177	struct buf *bp;
	2178	ufs_daddr_t *bap;
	2179	ufs_daddr_t nb;
	2180	struct fs *fs;
	2181	struct worklist *wk;
	2182	struct indirdep *indirdep;
	2183	int i, lbnadd, nblocks;
	2184	int error, allerror = 0;
	2185
	2186	fs = ip->i_fs;
	2187	lbnadd = 1;
	2188	for (i = level; i > 0; i--)
	2189	lbnadd *= NINDIR(fs);
	2190	/*
	2191	* Get buffer of block pointers to be freed. This routine is not
	2192	* called until the zero'ed inode has been written, so it is safe
	2193	* to free blocks as they are encountered. Because the inode has
	2194	* been zero'ed, calls to bmap on these blocks will fail. So, we
	2195	* have to use the on-disk address and the block device for the
	2196	* filesystem to look them up. If the file was deleted before its
	2197	* indirect blocks were all written to disk, the routine that set
	2198	* us up (deallocate_dependencies) will have arranged to leave
	2199	* a complete copy of the indirect block in memory for our use.
	2200	* Otherwise we have to read the blocks in from the disk.
	2201	*/
	2202	ACQUIRE_LOCK(&lk);
	2203	if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
	2204	(wk = LIST_FIRST(&bp->b_dep)) != NULL) {
	2205	if (wk->wk_type != D_INDIRDEP \|\|
	2206	(indirdep = WK_INDIRDEP(wk))->ir_savebp != bp \|\|
	2207	(indirdep->ir_state & GOINGAWAY) == 0) {
	2208	FREE_LOCK(&lk);
	2209	panic("indir_trunc: lost indirdep");
	2210	}
	2211	WORKLIST_REMOVE(wk);
	2212	WORKITEM_FREE(indirdep, D_INDIRDEP);
	2213	if (LIST_FIRST(&bp->b_dep) != NULL) {
	2214	FREE_LOCK(&lk);
	2215	panic("indir_trunc: dangling dep");
	2216	}
	2217	FREE_LOCK(&lk);
	2218	} else {
	2219	FREE_LOCK(&lk);
	2220	error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
	2221	if (error)
	2222	return (error);
	2223	}
	2224	/*
	2225	* Recursively free indirect blocks.
	2226	*/
	2227	bap = (ufs_daddr_t *)bp->b_data;
	2228	nblocks = btodb(fs->fs_bsize);
	2229	for (i = NINDIR(fs) - 1; i >= 0; i--) {
	2230	if ((nb = bap[i]) == 0)
	2231	continue;
	2232	if (level != 0) {
	2233	if ((error = indir_trunc(ip, fsbtodb(fs, nb),
	2234	level - 1, lbn + (i * lbnadd), countp)) != 0)
	2235	allerror = error;
	2236	}
	2237	ffs_blkfree(ip, nb, fs->fs_bsize);
	2238	*countp += nblocks;
	2239	}
	2240	bp->b_flags \|= B_INVAL \| B_NOCACHE;
	2241	brelse(bp);
	2242	return (allerror);
	2243	}
	2244
	2245	/*
	2246	* Free an allocindir.
	2247	* This routine must be called with splbio interrupts blocked.
	2248	*/
	2249	static void
	2250	free_allocindir(aip, inodedep)
	2251	struct allocindir *aip;
	2252	struct inodedep *inodedep;
	2253	{
	2254	struct freefrag *freefrag;
	2255
	2256	#ifdef DEBUG
	2257	if (lk.lkt_held == NOHOLDER)
	2258	panic("free_allocindir: lock not held");
	2259	#endif
	2260	if ((aip->ai_state & DEPCOMPLETE) == 0)
	2261	LIST_REMOVE(aip, ai_deps);
	2262	if (aip->ai_state & ONWORKLIST)
	2263	WORKLIST_REMOVE(&aip->ai_list);
	2264	LIST_REMOVE(aip, ai_next);
	2265	if ((freefrag = aip->ai_freefrag) != NULL) {
	2266	if (inodedep == NULL)
	2267	add_to_worklist(&freefrag->ff_list);
	2268	else
	2269	WORKLIST_INSERT(&inodedep->id_bufwait,
	2270	&freefrag->ff_list);
	2271	}
	2272	WORKITEM_FREE(aip, D_ALLOCINDIR);
	2273	}
	2274
	2275	/*
	2276	* Directory entry addition dependencies.
	2277	*
	2278	* When adding a new directory entry, the inode (with its incremented link
	2279	* count) must be written to disk before the directory entry's pointer to it.
	2280	* Also, if the inode is newly allocated, the corresponding freemap must be
	2281	* updated (on disk) before the directory entry's pointer. These requirements
	2282	* are met via undo/redo on the directory entry's pointer, which consists
	2283	* simply of the inode number.
	2284	*
	2285	* As directory entries are added and deleted, the free space within a
	2286	* directory block can become fragmented. The ufs file system will compact
	2287	* a fragmented directory block to make space for a new entry. When this
	2288	* occurs, the offsets of previously added entries change. Any "diradd"
	2289	* dependency structures corresponding to these entries must be updated with
	2290	* the new offsets.
	2291	*/
	2292
	2293	/*
	2294	* This routine is called after the in-memory inode's link
	2295	* count has been incremented, but before the directory entry's
	2296	* pointer to the inode has been set.
	2297	*/
	2298	void
	2299	softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
	2300	struct buf bp; / buffer containing directory block */
	2301	struct inode dp; / inode for directory */
	2302	off_t diroffset; /* offset of new entry in directory */
	2303	long newinum; /* inode referenced by new directory entry */
	2304	struct buf newdirbp; / non-NULL => contents of new mkdir */
	2305	{
	2306	int offset; /* offset of new entry within directory block */
	2307	ufs_lbn_t lbn; /* block in directory containing new entry */
	2308	struct fs *fs;
	2309	struct diradd *dap;
	2310	struct pagedep *pagedep;
	2311	struct inodedep *inodedep;
	2312	struct mkdir mkdir1, mkdir2;
	2313
	2314	/*
	2315	* Whiteouts have no dependencies.
	2316	*/
	2317	if (newinum == WINO) {
	2318	if (newdirbp != NULL)
	2319	bdwrite(newdirbp);
	2320	return;
	2321	}
	2322
	2323	fs = dp->i_fs;
	2324	lbn = lblkno(fs, diroffset);
	2325	offset = blkoff(fs, diroffset);
	2326	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
	2327	M_SOFTDEP_FLAGS);
	2328	bzero(dap, sizeof(struct diradd));
	2329	dap->da_list.wk_type = D_DIRADD;
	2330	dap->da_offset = offset;
	2331	dap->da_newinum = newinum;
	2332	dap->da_state = ATTACHED;
	2333	if (newdirbp == NULL) {
	2334	dap->da_state \|= DEPCOMPLETE;
	2335	ACQUIRE_LOCK(&lk);
	2336	} else {
	2337	dap->da_state \|= MKDIR_BODY \| MKDIR_PARENT;
	2338	MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
	2339	M_SOFTDEP_FLAGS);
	2340	mkdir1->md_list.wk_type = D_MKDIR;
	2341	mkdir1->md_state = MKDIR_BODY;
	2342	mkdir1->md_diradd = dap;
	2343	MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
	2344	M_SOFTDEP_FLAGS);
	2345	mkdir2->md_list.wk_type = D_MKDIR;
	2346	mkdir2->md_state = MKDIR_PARENT;
	2347	mkdir2->md_diradd = dap;
	2348	/*
	2349	* Dependency on "." and ".." being written to disk.
	2350	*/
	2351	mkdir1->md_buf = newdirbp;
	2352	ACQUIRE_LOCK(&lk);
	2353	LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
	2354	WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
	2355	FREE_LOCK(&lk);
	2356	bdwrite(newdirbp);
	2357	/*
	2358	* Dependency on link count increase for parent directory
	2359	*/
	2360	ACQUIRE_LOCK(&lk);
	2361	if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
	2362	\|\| (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
	2363	dap->da_state &= ~MKDIR_PARENT;
	2364	WORKITEM_FREE(mkdir2, D_MKDIR);
	2365	} else {
	2366	LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
	2367	WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
	2368	}
	2369	}
	2370	/*
	2371	* Link into parent directory pagedep to await its being written.
	2372	*/
	2373	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
	2374	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	2375	dap->da_pagedep = pagedep;
	2376	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
	2377	da_pdlist);
	2378	/*
	2379	* Link into its inodedep. Put it on the id_bufwait list if the inode
	2380	* is not yet written. If it is written, do the post-inode write
	2381	* processing to put it on the id_pendinghd list.
	2382	*/
	2383	(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
	2384	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
	2385	diradd_inode_written(dap, inodedep);
	2386	else
	2387	WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	2388	FREE_LOCK(&lk);
	2389	}
	2390
	2391	/*
	2392	* This procedure is called to change the offset of a directory
	2393	* entry when compacting a directory block which must be owned
	2394	* exclusively by the caller. Note that the actual entry movement
	2395	* must be done in this procedure to ensure that no I/O completions
	2396	* occur while the move is in progress.
	2397	*/
	2398	void
	2399	softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
	2400	struct inode dp; / inode for directory */
	2401	caddr_t base; /* address of dp->i_offset */
	2402	caddr_t oldloc; /* address of old directory location */
	2403	caddr_t newloc; /* address of new directory location */
	2404	int entrysize; /* size of directory entry */
	2405	{
	2406	int offset, oldoffset, newoffset;
	2407	struct pagedep *pagedep;
	2408	struct diradd *dap;
	2409	ufs_lbn_t lbn;
	2410
	2411	ACQUIRE_LOCK(&lk);
	2412	lbn = lblkno(dp->i_fs, dp->i_offset);
	2413	offset = blkoff(dp->i_fs, dp->i_offset);
	2414	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
	2415	goto done;
	2416	oldoffset = offset + (oldloc - base);
	2417	newoffset = offset + (newloc - base);
	2418
	2419	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
	2420	if (dap->da_offset != oldoffset)
	2421	continue;
	2422	dap->da_offset = newoffset;
	2423	if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
	2424	break;
	2425	LIST_REMOVE(dap, da_pdlist);
	2426	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
	2427	dap, da_pdlist);
	2428	break;
	2429	}
	2430	if (dap == NULL) {
	2431
	2432	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
	2433	if (dap->da_offset == oldoffset) {
	2434	dap->da_offset = newoffset;
	2435	break;
	2436	}
	2437	}
	2438	}
	2439	done:
	2440	bcopy(oldloc, newloc, entrysize);
	2441	FREE_LOCK(&lk);
	2442	}
	2443
	2444	/*
	2445	* Free a diradd dependency structure. This routine must be called
	2446	* with splbio interrupts blocked.
	2447	*/
	2448	static void
	2449	free_diradd(dap)
	2450	struct diradd *dap;
	2451	{
	2452	struct dirrem *dirrem;
	2453	struct pagedep *pagedep;
	2454	struct inodedep *inodedep;
	2455	struct mkdir mkdir, nextmd;
	2456
	2457	#ifdef DEBUG
	2458	if (lk.lkt_held == NOHOLDER)
	2459	panic("free_diradd: lock not held");
	2460	#endif
	2461	WORKLIST_REMOVE(&dap->da_list);
	2462	LIST_REMOVE(dap, da_pdlist);
	2463	if ((dap->da_state & DIRCHG) == 0) {
	2464	pagedep = dap->da_pagedep;
	2465	} else {
	2466	dirrem = dap->da_previous;
	2467	pagedep = dirrem->dm_pagedep;
	2468	dirrem->dm_dirinum = pagedep->pd_ino;
	2469	add_to_worklist(&dirrem->dm_list);
	2470	}
	2471	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
	2472	0, &inodedep) != 0)
	2473	(void) free_inodedep(inodedep);
	2474	if ((dap->da_state & (MKDIR_PARENT \| MKDIR_BODY)) != 0) {
	2475	for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
	2476	nextmd = LIST_NEXT(mkdir, md_mkdirs);
	2477	if (mkdir->md_diradd != dap)
	2478	continue;
	2479	dap->da_state &= ~mkdir->md_state;
	2480	WORKLIST_REMOVE(&mkdir->md_list);
	2481	LIST_REMOVE(mkdir, md_mkdirs);
	2482	WORKITEM_FREE(mkdir, D_MKDIR);
	2483	}
	2484	if ((dap->da_state & (MKDIR_PARENT \| MKDIR_BODY)) != 0) {
	2485	FREE_LOCK(&lk);
	2486	panic("free_diradd: unfound ref");
	2487	}
	2488	}
	2489	WORKITEM_FREE(dap, D_DIRADD);
	2490	}
	2491
	2492	/*
	2493	* Directory entry removal dependencies.
	2494	*
	2495	* When removing a directory entry, the entry's inode pointer must be
	2496	* zero'ed on disk before the corresponding inode's link count is decremented
	2497	* (possibly freeing the inode for re-use). This dependency is handled by
	2498	* updating the directory entry but delaying the inode count reduction until
	2499	* after the directory block has been written to disk. After this point, the
	2500	* inode count can be decremented whenever it is convenient.
	2501	*/
	2502
	2503	/*
	2504	* This routine should be called immediately after removing
	2505	* a directory entry. The inode's link count should not be
	2506	* decremented by the calling procedure -- the soft updates
	2507	* code will do this task when it is safe.
	2508	*/
	2509	void
	2510	softdep_setup_remove(bp, dp, ip, isrmdir)
	2511	struct buf bp; / buffer containing directory block */
	2512	struct inode dp; / inode for the directory being modified */
	2513	struct inode ip; / inode for directory entry being removed */
	2514	int isrmdir; /* indicates if doing RMDIR */
	2515	{
	2516	struct dirrem dirrem, prevdirrem;
	2517
	2518	/*
	2519	* Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
	2520	*/
	2521	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
	2522
	2523	/*
	2524	* If the COMPLETE flag is clear, then there were no active
	2525	* entries and we want to roll back to a zeroed entry until
	2526	* the new inode is committed to disk. If the COMPLETE flag is
	2527	* set then we have deleted an entry that never made it to
	2528	* disk. If the entry we deleted resulted from a name change,
	2529	* then the old name still resides on disk. We cannot delete
	2530	* its inode (returned to us in prevdirrem) until the zeroed
	2531	* directory entry gets to disk. The new inode has never been
	2532	* referenced on the disk, so can be deleted immediately.
	2533	*/
	2534	if ((dirrem->dm_state & COMPLETE) == 0) {
	2535	LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
	2536	dm_next);
	2537	FREE_LOCK(&lk);
	2538	} else {
	2539	if (prevdirrem != NULL)
	2540	LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
	2541	prevdirrem, dm_next);
	2542	dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
	2543	FREE_LOCK(&lk);
	2544	handle_workitem_remove(dirrem);
	2545	}
	2546	}
	2547
	2548	/*
	2549	* Allocate a new dirrem if appropriate and return it along with
	2550	* its associated pagedep. Called without a lock, returns with lock.
	2551	*/
	2552	static long num_dirrem; /* number of dirrem allocated */
	2553	static struct dirrem *
	2554	newdirrem(bp, dp, ip, isrmdir, prevdirremp)
	2555	struct buf bp; / buffer containing directory block */
	2556	struct inode dp; / inode for the directory being modified */
	2557	struct inode ip; / inode for directory entry being removed */
	2558	int isrmdir; /* indicates if doing RMDIR */
	2559	struct dirrem *prevdirremp; / previously referenced inode, if any */
	2560	{
	2561	int offset;
	2562	ufs_lbn_t lbn;
	2563	struct diradd *dap;
	2564	struct dirrem *dirrem;
	2565	struct pagedep *pagedep;
	2566
	2567	/*
	2568	* Whiteouts have no deletion dependencies.
	2569	*/
	2570	if (ip == NULL)
	2571	panic("newdirrem: whiteout");
	2572	/*
	2573	* If we are over our limit, try to improve the situation.
	2574	* Limiting the number of dirrem structures will also limit
	2575	* the number of freefile and freeblks structures.
	2576	*/
	2577	if (num_dirrem > max_softdeps / 2 && speedup_syncer() == 0)
	2578	(void) request_cleanup(FLUSH_REMOVE, 0);
	2579	num_dirrem += 1;
	2580	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
	2581	M_DIRREM, M_SOFTDEP_FLAGS);
	2582	bzero(dirrem, sizeof(struct dirrem));
	2583	dirrem->dm_list.wk_type = D_DIRREM;
	2584	dirrem->dm_state = isrmdir ? RMDIR : 0;
	2585	dirrem->dm_mnt = ITOV(ip)->v_mount;
	2586	dirrem->dm_oldinum = ip->i_number;
	2587	*prevdirremp = NULL;
	2588
	2589	ACQUIRE_LOCK(&lk);
	2590	lbn = lblkno(dp->i_fs, dp->i_offset);
	2591	offset = blkoff(dp->i_fs, dp->i_offset);
	2592	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
	2593	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	2594	dirrem->dm_pagedep = pagedep;
	2595	/*
	2596	* Check for a diradd dependency for the same directory entry.
	2597	* If present, then both dependencies become obsolete and can
	2598	* be de-allocated. Check for an entry on both the pd_dirraddhd
	2599	* list and the pd_pendinghd list.
	2600	*/
	2601
	2602	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
	2603	if (dap->da_offset == offset)
	2604	break;
	2605	if (dap == NULL) {
	2606
	2607	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
	2608	if (dap->da_offset == offset)
	2609	break;
	2610	if (dap == NULL)
	2611	return (dirrem);
	2612	}
	2613	/*
	2614	* Must be ATTACHED at this point.
	2615	*/
	2616	if ((dap->da_state & ATTACHED) == 0) {
	2617	FREE_LOCK(&lk);
	2618	panic("newdirrem: not ATTACHED");
	2619	}
	2620	if (dap->da_newinum != ip->i_number) {
	2621	FREE_LOCK(&lk);
	2622	panic("newdirrem: inum %d should be %d",
	2623	ip->i_number, dap->da_newinum);
	2624	}
	2625	/*
	2626	* If we are deleting a changed name that never made it to disk,
	2627	* then return the dirrem describing the previous inode (which
	2628	* represents the inode currently referenced from this entry on disk).
	2629	*/
	2630	if ((dap->da_state & DIRCHG) != 0) {
	2631	*prevdirremp = dap->da_previous;
	2632	dap->da_state &= ~DIRCHG;
	2633	dap->da_pagedep = pagedep;
	2634	}
	2635	/*
	2636	* We are deleting an entry that never made it to disk.
	2637	* Mark it COMPLETE so we can delete its inode immediately.
	2638	*/
	2639	dirrem->dm_state \|= COMPLETE;
	2640	free_diradd(dap);
	2641	return (dirrem);
	2642	}
	2643
	2644	/*
	2645	* Directory entry change dependencies.
	2646	*
	2647	* Changing an existing directory entry requires that an add operation
	2648	* be completed first followed by a deletion. The semantics for the addition
	2649	* are identical to the description of adding a new entry above except
	2650	* that the rollback is to the old inode number rather than zero. Once
	2651	* the addition dependency is completed, the removal is done as described
	2652	* in the removal routine above.
	2653	*/
	2654
	2655	/*
	2656	* This routine should be called immediately after changing
	2657	* a directory entry. The inode's link count should not be
	2658	* decremented by the calling procedure -- the soft updates
	2659	* code will perform this task when it is safe.
	2660	*/
	2661	void
	2662	softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
	2663	struct buf bp; / buffer containing directory block */
	2664	struct inode dp; / inode for the directory being modified */
	2665	struct inode ip; / inode for directory entry being removed */
	2666	long newinum; /* new inode number for changed entry */
	2667	int isrmdir; /* indicates if doing RMDIR */
	2668	{
	2669	int offset;
	2670	struct diradd *dap = NULL;
	2671	struct dirrem dirrem, prevdirrem;
	2672	struct pagedep *pagedep;
	2673	struct inodedep *inodedep;
	2674
	2675	offset = blkoff(dp->i_fs, dp->i_offset);
	2676
	2677	/*
	2678	* Whiteouts do not need diradd dependencies.
	2679	*/
	2680	if (newinum != WINO) {
	2681	MALLOC(dap, struct diradd *, sizeof(struct diradd),
	2682	M_DIRADD, M_SOFTDEP_FLAGS);
	2683	bzero(dap, sizeof(struct diradd));
	2684	dap->da_list.wk_type = D_DIRADD;
	2685	dap->da_state = DIRCHG \| ATTACHED \| DEPCOMPLETE;
	2686	dap->da_offset = offset;
	2687	dap->da_newinum = newinum;
	2688	}
	2689
	2690	/*
	2691	* Allocate a new dirrem and ACQUIRE_LOCK.
	2692	*/
	2693	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
	2694	pagedep = dirrem->dm_pagedep;
	2695	/*
	2696	* The possible values for isrmdir:
	2697	* 0 - non-directory file rename
	2698	* 1 - directory rename within same directory
	2699	* inum - directory rename to new directory of given inode number
	2700	* When renaming to a new directory, we are both deleting and
	2701	* creating a new directory entry, so the link count on the new
	2702	* directory should not change. Thus we do not need the followup
	2703	* dirrem which is usually done in handle_workitem_remove. We set
	2704	* the DIRCHG flag to tell handle_workitem_remove to skip the
	2705	* followup dirrem.
	2706	*/
	2707	if (isrmdir > 1)
	2708	dirrem->dm_state \|= DIRCHG;
	2709
	2710	/*
	2711	* Whiteouts have no additional dependencies,
	2712	* so just put the dirrem on the correct list.
	2713	*/
	2714	if (newinum == WINO) {
	2715	if ((dirrem->dm_state & COMPLETE) == 0) {
	2716	LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
	2717	dm_next);
	2718	} else {
	2719	dirrem->dm_dirinum = pagedep->pd_ino;
	2720	add_to_worklist(&dirrem->dm_list);
	2721	}
	2722	FREE_LOCK(&lk);
	2723	return;
	2724	}
	2725
	2726	/*
	2727	* If the COMPLETE flag is clear, then there were no active
	2728	* entries and we want to roll back to the previous inode until
	2729	* the new inode is committed to disk. If the COMPLETE flag is
	2730	* set, then we have deleted an entry that never made it to disk.
	2731	* If the entry we deleted resulted from a name change, then the old
	2732	* inode reference still resides on disk. Any rollback that we do
	2733	* needs to be to that old inode (returned to us in prevdirrem). If
	2734	* the entry we deleted resulted from a create, then there is
	2735	* no entry on the disk, so we want to roll back to zero rather
	2736	* than the uncommitted inode. In either of the COMPLETE cases we
	2737	* want to immediately free the unwritten and unreferenced inode.
	2738	*/
	2739	if ((dirrem->dm_state & COMPLETE) == 0) {
	2740	dap->da_previous = dirrem;
	2741	} else {
	2742	if (prevdirrem != NULL) {
	2743	dap->da_previous = prevdirrem;
	2744	} else {
	2745	dap->da_state &= ~DIRCHG;
	2746	dap->da_pagedep = pagedep;
	2747	}
	2748	dirrem->dm_dirinum = pagedep->pd_ino;
	2749	add_to_worklist(&dirrem->dm_list);
	2750	}
	2751	/*
	2752	* Link into its inodedep. Put it on the id_bufwait list if the inode
	2753	* is not yet written. If it is written, do the post-inode write
	2754	* processing to put it on the id_pendinghd list.
	2755	*/
	2756	if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 \|\|
	2757	(inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
	2758	dap->da_state \|= COMPLETE;
	2759	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	2760	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
	2761	} else {
	2762	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
	2763	dap, da_pdlist);
	2764	WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	2765	}
	2766	FREE_LOCK(&lk);
	2767	}
	2768
	2769	/*
	2770	* Called whenever the link count on an inode is changed.
	2771	* It creates an inode dependency so that the new reference(s)
	2772	* to the inode cannot be committed to disk until the updated
	2773	* inode has been written.
	2774	*/
	2775	void
	2776	softdep_change_linkcnt(ip)
	2777	struct inode ip; / the inode with the increased link count */
	2778	{
	2779	struct inodedep *inodedep;
	2780
	2781	ACQUIRE_LOCK(&lk);
	2782	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
	2783	if (ip->i_nlink < ip->i_effnlink) {
	2784	FREE_LOCK(&lk);
	2785	panic("softdep_change_linkcnt: bad delta");
	2786	}
	2787	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	2788	FREE_LOCK(&lk);
	2789	}
	2790
	2791	/*
	2792	* This workitem decrements the inode's link count.
	2793	* If the link count reaches zero, the file is removed.
	2794	*/
	2795	static void
	2796	handle_workitem_remove(dirrem)
	2797	struct dirrem *dirrem;
	2798	{
	2799	struct thread td = curthread; / XXX */
	2800	struct ucred *cred;
	2801	struct inodedep *inodedep;
	2802	struct vnode *vp;
	2803	struct inode *ip;
	2804	ino_t oldinum;
	2805	int error;
	2806
	2807	KKASSERT(td->td_proc);
	2808	cred = td->td_proc->p_ucred;
	2809	if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
	2810	softdep_error("handle_workitem_remove: vget", error);
	2811	return;
	2812	}
	2813	ip = VTOI(vp);
	2814	ACQUIRE_LOCK(&lk);
	2815	if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0){
	2816	FREE_LOCK(&lk);
	2817	panic("handle_workitem_remove: lost inodedep");
	2818	}
	2819	/*
	2820	* Normal file deletion.
	2821	*/
	2822	if ((dirrem->dm_state & RMDIR) == 0) {
	2823	ip->i_nlink--;
	2824	ip->i_flag \|= IN_CHANGE;
	2825	if (ip->i_nlink < ip->i_effnlink) {
	2826	FREE_LOCK(&lk);
	2827	panic("handle_workitem_remove: bad file delta");
	2828	}
	2829	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	2830	FREE_LOCK(&lk);
	2831	vput(vp);
	2832	num_dirrem -= 1;
	2833	WORKITEM_FREE(dirrem, D_DIRREM);
	2834	return;
	2835	}
	2836	/*
	2837	* Directory deletion. Decrement reference count for both the
	2838	* just deleted parent directory entry and the reference for ".".
	2839	* Next truncate the directory to length zero. When the
	2840	* truncation completes, arrange to have the reference count on
	2841	* the parent decremented to account for the loss of "..".
	2842	*/
	2843	ip->i_nlink -= 2;
	2844	ip->i_flag \|= IN_CHANGE;
	2845	if (ip->i_nlink < ip->i_effnlink) {
	2846	FREE_LOCK(&lk);
	2847	panic("handle_workitem_remove: bad dir delta");
	2848	}
	2849	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	2850	FREE_LOCK(&lk);
	2851	if ((error = UFS_TRUNCATE(vp, (off_t)0, 0,cred, td)) != 0)
	2852	softdep_error("handle_workitem_remove: truncate", error);
	2853	/*
	2854	* Rename a directory to a new parent. Since, we are both deleting
	2855	* and creating a new directory entry, the link count on the new
	2856	* directory should not change. Thus we skip the followup dirrem.
	2857	*/
	2858	if (dirrem->dm_state & DIRCHG) {
	2859	vput(vp);
	2860	num_dirrem -= 1;
	2861	WORKITEM_FREE(dirrem, D_DIRREM);
	2862	return;
	2863	}
	2864	/*
	2865	* If the inodedep does not exist, then the zero'ed inode has
	2866	* been written to disk. If the allocated inode has never been
	2867	* written to disk, then the on-disk inode is zero'ed. In either
	2868	* case we can remove the file immediately.
	2869	*/
	2870	ACQUIRE_LOCK(&lk);
	2871	dirrem->dm_state = 0;
	2872	oldinum = dirrem->dm_oldinum;
	2873	dirrem->dm_oldinum = dirrem->dm_dirinum;
	2874	if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 \|\|
	2875	check_inode_unwritten(inodedep)) {
	2876	FREE_LOCK(&lk);
	2877	vput(vp);
	2878	handle_workitem_remove(dirrem);
	2879	return;
	2880	}
	2881	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
	2882	FREE_LOCK(&lk);
	2883	vput(vp);
	2884	}
	2885
	2886	/*
	2887	* Inode de-allocation dependencies.
	2888	*
	2889	* When an inode's link count is reduced to zero, it can be de-allocated. We
	2890	* found it convenient to postpone de-allocation until after the inode is
	2891	* written to disk with its new link count (zero). At this point, all of the
	2892	* on-disk inode's block pointers are nullified and, with careful dependency
	2893	* list ordering, all dependencies related to the inode will be satisfied and
	2894	* the corresponding dependency structures de-allocated. So, if/when the
	2895	* inode is reused, there will be no mixing of old dependencies with new
	2896	* ones. This artificial dependency is set up by the block de-allocation
	2897	* procedure above (softdep_setup_freeblocks) and completed by the
	2898	* following procedure.
	2899	*/
	2900	static void
	2901	handle_workitem_freefile(freefile)
	2902	struct freefile *freefile;
	2903	{
	2904	struct vnode vp;
	2905	struct inode tip;
	2906	struct inodedep *idp;
	2907	int error;
	2908
	2909	#ifdef DEBUG
	2910	ACQUIRE_LOCK(&lk);
	2911	error = inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp);
	2912	FREE_LOCK(&lk);
	2913	if (error)
	2914	panic("handle_workitem_freefile: inodedep survived");
	2915	#endif
	2916	tip.i_devvp = freefile->fx_devvp;
	2917	tip.i_dev = freefile->fx_devvp->v_rdev;
	2918	tip.i_fs = freefile->fx_fs;
	2919	vp.v_data = &tip;
	2920	if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
	2921	softdep_error("handle_workitem_freefile", error);
	2922	WORKITEM_FREE(freefile, D_FREEFILE);
	2923	}
	2924
	2925	/*
	2926	* Disk writes.
	2927	*
	2928	* The dependency structures constructed above are most actively used when file
	2929	* system blocks are written to disk. No constraints are placed on when a
	2930	* block can be written, but unsatisfied update dependencies are made safe by
	2931	* modifying (or replacing) the source memory for the duration of the disk
	2932	* write. When the disk write completes, the memory block is again brought
	2933	* up-to-date.
	2934	*
	2935	* In-core inode structure reclamation.
	2936	*
	2937	* Because there are a finite number of "in-core" inode structures, they are
	2938	* reused regularly. By transferring all inode-related dependencies to the
	2939	* in-memory inode block and indexing them separately (via "inodedep"s), we
	2940	* can allow "in-core" inode structures to be reused at any time and avoid
	2941	* any increase in contention.
	2942	*
	2943	* Called just before entering the device driver to initiate a new disk I/O.
	2944	* The buffer must be locked, thus, no I/O completion operations can occur
	2945	* while we are manipulating its associated dependencies.
	2946	*/
	2947	static void
	2948	softdep_disk_io_initiation(bp)
	2949	struct buf bp; / structure describing disk write to occur */
	2950	{
	2951	struct worklist wk, nextwk;
	2952	struct indirdep *indirdep;
	2953
	2954	/*
	2955	* We only care about write operations. There should never
	2956	* be dependencies for reads.
	2957	*/
	2958	if (bp->b_flags & B_READ)
	2959	panic("softdep_disk_io_initiation: read");
	2960	/*
	2961	* Do any necessary pre-I/O processing.
	2962	*/
	2963	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
	2964	nextwk = LIST_NEXT(wk, wk_list);
	2965	switch (wk->wk_type) {
	2966
	2967	case D_PAGEDEP:
	2968	initiate_write_filepage(WK_PAGEDEP(wk), bp);
	2969	continue;
	2970
	2971	case D_INODEDEP:
	2972	initiate_write_inodeblock(WK_INODEDEP(wk), bp);
	2973	continue;
	2974
	2975	case D_INDIRDEP:
	2976	indirdep = WK_INDIRDEP(wk);
	2977	if (indirdep->ir_state & GOINGAWAY)
	2978	panic("disk_io_initiation: indirdep gone");
	2979	/*
	2980	* If there are no remaining dependencies, this
	2981	* will be writing the real pointers, so the
	2982	* dependency can be freed.
	2983	*/
	2984	if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
	2985	indirdep->ir_savebp->b_flags \|= B_INVAL \| B_NOCACHE;
	2986	brelse(indirdep->ir_savebp);
	2987	/* inline expand WORKLIST_REMOVE(wk); */
	2988	wk->wk_state &= ~ONWORKLIST;
	2989	LIST_REMOVE(wk, wk_list);
	2990	WORKITEM_FREE(indirdep, D_INDIRDEP);
	2991	continue;
	2992	}
	2993	/*
	2994	* Replace up-to-date version with safe version.
	2995	*/
	2996	MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
	2997	M_INDIRDEP, M_SOFTDEP_FLAGS);
	2998	ACQUIRE_LOCK(&lk);
	2999	indirdep->ir_state &= ~ATTACHED;
	3000	indirdep->ir_state \|= UNDONE;
	3001	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
	3002	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
	3003	bp->b_bcount);
	3004	FREE_LOCK(&lk);
	3005	continue;
	3006
	3007	case D_MKDIR:
	3008	case D_BMSAFEMAP:
	3009	case D_ALLOCDIRECT:
	3010	case D_ALLOCINDIR:
	3011	continue;
	3012
	3013	default:
	3014	panic("handle_disk_io_initiation: Unexpected type %s",
	3015	TYPENAME(wk->wk_type));
	3016	/* NOTREACHED */
	3017	}
	3018	}
	3019	}
	3020
	3021	/*
	3022	* Called from within the procedure above to deal with unsatisfied
	3023	* allocation dependencies in a directory. The buffer must be locked,
	3024	* thus, no I/O completion operations can occur while we are
	3025	* manipulating its associated dependencies.
	3026	*/
	3027	static void
	3028	initiate_write_filepage(pagedep, bp)
	3029	struct pagedep *pagedep;
	3030	struct buf *bp;
	3031	{
	3032	struct diradd *dap;
	3033	struct direct *ep;
	3034	int i;
	3035
	3036	if (pagedep->pd_state & IOSTARTED) {
	3037	/*
	3038	* This can only happen if there is a driver that does not
	3039	* understand chaining. Here biodone will reissue the call
	3040	* to strategy for the incomplete buffers.
	3041	*/
	3042	printf("initiate_write_filepage: already started\n");
	3043	return;
	3044	}
	3045	pagedep->pd_state \|= IOSTARTED;
	3046	ACQUIRE_LOCK(&lk);
	3047	for (i = 0; i < DAHASHSZ; i++) {
	3048	LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
	3049	ep = (struct direct *)
	3050	((char *)bp->b_data + dap->da_offset);
	3051	if (ep->d_ino != dap->da_newinum) {
	3052	FREE_LOCK(&lk);
	3053	panic("%s: dir inum %d != new %d",
	3054	"initiate_write_filepage",
	3055	ep->d_ino, dap->da_newinum);
	3056	}
	3057	if (dap->da_state & DIRCHG)
	3058	ep->d_ino = dap->da_previous->dm_oldinum;
	3059	else
	3060	ep->d_ino = 0;
	3061	dap->da_state &= ~ATTACHED;
	3062	dap->da_state \|= UNDONE;
	3063	}
	3064	}
	3065	FREE_LOCK(&lk);
	3066	}
	3067
	3068	/*
	3069	* Called from within the procedure above to deal with unsatisfied
	3070	* allocation dependencies in an inodeblock. The buffer must be
	3071	* locked, thus, no I/O completion operations can occur while we
	3072	* are manipulating its associated dependencies.
	3073	*/
	3074	static void
	3075	initiate_write_inodeblock(inodedep, bp)
	3076	struct inodedep *inodedep;
	3077	struct buf bp; / The inode block */
	3078	{
	3079	struct allocdirect adp, lastadp;
	3080	struct dinode *dp;
	3081	struct fs *fs;
	3082	ufs_lbn_t prevlbn = 0;
	3083	int i, deplist;
	3084
	3085	if (inodedep->id_state & IOSTARTED)
	3086	panic("initiate_write_inodeblock: already started");
	3087	inodedep->id_state \|= IOSTARTED;
	3088	fs = inodedep->id_fs;
	3089	dp = (struct dinode *)bp->b_data +
	3090	ino_to_fsbo(fs, inodedep->id_ino);
	3091	/*
	3092	* If the bitmap is not yet written, then the allocated
	3093	* inode cannot be written to disk.
	3094	*/
	3095	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
	3096	if (inodedep->id_savedino != NULL)
	3097	panic("initiate_write_inodeblock: already doing I/O");
	3098	MALLOC(inodedep->id_savedino, struct dinode *,
	3099	sizeof(struct dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
	3100	inodedep->id_savedino = dp;
	3101	bzero((caddr_t)dp, sizeof(struct dinode));
	3102	return;
	3103	}
	3104	/*
	3105	* If no dependencies, then there is nothing to roll back.
	3106	*/
	3107	inodedep->id_savedsize = dp->di_size;
	3108	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
	3109	return;
	3110	/*
	3111	* Set the dependencies to busy.
	3112	*/
	3113	ACQUIRE_LOCK(&lk);
	3114	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	3115	adp = TAILQ_NEXT(adp, ad_next)) {
	3116	#ifdef DIAGNOSTIC
	3117	if (deplist != 0 && prevlbn >= adp->ad_lbn) {
	3118	FREE_LOCK(&lk);
	3119	panic("softdep_write_inodeblock: lbn order");
	3120	}
	3121	prevlbn = adp->ad_lbn;
	3122	if (adp->ad_lbn < NDADDR &&
	3123	dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
	3124	FREE_LOCK(&lk);
	3125	panic("%s: direct pointer #%ld mismatch %d != %d",
	3126	"softdep_write_inodeblock", adp->ad_lbn,
	3127	dp->di_db[adp->ad_lbn], adp->ad_newblkno);
	3128	}
	3129	if (adp->ad_lbn >= NDADDR &&
	3130	dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
	3131	FREE_LOCK(&lk);
	3132	panic("%s: indirect pointer #%ld mismatch %d != %d",
	3133	"softdep_write_inodeblock", adp->ad_lbn - NDADDR,
	3134	dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
	3135	}
	3136	deplist \|= 1 << adp->ad_lbn;
	3137	if ((adp->ad_state & ATTACHED) == 0) {
	3138	FREE_LOCK(&lk);
	3139	panic("softdep_write_inodeblock: Unknown state 0x%x",
	3140	adp->ad_state);
	3141	}
	3142	#endif /* DIAGNOSTIC */
	3143	adp->ad_state &= ~ATTACHED;
	3144	adp->ad_state \|= UNDONE;
	3145	}
	3146	/*
	3147	* The on-disk inode cannot claim to be any larger than the last
	3148	* fragment that has been written. Otherwise, the on-disk inode
	3149	* might have fragments that were not the last block in the file
	3150	* which would corrupt the filesystem.
	3151	*/
	3152	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	3153	lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
	3154	if (adp->ad_lbn >= NDADDR)
	3155	break;
	3156	dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
	3157	/* keep going until hitting a rollback to a frag */
	3158	if (adp->ad_oldsize == 0 \|\| adp->ad_oldsize == fs->fs_bsize)
	3159	continue;
	3160	dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
	3161	for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
	3162	#ifdef DIAGNOSTIC
	3163	if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
	3164	FREE_LOCK(&lk);
	3165	panic("softdep_write_inodeblock: lost dep1");
	3166	}
	3167	#endif /* DIAGNOSTIC */
	3168	dp->di_db[i] = 0;
	3169	}
	3170	for (i = 0; i < NIADDR; i++) {
	3171	#ifdef DIAGNOSTIC
	3172	if (dp->di_ib[i] != 0 &&
	3173	(deplist & ((1 << NDADDR) << i)) == 0) {
	3174	FREE_LOCK(&lk);
	3175	panic("softdep_write_inodeblock: lost dep2");
	3176	}
	3177	#endif /* DIAGNOSTIC */
	3178	dp->di_ib[i] = 0;
	3179	}
	3180	FREE_LOCK(&lk);
	3181	return;
	3182	}
	3183	/*
	3184	* If we have zero'ed out the last allocated block of the file,
	3185	* roll back the size to the last currently allocated block.
	3186	* We know that this last allocated block is a full-sized as
	3187	* we already checked for fragments in the loop above.
	3188	*/
	3189	if (lastadp != NULL &&
	3190	dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
	3191	for (i = lastadp->ad_lbn; i >= 0; i--)
	3192	if (dp->di_db[i] != 0)
	3193	break;
	3194	dp->di_size = (i + 1) * fs->fs_bsize;
	3195	}
	3196	/*
	3197	* The only dependencies are for indirect blocks.
	3198	*
	3199	* The file size for indirect block additions is not guaranteed.
	3200	* Such a guarantee would be non-trivial to achieve. The conventional
	3201	* synchronous write implementation also does not make this guarantee.
	3202	* Fsck should catch and fix discrepancies. Arguably, the file size
	3203	* can be over-estimated without destroying integrity when the file
	3204	* moves into the indirect blocks (i.e., is large). If we want to
	3205	* postpone fsck, we are stuck with this argument.
	3206	*/
	3207	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
	3208	dp->di_ib[adp->ad_lbn - NDADDR] = 0;
	3209	FREE_LOCK(&lk);
	3210	}
	3211
	3212	/*
	3213	* This routine is called during the completion interrupt
	3214	* service routine for a disk write (from the procedure called
	3215	* by the device driver to inform the file system caches of
	3216	* a request completion). It should be called early in this
	3217	* procedure, before the block is made available to other
	3218	* processes or other routines are called.
	3219	*/
	3220	static void
	3221	softdep_disk_write_complete(bp)
	3222	struct buf bp; / describes the completed disk write */
	3223	{
	3224	struct worklist *wk;
	3225	struct workhead reattach;
	3226	struct newblk *newblk;
	3227	struct allocindir *aip;
	3228	struct allocdirect *adp;
	3229	struct indirdep *indirdep;
	3230	struct inodedep *inodedep;
	3231	struct bmsafemap *bmsafemap;
	3232
	3233	#ifdef DEBUG
	3234	if (lk.lkt_held != NOHOLDER)
	3235	panic("softdep_disk_write_complete: lock is held");
	3236	lk.lkt_held = SPECIAL_FLAG;
	3237	#endif
	3238	LIST_INIT(&reattach);
	3239	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
	3240	WORKLIST_REMOVE(wk);
	3241	switch (wk->wk_type) {
	3242
	3243	case D_PAGEDEP:
	3244	if (handle_written_filepage(WK_PAGEDEP(wk), bp))
	3245	WORKLIST_INSERT(&reattach, wk);
	3246	continue;
	3247
	3248	case D_INODEDEP:
	3249	if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
	3250	WORKLIST_INSERT(&reattach, wk);
	3251	continue;
	3252
	3253	case D_BMSAFEMAP:
	3254	bmsafemap = WK_BMSAFEMAP(wk);
	3255	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
	3256	newblk->nb_state \|= DEPCOMPLETE;
	3257	newblk->nb_bmsafemap = NULL;
	3258	LIST_REMOVE(newblk, nb_deps);
	3259	}
	3260	while ((adp =
	3261	LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
	3262	adp->ad_state \|= DEPCOMPLETE;
	3263	adp->ad_buf = NULL;
	3264	LIST_REMOVE(adp, ad_deps);
	3265	handle_allocdirect_partdone(adp);
	3266	}
	3267	while ((aip =
	3268	LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
	3269	aip->ai_state \|= DEPCOMPLETE;
	3270	aip->ai_buf = NULL;
	3271	LIST_REMOVE(aip, ai_deps);
	3272	handle_allocindir_partdone(aip);
	3273	}
	3274	while ((inodedep =
	3275	LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
	3276	inodedep->id_state \|= DEPCOMPLETE;
	3277	LIST_REMOVE(inodedep, id_deps);
	3278	inodedep->id_buf = NULL;
	3279	}
	3280	WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
	3281	continue;
	3282
	3283	case D_MKDIR:
	3284	handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
	3285	continue;
	3286
	3287	case D_ALLOCDIRECT:
	3288	adp = WK_ALLOCDIRECT(wk);
	3289	adp->ad_state \|= COMPLETE;
	3290	handle_allocdirect_partdone(adp);
	3291	continue;
	3292
	3293	case D_ALLOCINDIR:
	3294	aip = WK_ALLOCINDIR(wk);
	3295	aip->ai_state \|= COMPLETE;
	3296	handle_allocindir_partdone(aip);
	3297	continue;
	3298
	3299	case D_INDIRDEP:
	3300	indirdep = WK_INDIRDEP(wk);
	3301	if (indirdep->ir_state & GOINGAWAY) {
	3302	lk.lkt_held = NOHOLDER;
	3303	panic("disk_write_complete: indirdep gone");
	3304	}
	3305	bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
	3306	FREE(indirdep->ir_saveddata, M_INDIRDEP);
	3307	indirdep->ir_saveddata = 0;
	3308	indirdep->ir_state &= ~UNDONE;
	3309	indirdep->ir_state \|= ATTACHED;
	3310	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
	3311	handle_allocindir_partdone(aip);
	3312	if (aip == LIST_FIRST(&indirdep->ir_donehd)) {
	3313	lk.lkt_held = NOHOLDER;
	3314	panic("disk_write_complete: not gone");
	3315	}
	3316	}
	3317	WORKLIST_INSERT(&reattach, wk);
	3318	if ((bp->b_flags & B_DELWRI) == 0)
	3319	stat_indir_blk_ptrs++;
	3320	bdirty(bp);
	3321	continue;
	3322
	3323	default:
	3324	lk.lkt_held = NOHOLDER;
	3325	panic("handle_disk_write_complete: Unknown type %s",
	3326	TYPENAME(wk->wk_type));
	3327	/* NOTREACHED */
	3328	}
	3329	}
	3330	/*
	3331	* Reattach any requests that must be redone.
	3332	*/
	3333	while ((wk = LIST_FIRST(&reattach)) != NULL) {
	3334	WORKLIST_REMOVE(wk);
	3335	WORKLIST_INSERT(&bp->b_dep, wk);
	3336	}
	3337	#ifdef DEBUG
	3338	if (lk.lkt_held != SPECIAL_FLAG)
	3339	panic("softdep_disk_write_complete: lock lost");
	3340	lk.lkt_held = NOHOLDER;
	3341	#endif
	3342	}
	3343
	3344	/*
	3345	* Called from within softdep_disk_write_complete above. Note that
	3346	* this routine is always called from interrupt level with further
	3347	* splbio interrupts blocked.
	3348	*/
	3349	static void
	3350	handle_allocdirect_partdone(adp)
	3351	struct allocdirect adp; / the completed allocdirect */
	3352	{
	3353	struct allocdirect *listadp;
	3354	struct inodedep *inodedep;
	3355	long bsize;
	3356
	3357	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
	3358	return;
	3359	if (adp->ad_buf != NULL) {
	3360	lk.lkt_held = NOHOLDER;
	3361	panic("handle_allocdirect_partdone: dangling dep");
	3362	}
	3363	/*
	3364	* The on-disk inode cannot claim to be any larger than the last
	3365	* fragment that has been written. Otherwise, the on-disk inode
	3366	* might have fragments that were not the last block in the file
	3367	* which would corrupt the filesystem. Thus, we cannot free any
	3368	* allocdirects after one whose ad_oldblkno claims a fragment as
	3369	* these blocks must be rolled back to zero before writing the inode.
	3370	* We check the currently active set of allocdirects in id_inoupdt.
	3371	*/
	3372	inodedep = adp->ad_inodedep;
	3373	bsize = inodedep->id_fs->fs_bsize;
	3374	TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) {
	3375	/* found our block */
	3376	if (listadp == adp)
	3377	break;
	3378	/* continue if ad_oldlbn is not a fragment */
	3379	if (listadp->ad_oldsize == 0 \|\|
	3380	listadp->ad_oldsize == bsize)
	3381	continue;
	3382	/* hit a fragment */
	3383	return;
	3384	}
	3385	/*
	3386	* If we have reached the end of the current list without
	3387	* finding the just finished dependency, then it must be
	3388	* on the future dependency list. Future dependencies cannot
	3389	* be freed until they are moved to the current list.
	3390	*/
	3391	if (listadp == NULL) {
	3392	#ifdef DEBUG
	3393	TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next)
	3394	/* found our block */
	3395	if (listadp == adp)
	3396	break;
	3397	if (listadp == NULL) {
	3398	lk.lkt_held = NOHOLDER;
	3399	panic("handle_allocdirect_partdone: lost dep");
	3400	}
	3401	#endif /* DEBUG */
	3402	return;
	3403	}
	3404	/*
	3405	* If we have found the just finished dependency, then free
	3406	* it along with anything that follows it that is complete.
	3407	*/
	3408	for (; adp; adp = listadp) {
	3409	listadp = TAILQ_NEXT(adp, ad_next);
	3410	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
	3411	return;
	3412	free_allocdirect(&inodedep->id_inoupdt, adp, 1);
	3413	}
	3414	}
	3415
	3416	/*
	3417	* Called from within softdep_disk_write_complete above. Note that
	3418	* this routine is always called from interrupt level with further
	3419	* splbio interrupts blocked.
	3420	*/
	3421	static void
	3422	handle_allocindir_partdone(aip)
	3423	struct allocindir aip; / the completed allocindir */
	3424	{
	3425	struct indirdep *indirdep;
	3426
	3427	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
	3428	return;
	3429	if (aip->ai_buf != NULL) {
	3430	lk.lkt_held = NOHOLDER;
	3431	panic("handle_allocindir_partdone: dangling dependency");
	3432	}
	3433	indirdep = aip->ai_indirdep;
	3434	if (indirdep->ir_state & UNDONE) {
	3435	LIST_REMOVE(aip, ai_next);
	3436	LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
	3437	return;
	3438	}
	3439	((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
	3440	aip->ai_newblkno;
	3441	LIST_REMOVE(aip, ai_next);
	3442	if (aip->ai_freefrag != NULL)
	3443	add_to_worklist(&aip->ai_freefrag->ff_list);
	3444	WORKITEM_FREE(aip, D_ALLOCINDIR);
	3445	}
	3446
	3447	/*
	3448	* Called from within softdep_disk_write_complete above to restore
	3449	* in-memory inode block contents to their most up-to-date state. Note
	3450	* that this routine is always called from interrupt level with further
	3451	* splbio interrupts blocked.
	3452	*/
	3453	static int
	3454	handle_written_inodeblock(inodedep, bp)
	3455	struct inodedep *inodedep;
	3456	struct buf bp; / buffer containing the inode block */
	3457	{
	3458	struct worklist wk, filefree;
	3459	struct allocdirect adp, nextadp;
	3460	struct dinode *dp;
	3461	int hadchanges;
	3462
	3463	if ((inodedep->id_state & IOSTARTED) == 0) {
	3464	lk.lkt_held = NOHOLDER;
	3465	panic("handle_written_inodeblock: not started");
	3466	}
	3467	inodedep->id_state &= ~IOSTARTED;
	3468	inodedep->id_state \|= COMPLETE;
	3469	dp = (struct dinode *)bp->b_data +
	3470	ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
	3471	/*
	3472	* If we had to rollback the inode allocation because of
	3473	* bitmaps being incomplete, then simply restore it.
	3474	* Keep the block dirty so that it will not be reclaimed until
	3475	* all associated dependencies have been cleared and the
	3476	* corresponding updates written to disk.
	3477	*/
	3478	if (inodedep->id_savedino != NULL) {
	3479	dp = inodedep->id_savedino;
	3480	FREE(inodedep->id_savedino, M_INODEDEP);
	3481	inodedep->id_savedino = NULL;
	3482	if ((bp->b_flags & B_DELWRI) == 0)
	3483	stat_inode_bitmap++;
	3484	bdirty(bp);
	3485	return (1);
	3486	}
	3487	/*
	3488	* Roll forward anything that had to be rolled back before
	3489	* the inode could be updated.
	3490	*/
	3491	hadchanges = 0;
	3492	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
	3493	nextadp = TAILQ_NEXT(adp, ad_next);
	3494	if (adp->ad_state & ATTACHED) {
	3495	lk.lkt_held = NOHOLDER;
	3496	panic("handle_written_inodeblock: new entry");
	3497	}
	3498	if (adp->ad_lbn < NDADDR) {
	3499	if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno) {
	3500	lk.lkt_held = NOHOLDER;
	3501	panic("%s: %s #%ld mismatch %d != %d",
	3502	"handle_written_inodeblock",
	3503	"direct pointer", adp->ad_lbn,
	3504	dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
	3505	}
	3506	dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
	3507	} else {
	3508	if (dp->di_ib[adp->ad_lbn - NDADDR] != 0) {
	3509	lk.lkt_held = NOHOLDER;
	3510	panic("%s: %s #%ld allocated as %d",
	3511	"handle_written_inodeblock",
	3512	"indirect pointer", adp->ad_lbn - NDADDR,
	3513	dp->di_ib[adp->ad_lbn - NDADDR]);
	3514	}
	3515	dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
	3516	}
	3517	adp->ad_state &= ~UNDONE;
	3518	adp->ad_state \|= ATTACHED;
	3519	hadchanges = 1;
	3520	}
	3521	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
	3522	stat_direct_blk_ptrs++;
	3523	/*
	3524	* Reset the file size to its most up-to-date value.
	3525	*/
	3526	if (inodedep->id_savedsize == -1) {
	3527	lk.lkt_held = NOHOLDER;
	3528	panic("handle_written_inodeblock: bad size");
	3529	}
	3530	if (dp->di_size != inodedep->id_savedsize) {
	3531	dp->di_size = inodedep->id_savedsize;
	3532	hadchanges = 1;
	3533	}
	3534	inodedep->id_savedsize = -1;
	3535	/*
	3536	* If there were any rollbacks in the inode block, then it must be
	3537	* marked dirty so that its will eventually get written back in
	3538	* its correct form.
	3539	*/
	3540	if (hadchanges)
	3541	bdirty(bp);
	3542	/*
	3543	* Process any allocdirects that completed during the update.
	3544	*/
	3545	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
	3546	handle_allocdirect_partdone(adp);
	3547	/*
	3548	* Process deallocations that were held pending until the
	3549	* inode had been written to disk. Freeing of the inode
	3550	* is delayed until after all blocks have been freed to
	3551	* avoid creation of new <vfsid, inum, lbn> triples
	3552	* before the old ones have been deleted.
	3553	*/
	3554	filefree = NULL;
	3555	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
	3556	WORKLIST_REMOVE(wk);
	3557	switch (wk->wk_type) {
	3558
	3559	case D_FREEFILE:
	3560	/*
	3561	* We defer adding filefree to the worklist until
	3562	* all other additions have been made to ensure
	3563	* that it will be done after all the old blocks
	3564	* have been freed.
	3565	*/
	3566	if (filefree != NULL) {
	3567	lk.lkt_held = NOHOLDER;
	3568	panic("handle_written_inodeblock: filefree");
	3569	}
	3570	filefree = wk;
	3571	continue;
	3572
	3573	case D_MKDIR:
	3574	handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
	3575	continue;
	3576
	3577	case D_DIRADD:
	3578	diradd_inode_written(WK_DIRADD(wk), inodedep);
	3579	continue;
	3580
	3581	case D_FREEBLKS:
	3582	case D_FREEFRAG:
	3583	case D_DIRREM:
	3584	add_to_worklist(wk);
	3585	continue;
	3586
	3587	default:
	3588	lk.lkt_held = NOHOLDER;
	3589	panic("handle_written_inodeblock: Unknown type %s",
	3590	TYPENAME(wk->wk_type));
	3591	/* NOTREACHED */
	3592	}
	3593	}
	3594	if (filefree != NULL) {
	3595	if (free_inodedep(inodedep) == 0) {
	3596	lk.lkt_held = NOHOLDER;
	3597	panic("handle_written_inodeblock: live inodedep");
	3598	}
	3599	add_to_worklist(filefree);
	3600	return (0);
	3601	}
	3602
	3603	/*
	3604	* If no outstanding dependencies, free it.
	3605	*/
	3606	if (free_inodedep(inodedep) \|\| TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
	3607	return (0);
	3608	return (hadchanges);
	3609	}
	3610
	3611	/*
	3612	* Process a diradd entry after its dependent inode has been written.
	3613	* This routine must be called with splbio interrupts blocked.
	3614	*/
	3615	static void
	3616	diradd_inode_written(dap, inodedep)
	3617	struct diradd *dap;
	3618	struct inodedep *inodedep;
	3619	{
	3620	struct pagedep *pagedep;
	3621
	3622	dap->da_state \|= COMPLETE;
	3623	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
	3624	if (dap->da_state & DIRCHG)
	3625	pagedep = dap->da_previous->dm_pagedep;
	3626	else
	3627	pagedep = dap->da_pagedep;
	3628	LIST_REMOVE(dap, da_pdlist);
	3629	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	3630	}
	3631	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
	3632	}
	3633
	3634	/*
	3635	* Handle the completion of a mkdir dependency.
	3636	*/
	3637	static void
	3638	handle_written_mkdir(mkdir, type)
	3639	struct mkdir *mkdir;
	3640	int type;
	3641	{
	3642	struct diradd *dap;
	3643	struct pagedep *pagedep;
	3644
	3645	if (mkdir->md_state != type) {
	3646	lk.lkt_held = NOHOLDER;
	3647	panic("handle_written_mkdir: bad type");
	3648	}
	3649	dap = mkdir->md_diradd;
	3650	dap->da_state &= ~type;
	3651	if ((dap->da_state & (MKDIR_PARENT \| MKDIR_BODY)) == 0)
	3652	dap->da_state \|= DEPCOMPLETE;
	3653	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
	3654	if (dap->da_state & DIRCHG)
	3655	pagedep = dap->da_previous->dm_pagedep;
	3656	else
	3657	pagedep = dap->da_pagedep;
	3658	LIST_REMOVE(dap, da_pdlist);
	3659	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	3660	}
	3661	LIST_REMOVE(mkdir, md_mkdirs);
	3662	WORKITEM_FREE(mkdir, D_MKDIR);
	3663	}
	3664
	3665	/*
	3666	* Called from within softdep_disk_write_complete above.
	3667	* A write operation was just completed. Removed inodes can
	3668	* now be freed and associated block pointers may be committed.
	3669	* Note that this routine is always called from interrupt level
	3670	* with further splbio interrupts blocked.
	3671	*/
	3672	static int
	3673	handle_written_filepage(pagedep, bp)
	3674	struct pagedep *pagedep;
	3675	struct buf bp; / buffer containing the written page */
	3676	{
	3677	struct dirrem *dirrem;
	3678	struct diradd dap, nextdap;
	3679	struct direct *ep;
	3680	int i, chgs;
	3681
	3682	if ((pagedep->pd_state & IOSTARTED) == 0) {
	3683	lk.lkt_held = NOHOLDER;
	3684	panic("handle_written_filepage: not started");
	3685	}
	3686	pagedep->pd_state &= ~IOSTARTED;
	3687	/*
	3688	* Process any directory removals that have been committed.
	3689	*/
	3690	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
	3691	LIST_REMOVE(dirrem, dm_next);
	3692	dirrem->dm_dirinum = pagedep->pd_ino;
	3693	add_to_worklist(&dirrem->dm_list);
	3694	}
	3695	/*
	3696	* Free any directory additions that have been committed.
	3697	*/
	3698	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
	3699	free_diradd(dap);
	3700	/*
	3701	* Uncommitted directory entries must be restored.
	3702	*/
	3703	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
	3704	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
	3705	dap = nextdap) {
	3706	nextdap = LIST_NEXT(dap, da_pdlist);
	3707	if (dap->da_state & ATTACHED) {
	3708	lk.lkt_held = NOHOLDER;
	3709	panic("handle_written_filepage: attached");
	3710	}
	3711	ep = (struct direct *)
	3712	((char *)bp->b_data + dap->da_offset);
	3713	ep->d_ino = dap->da_newinum;
	3714	dap->da_state &= ~UNDONE;
	3715	dap->da_state \|= ATTACHED;
	3716	chgs = 1;
	3717	/*
	3718	* If the inode referenced by the directory has
	3719	* been written out, then the dependency can be
	3720	* moved to the pending list.
	3721	*/
	3722	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
	3723	LIST_REMOVE(dap, da_pdlist);
	3724	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
	3725	da_pdlist);
	3726	}
	3727	}
	3728	}
	3729	/*
	3730	* If there were any rollbacks in the directory, then it must be
	3731	* marked dirty so that its will eventually get written back in
	3732	* its correct form.
	3733	*/
	3734	if (chgs) {
	3735	if ((bp->b_flags & B_DELWRI) == 0)
	3736	stat_dir_entry++;
	3737	bdirty(bp);
	3738	}
	3739	/*
	3740	* If no dependencies remain, the pagedep will be freed.
	3741	* Otherwise it will remain to update the page before it
	3742	* is written back to disk.
	3743	*/
	3744	if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
	3745	for (i = 0; i < DAHASHSZ; i++)
	3746	if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
	3747	break;
	3748	if (i == DAHASHSZ) {
	3749	LIST_REMOVE(pagedep, pd_hash);
	3750	WORKITEM_FREE(pagedep, D_PAGEDEP);
	3751	return (0);
	3752	}
	3753	}
	3754	return (1);
	3755	}
	3756
	3757	/*
	3758	* Writing back in-core inode structures.
	3759	*
	3760	* The file system only accesses an inode's contents when it occupies an
	3761	* "in-core" inode structure. These "in-core" structures are separate from
	3762	* the page frames used to cache inode blocks. Only the latter are
	3763	* transferred to/from the disk. So, when the updated contents of the
	3764	* "in-core" inode structure are copied to the corresponding in-memory inode
	3765	* block, the dependencies are also transferred. The following procedure is
	3766	* called when copying a dirty "in-core" inode to a cached inode block.
	3767	*/
	3768
	3769	/*
	3770	* Called when an inode is loaded from disk. If the effective link count
	3771	* differed from the actual link count when it was last flushed, then we
	3772	* need to ensure that the correct effective link count is put back.
	3773	*/
	3774	void
	3775	softdep_load_inodeblock(ip)
	3776	struct inode ip; / the "in_core" copy of the inode */
	3777	{
	3778	struct inodedep *inodedep;
	3779
	3780	/*
	3781	* Check for alternate nlink count.
	3782	*/
	3783	ip->i_effnlink = ip->i_nlink;
	3784	ACQUIRE_LOCK(&lk);
	3785	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
	3786	FREE_LOCK(&lk);
	3787	return;
	3788	}
	3789	ip->i_effnlink -= inodedep->id_nlinkdelta;
	3790	FREE_LOCK(&lk);
	3791	}
	3792
	3793	/*
	3794	* This routine is called just before the "in-core" inode
	3795	* information is to be copied to the in-memory inode block.
	3796	* Recall that an inode block contains several inodes. If
	3797	* the force flag is set, then the dependencies will be
	3798	* cleared so that the update can always be made. Note that
	3799	* the buffer is locked when this routine is called, so we
	3800	* will never be in the middle of writing the inode block
	3801	* to disk.
	3802	*/
	3803	void
	3804	softdep_update_inodeblock(ip, bp, waitfor)
	3805	struct inode ip; / the "in_core" copy of the inode */
	3806	struct buf bp; / the buffer containing the inode block */
	3807	int waitfor; /* nonzero => update must be allowed */
	3808	{
	3809	struct inodedep *inodedep;
	3810	struct worklist *wk;
	3811	int error, gotit;
	3812
	3813	/*
	3814	* If the effective link count is not equal to the actual link
	3815	* count, then we must track the difference in an inodedep while
	3816	* the inode is (potentially) tossed out of the cache. Otherwise,
	3817	* if there is no existing inodedep, then there are no dependencies
	3818	* to track.
	3819	*/
	3820	ACQUIRE_LOCK(&lk);
	3821	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
	3822	FREE_LOCK(&lk);
	3823	if (ip->i_effnlink != ip->i_nlink)
	3824	panic("softdep_update_inodeblock: bad link count");
	3825	return;
	3826	}
	3827	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) {
	3828	FREE_LOCK(&lk);
	3829	panic("softdep_update_inodeblock: bad delta");
	3830	}
	3831	/*
	3832	* Changes have been initiated. Anything depending on these
	3833	* changes cannot occur until this inode has been written.
	3834	*/
	3835	inodedep->id_state &= ~COMPLETE;
	3836	if ((inodedep->id_state & ONWORKLIST) == 0)
	3837	WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
	3838	/*
	3839	* Any new dependencies associated with the incore inode must
	3840	* now be moved to the list associated with the buffer holding
	3841	* the in-memory copy of the inode. Once merged process any
	3842	* allocdirects that are completed by the merger.
	3843	*/
	3844	merge_inode_lists(inodedep);
	3845	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
	3846	handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
	3847	/*
	3848	* Now that the inode has been pushed into the buffer, the
	3849	* operations dependent on the inode being written to disk
	3850	* can be moved to the id_bufwait so that they will be
	3851	* processed when the buffer I/O completes.
	3852	*/
	3853	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
	3854	WORKLIST_REMOVE(wk);
	3855	WORKLIST_INSERT(&inodedep->id_bufwait, wk);
	3856	}
	3857	/*
	3858	* Newly allocated inodes cannot be written until the bitmap
	3859	* that allocates them have been written (indicated by
	3860	* DEPCOMPLETE being set in id_state). If we are doing a
	3861	* forced sync (e.g., an fsync on a file), we force the bitmap
	3862	* to be written so that the update can be done.
	3863	*/
	3864	if ((inodedep->id_state & DEPCOMPLETE) != 0 \|\| waitfor == 0) {
	3865	FREE_LOCK(&lk);
	3866	return;
	3867	}
	3868	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
	3869	FREE_LOCK(&lk);
	3870	if (gotit &&
	3871	(error = VOP_BWRITE(inodedep->id_buf->b_vp, inodedep->id_buf)) != 0)
	3872	softdep_error("softdep_update_inodeblock: bwrite", error);
	3873	if ((inodedep->id_state & DEPCOMPLETE) == 0)
	3874	panic("softdep_update_inodeblock: update failed");
	3875	}
	3876
	3877	/*
	3878	* Merge the new inode dependency list (id_newinoupdt) into the old
	3879	* inode dependency list (id_inoupdt). This routine must be called
	3880	* with splbio interrupts blocked.
	3881	*/
	3882	static void
	3883	merge_inode_lists(inodedep)
	3884	struct inodedep *inodedep;
	3885	{
	3886	struct allocdirect listadp, newadp;
	3887
	3888	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
	3889	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
	3890	if (listadp->ad_lbn < newadp->ad_lbn) {
	3891	listadp = TAILQ_NEXT(listadp, ad_next);
	3892	continue;
	3893	}
	3894	TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
	3895	TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
	3896	if (listadp->ad_lbn == newadp->ad_lbn) {
	3897	allocdirect_merge(&inodedep->id_inoupdt, newadp,
	3898	listadp);
	3899	listadp = newadp;
	3900	}
	3901	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
	3902	}
	3903	while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
	3904	TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
	3905	TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
	3906	}
	3907	}
	3908
	3909	/*
	3910	* If we are doing an fsync, then we must ensure that any directory
	3911	* entries for the inode have been written after the inode gets to disk.
	3912	*/
	3913	static int
	3914	softdep_fsync(vp)
	3915	struct vnode vp; / the "in_core" copy of the inode */
	3916	{
	3917	struct inodedep *inodedep;
	3918	struct pagedep *pagedep;
	3919	struct worklist *wk;
	3920	struct diradd *dap;
	3921	struct mount *mnt;
	3922	struct vnode *pvp;
	3923	struct inode *ip;
	3924	struct buf *bp;
	3925	struct fs *fs;
	3926	struct thread td = curthread; / XXX */
	3927	struct proc *p = td->td_proc;
	3928	int error, flushparent;
	3929	ino_t parentino;
	3930	ufs_lbn_t lbn;
	3931
	3932	KKASSERT(p);
	3933
	3934	ip = VTOI(vp);
	3935	fs = ip->i_fs;
	3936	ACQUIRE_LOCK(&lk);
	3937	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
	3938	FREE_LOCK(&lk);
	3939	return (0);
	3940	}
	3941	if (LIST_FIRST(&inodedep->id_inowait) != NULL \|\|
	3942	LIST_FIRST(&inodedep->id_bufwait) != NULL \|\|
	3943	TAILQ_FIRST(&inodedep->id_inoupdt) != NULL \|\|
	3944	TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
	3945	FREE_LOCK(&lk);
	3946	panic("softdep_fsync: pending ops");
	3947	}
	3948	for (error = 0, flushparent = 0; ; ) {
	3949	if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
	3950	break;
	3951	if (wk->wk_type != D_DIRADD) {
	3952	FREE_LOCK(&lk);
	3953	panic("softdep_fsync: Unexpected type %s",
	3954	TYPENAME(wk->wk_type));
	3955	}
	3956	dap = WK_DIRADD(wk);
	3957	/*
	3958	* Flush our parent if this directory entry
	3959	* has a MKDIR_PARENT dependency.
	3960	*/
	3961	if (dap->da_state & DIRCHG)
	3962	pagedep = dap->da_previous->dm_pagedep;
	3963	else
	3964	pagedep = dap->da_pagedep;
	3965	mnt = pagedep->pd_mnt;
	3966	parentino = pagedep->pd_ino;
	3967	lbn = pagedep->pd_lbn;
	3968	if ((dap->da_state & (MKDIR_BODY \| COMPLETE)) != COMPLETE) {
	3969	FREE_LOCK(&lk);
	3970	panic("softdep_fsync: dirty");
	3971	}
	3972	flushparent = dap->da_state & MKDIR_PARENT;
	3973	/*
	3974	* If we are being fsync'ed as part of vgone'ing this vnode,
	3975	* then we will not be able to release and recover the
	3976	* vnode below, so we just have to give up on writing its
	3977	* directory entry out. It will eventually be written, just
	3978	* not now, but then the user was not asking to have it
	3979	* written, so we are not breaking any promises.
	3980	*/
	3981	if (vp->v_flag & VXLOCK)
	3982	break;
	3983	/*
	3984	* We prevent deadlock by always fetching inodes from the
	3985	* root, moving down the directory tree. Thus, when fetching
	3986	* our parent directory, we must unlock ourselves before
	3987	* requesting the lock on our parent. See the comment in
	3988	* ufs_lookup for details on possible races.
	3989	*/
	3990	FREE_LOCK(&lk);
	3991	VOP_UNLOCK(vp, 0, td);
	3992	error = VFS_VGET(mnt, parentino, &pvp);
	3993	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY, td);
	3994	if (error != 0)
	3995	return (error);
	3996	if (flushparent) {
	3997	if ((error = UFS_UPDATE(pvp, 1)) != 0) {
	3998	vput(pvp);
	3999	return (error);
	4000	}
	4001	}
	4002	/*
	4003	* Flush directory page containing the inode's name.
	4004	*/
	4005	error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
	4006	&bp);
	4007	if (error == 0)
	4008	error = VOP_BWRITE(bp->b_vp, bp);
	4009	vput(pvp);
	4010	if (error != 0)
	4011	return (error);
	4012	ACQUIRE_LOCK(&lk);
	4013	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
	4014	break;
	4015	}
	4016	FREE_LOCK(&lk);
	4017	return (0);
	4018	}
	4019
	4020	/*
	4021	* Flush all the dirty bitmaps associated with the block device
	4022	* before flushing the rest of the dirty blocks so as to reduce
	4023	* the number of dependencies that will have to be rolled back.
	4024	*/
	4025	void
	4026	softdep_fsync_mountdev(vp)
	4027	struct vnode *vp;
	4028	{
	4029	struct buf bp, nbp;
	4030	struct worklist *wk;
	4031
	4032	if (!vn_isdisk(vp, NULL))
	4033	panic("softdep_fsync_mountdev: vnode not a disk");
	4034	ACQUIRE_LOCK(&lk);
	4035	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
	4036	nbp = TAILQ_NEXT(bp, b_vnbufs);
	4037	/*
	4038	* If it is already scheduled, skip to the next buffer.
	4039	*/
	4040	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT))
	4041	continue;
	4042	if ((bp->b_flags & B_DELWRI) == 0) {
	4043	FREE_LOCK(&lk);
	4044	panic("softdep_fsync_mountdev: not dirty");
	4045	}
	4046	/*
	4047	* We are only interested in bitmaps with outstanding
	4048	* dependencies.
	4049	*/
	4050	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL \|\|
	4051	wk->wk_type != D_BMSAFEMAP \|\|
	4052	(bp->b_xflags & BX_BKGRDINPROG)) {
	4053	BUF_UNLOCK(bp);
	4054	continue;
	4055	}
	4056	bremfree(bp);
	4057	FREE_LOCK(&lk);
	4058	(void) bawrite(bp);
	4059	ACQUIRE_LOCK(&lk);
	4060	/*
	4061	* Since we may have slept during the I/O, we need
	4062	* to start from a known point.
	4063	*/
	4064	nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
	4065	}
	4066	drain_output(vp, 1);
	4067	FREE_LOCK(&lk);
	4068	}
	4069
	4070	/*
	4071	* This routine is called when we are trying to synchronously flush a
	4072	* file. This routine must eliminate any filesystem metadata dependencies
	4073	* so that the syncing routine can succeed by pushing the dirty blocks
	4074	* associated with the file. If any I/O errors occur, they are returned.
	4075	*/
	4076	int
	4077	softdep_sync_metadata(ap)
	4078	struct vop_fsync_args /* {
	4079	struct vnode *a_vp;
	4080	struct ucred *a_cred;
	4081	int a_waitfor;
	4082	struct proc *a_p;
	4083	} / ap;
	4084	{
	4085	struct vnode *vp = ap->a_vp;
	4086	struct pagedep *pagedep;
	4087	struct allocdirect *adp;
	4088	struct allocindir *aip;
	4089	struct buf bp, nbp;
	4090	struct worklist *wk;
	4091	int i, error, waitfor;
	4092
	4093	/*
	4094	* Check whether this vnode is involved in a filesystem
	4095	* that is doing soft dependency processing.
	4096	*/
	4097	if (!vn_isdisk(vp, NULL)) {
	4098	if (!DOINGSOFTDEP(vp))
	4099	return (0);
	4100	} else
	4101	if (vp->v_specmountpoint == NULL \|\|
	4102	(vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0)
	4103	return (0);
	4104	/*
	4105	* Ensure that any direct block dependencies have been cleared.
	4106	*/
	4107	ACQUIRE_LOCK(&lk);
	4108	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
	4109	FREE_LOCK(&lk);
	4110	return (error);
	4111	}
	4112	/*
	4113	* For most files, the only metadata dependencies are the
	4114	* cylinder group maps that allocate their inode or blocks.
	4115	* The block allocation dependencies can be found by traversing
	4116	* the dependency lists for any buffers that remain on their
	4117	* dirty buffer list. The inode allocation dependency will
	4118	* be resolved when the inode is updated with MNT_WAIT.
	4119	* This work is done in two passes. The first pass grabs most
	4120	* of the buffers and begins asynchronously writing them. The
	4121	* only way to wait for these asynchronous writes is to sleep
	4122	* on the filesystem vnode which may stay busy for a long time
	4123	* if the filesystem is active. So, instead, we make a second
	4124	* pass over the dependencies blocking on each write. In the
	4125	* usual case we will be blocking against a write that we
	4126	* initiated, so when it is done the dependency will have been
	4127	* resolved. Thus the second pass is expected to end quickly.
	4128	*/
	4129	waitfor = MNT_NOWAIT;
	4130	top:
	4131	/*
	4132	* We must wait for any I/O in progress to finish so that
	4133	* all potential buffers on the dirty list will be visible.
	4134	*/
	4135	drain_output(vp, 1);
	4136	if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
	4137	FREE_LOCK(&lk);
	4138	return (0);
	4139	}
	4140	bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
	4141	loop:
	4142	/*
	4143	* As we hold the buffer locked, none of its dependencies
	4144	* will disappear.
	4145	*/
	4146	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
	4147	switch (wk->wk_type) {
	4148
	4149	case D_ALLOCDIRECT:
	4150	adp = WK_ALLOCDIRECT(wk);
	4151	if (adp->ad_state & DEPCOMPLETE)
	4152	break;
	4153	nbp = adp->ad_buf;
	4154	if (getdirtybuf(&nbp, waitfor) == 0)
	4155	break;
	4156	FREE_LOCK(&lk);
	4157	if (waitfor == MNT_NOWAIT) {
	4158	bawrite(nbp);
	4159	} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
	4160	bawrite(bp);
	4161	return (error);
	4162	}
	4163	ACQUIRE_LOCK(&lk);
	4164	break;
	4165
	4166	case D_ALLOCINDIR:
	4167	aip = WK_ALLOCINDIR(wk);
	4168	if (aip->ai_state & DEPCOMPLETE)
	4169	break;
	4170	nbp = aip->ai_buf;
	4171	if (getdirtybuf(&nbp, waitfor) == 0)
	4172	break;
	4173	FREE_LOCK(&lk);
	4174	if (waitfor == MNT_NOWAIT) {
	4175	bawrite(nbp);
	4176	} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
	4177	bawrite(bp);
	4178	return (error);
	4179	}
	4180	ACQUIRE_LOCK(&lk);
	4181	break;
	4182
	4183	case D_INDIRDEP:
	4184	restart:
	4185
	4186	LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
	4187	if (aip->ai_state & DEPCOMPLETE)
	4188	continue;
	4189	nbp = aip->ai_buf;
	4190	if (getdirtybuf(&nbp, MNT_WAIT) == 0)
	4191	goto restart;
	4192	FREE_LOCK(&lk);
	4193	if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
	4194	bawrite(bp);
	4195	return (error);
	4196	}
	4197	ACQUIRE_LOCK(&lk);
	4198	goto restart;
	4199	}
	4200	break;
	4201
	4202	case D_INODEDEP:
	4203	if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
	4204	WK_INODEDEP(wk)->id_ino)) != 0) {
	4205	FREE_LOCK(&lk);
	4206	bawrite(bp);
	4207	return (error);
	4208	}
	4209	break;
	4210
	4211	case D_PAGEDEP:
	4212	/*
	4213	* We are trying to sync a directory that may
	4214	* have dependencies on both its own metadata
	4215	* and/or dependencies on the inodes of any
	4216	* recently allocated files. We walk its diradd
	4217	* lists pushing out the associated inode.
	4218	*/
	4219	pagedep = WK_PAGEDEP(wk);
	4220	for (i = 0; i < DAHASHSZ; i++) {
	4221	if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
	4222	continue;
	4223	if ((error =
	4224	flush_pagedep_deps(vp, pagedep->pd_mnt,
	4225	&pagedep->pd_diraddhd[i]))) {
	4226	FREE_LOCK(&lk);
	4227	bawrite(bp);
	4228	return (error);
	4229	}
	4230	}
	4231	break;
	4232
	4233	case D_MKDIR:
	4234	/*
	4235	* This case should never happen if the vnode has
	4236	* been properly sync'ed. However, if this function
	4237	* is used at a place where the vnode has not yet
	4238	* been sync'ed, this dependency can show up. So,
	4239	* rather than panic, just flush it.
	4240	*/
	4241	nbp = WK_MKDIR(wk)->md_buf;
	4242	if (getdirtybuf(&nbp, waitfor) == 0)
	4243	break;
	4244	FREE_LOCK(&lk);
	4245	if (waitfor == MNT_NOWAIT) {
	4246	bawrite(nbp);
	4247	} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
	4248	bawrite(bp);
	4249	return (error);
	4250	}
	4251	ACQUIRE_LOCK(&lk);
	4252	break;
	4253
	4254	case D_BMSAFEMAP:
	4255	/*
	4256	* This case should never happen if the vnode has
	4257	* been properly sync'ed. However, if this function
	4258	* is used at a place where the vnode has not yet
	4259	* been sync'ed, this dependency can show up. So,
	4260	* rather than panic, just flush it.
	4261	*/
	4262	nbp = WK_BMSAFEMAP(wk)->sm_buf;
	4263	if (getdirtybuf(&nbp, waitfor) == 0)
	4264	break;
	4265	FREE_LOCK(&lk);
	4266	if (waitfor == MNT_NOWAIT) {
	4267	bawrite(nbp);
	4268	} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
	4269	bawrite(bp);
	4270	return (error);
	4271	}
	4272	ACQUIRE_LOCK(&lk);
	4273	break;
	4274
	4275	default:
	4276	FREE_LOCK(&lk);
	4277	panic("softdep_sync_metadata: Unknown type %s",
	4278	TYPENAME(wk->wk_type));
	4279	/* NOTREACHED */
	4280	}
	4281	}
	4282	(void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT);
	4283	nbp = TAILQ_NEXT(bp, b_vnbufs);
	4284	FREE_LOCK(&lk);
	4285	bawrite(bp);
	4286	ACQUIRE_LOCK(&lk);
	4287	if (nbp != NULL) {
	4288	bp = nbp;
	4289	goto loop;
	4290	}
	4291	/*
	4292	* The brief unlock is to allow any pent up dependency
	4293	* processing to be done. Then proceed with the second pass.
	4294	*/
	4295	if (waitfor == MNT_NOWAIT) {
	4296	waitfor = MNT_WAIT;
	4297	FREE_LOCK(&lk);
	4298	ACQUIRE_LOCK(&lk);
	4299	goto top;
	4300	}
	4301
	4302	/*
	4303	* If we have managed to get rid of all the dirty buffers,
	4304	* then we are done. For certain directories and block
	4305	* devices, we may need to do further work.
	4306	*/
	4307	if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) {
	4308	FREE_LOCK(&lk);
	4309	return (0);
	4310	}
	4311
	4312	FREE_LOCK(&lk);
	4313	/*
	4314	* If we are trying to sync a block device, some of its buffers may
	4315	* contain metadata that cannot be written until the contents of some
	4316	* partially written files have been written to disk. The only easy
	4317	* way to accomplish this is to sync the entire filesystem (luckily
	4318	* this happens rarely).
	4319	*
	4320	* We must wait for any I/O in progress to finish so that
	4321	* all potential buffers on the dirty list will be visible.
	4322	*/
	4323	drain_output(vp, 1);
	4324	if (vn_isdisk(vp, NULL) &&
	4325	vp->v_specmountpoint && !VOP_ISLOCKED(vp, NULL) &&
	4326	(error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred,
	4327	ap->a_td)) != 0)
	4328	return (error);
	4329	return (0);
	4330	}
	4331
	4332	/*
	4333	* Flush the dependencies associated with an inodedep.
	4334	* Called with splbio blocked.
	4335	*/
	4336	static int
	4337	flush_inodedep_deps(fs, ino)
	4338	struct fs *fs;
	4339	ino_t ino;
	4340	{
	4341	struct inodedep *inodedep;
	4342	struct allocdirect *adp;
	4343	int error, waitfor;
	4344	struct buf *bp;
	4345
	4346	/*
	4347	* This work is done in two passes. The first pass grabs most
	4348	* of the buffers and begins asynchronously writing them. The
	4349	* only way to wait for these asynchronous writes is to sleep
	4350	* on the filesystem vnode which may stay busy for a long time
	4351	* if the filesystem is active. So, instead, we make a second
	4352	* pass over the dependencies blocking on each write. In the
	4353	* usual case we will be blocking against a write that we
	4354	* initiated, so when it is done the dependency will have been
	4355	* resolved. Thus the second pass is expected to end quickly.
	4356	* We give a brief window at the top of the loop to allow
	4357	* any pending I/O to complete.
	4358	*/
	4359	for (waitfor = MNT_NOWAIT; ; ) {
	4360	FREE_LOCK(&lk);
	4361	ACQUIRE_LOCK(&lk);
	4362	if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
	4363	return (0);
	4364	TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) {
	4365	if (adp->ad_state & DEPCOMPLETE)
	4366	continue;
	4367	bp = adp->ad_buf;
	4368	if (getdirtybuf(&bp, waitfor) == 0) {
	4369	if (waitfor == MNT_NOWAIT)
	4370	continue;
	4371	break;
	4372	}
	4373	FREE_LOCK(&lk);
	4374	if (waitfor == MNT_NOWAIT) {
	4375	bawrite(bp);
	4376	} else if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) {
	4377	ACQUIRE_LOCK(&lk);
	4378	return (error);
	4379	}
	4380	ACQUIRE_LOCK(&lk);
	4381	break;
	4382	}
	4383	if (adp != NULL)
	4384	continue;
	4385	TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) {
	4386	if (adp->ad_state & DEPCOMPLETE)
	4387	continue;
	4388	bp = adp->ad_buf;
	4389	if (getdirtybuf(&bp, waitfor) == 0) {
	4390	if (waitfor == MNT_NOWAIT)
	4391	continue;
	4392	break;
	4393	}
	4394	FREE_LOCK(&lk);
	4395	if (waitfor == MNT_NOWAIT) {
	4396	bawrite(bp);
	4397	} else if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) {
	4398	ACQUIRE_LOCK(&lk);
	4399	return (error);
	4400	}
	4401	ACQUIRE_LOCK(&lk);
	4402	break;
	4403	}
	4404	if (adp != NULL)
	4405	continue;
	4406	/*
	4407	* If pass2, we are done, otherwise do pass 2.
	4408	*/
	4409	if (waitfor == MNT_WAIT)
	4410	break;
	4411	waitfor = MNT_WAIT;
	4412	}
	4413	/*
	4414	* Try freeing inodedep in case all dependencies have been removed.
	4415	*/
	4416	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
	4417	(void) free_inodedep(inodedep);
	4418	return (0);
	4419	}
	4420
	4421	/*
	4422	* Eliminate a pagedep dependency by flushing out all its diradd dependencies.
	4423	* Called with splbio blocked.
	4424	*/
	4425	static int
	4426	flush_pagedep_deps(pvp, mp, diraddhdp)
	4427	struct vnode *pvp;
	4428	struct mount *mp;
	4429	struct diraddhd *diraddhdp;
	4430	{
	4431	struct thread td = curthread; / XXX */
	4432	struct ucred *cr;
	4433	struct inodedep *inodedep;
	4434	struct ufsmount *ump;
	4435	struct diradd *dap;
	4436	struct vnode *vp;
	4437	int gotit, error = 0;
	4438	struct buf *bp;
	4439	ino_t inum;
	4440
	4441	KKASSERT(td->td_proc);
	4442	cr = td->td_proc->p_ucred;
	4443
	4444	ump = VFSTOUFS(mp);
	4445	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
	4446	/*
	4447	* Flush ourselves if this directory entry
	4448	* has a MKDIR_PARENT dependency.
	4449	*/
	4450	if (dap->da_state & MKDIR_PARENT) {
	4451	FREE_LOCK(&lk);
	4452	if ((error = UFS_UPDATE(pvp, 1)) != 0)
	4453	break;
	4454	ACQUIRE_LOCK(&lk);
	4455	/*
	4456	* If that cleared dependencies, go on to next.
	4457	*/
	4458	if (dap != LIST_FIRST(diraddhdp))
	4459	continue;
	4460	if (dap->da_state & MKDIR_PARENT) {
	4461	FREE_LOCK(&lk);
	4462	panic("flush_pagedep_deps: MKDIR_PARENT");
	4463	}
	4464	}
	4465	/*
	4466	* A newly allocated directory must have its "." and
	4467	* ".." entries written out before its name can be
	4468	* committed in its parent. We do not want or need
	4469	* the full semantics of a synchronous VOP_FSYNC as
	4470	* that may end up here again, once for each directory
	4471	* level in the filesystem. Instead, we push the blocks
	4472	* and wait for them to clear. We have to fsync twice
	4473	* because the first call may choose to defer blocks
	4474	* that still have dependencies, but deferral will
	4475	* happen at most once.
	4476	*/
	4477	inum = dap->da_newinum;
	4478	if (dap->da_state & MKDIR_BODY) {
	4479	FREE_LOCK(&lk);
	4480	if ((error = VFS_VGET(mp, inum, &vp)) != 0)
	4481	break;
	4482	if ((error=VOP_FSYNC(vp, cr, MNT_NOWAIT, td)) \|\|
	4483	(error=VOP_FSYNC(vp, cr, MNT_NOWAIT, td))) {
	4484	vput(vp);
	4485	break;
	4486	}
	4487	drain_output(vp, 0);
	4488	vput(vp);
	4489	ACQUIRE_LOCK(&lk);
	4490	/*
	4491	* If that cleared dependencies, go on to next.
	4492	*/
	4493	if (dap != LIST_FIRST(diraddhdp))
	4494	continue;
	4495	if (dap->da_state & MKDIR_BODY) {
	4496	FREE_LOCK(&lk);
	4497	panic("flush_pagedep_deps: MKDIR_BODY");
	4498	}
	4499	}
	4500	/*
	4501	* Flush the inode on which the directory entry depends.
	4502	* Having accounted for MKDIR_PARENT and MKDIR_BODY above,
	4503	* the only remaining dependency is that the updated inode
	4504	* count must get pushed to disk. The inode has already
	4505	* been pushed into its inode buffer (via VOP_UPDATE) at
	4506	* the time of the reference count change. So we need only
	4507	* locate that buffer, ensure that there will be no rollback
	4508	* caused by a bitmap dependency, then write the inode buffer.
	4509	*/
	4510	if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
	4511	FREE_LOCK(&lk);
	4512	panic("flush_pagedep_deps: lost inode");
	4513	}
	4514	/*
	4515	* If the inode still has bitmap dependencies,
	4516	* push them to disk.
	4517	*/
	4518	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
	4519	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
	4520	FREE_LOCK(&lk);
	4521	if (gotit &&
	4522	(error = VOP_BWRITE(inodedep->id_buf->b_vp,
	4523	inodedep->id_buf)) != 0)
	4524	break;
	4525	ACQUIRE_LOCK(&lk);
	4526	if (dap != LIST_FIRST(diraddhdp))
	4527	continue;
	4528	}
	4529	/*
	4530	* If the inode is still sitting in a buffer waiting
	4531	* to be written, push it to disk.
	4532	*/
	4533	FREE_LOCK(&lk);
	4534	if ((error = bread(ump->um_devvp,
	4535	fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
	4536	(int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0)
	4537	break;
	4538	if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0)
	4539	break;
	4540	ACQUIRE_LOCK(&lk);
	4541	/*
	4542	* If we have failed to get rid of all the dependencies
	4543	* then something is seriously wrong.
	4544	*/
	4545	if (dap == LIST_FIRST(diraddhdp)) {
	4546	FREE_LOCK(&lk);
	4547	panic("flush_pagedep_deps: flush failed");
	4548	}
	4549	}
	4550	if (error)
	4551	ACQUIRE_LOCK(&lk);
	4552	return (error);
	4553	}
	4554
	4555	/*
	4556	* A large burst of file addition or deletion activity can drive the
	4557	* memory load excessively high. First attempt to slow things down
	4558	* using the techniques below. If that fails, this routine requests
	4559	* the offending operations to fall back to running synchronously
	4560	* until the memory load returns to a reasonable level.
	4561	*/
	4562	int
	4563	softdep_slowdown(vp)
	4564	struct vnode *vp;
	4565	{
	4566	int max_softdeps_hard;
	4567
	4568	max_softdeps_hard = max_softdeps * 11 / 10;
	4569	if (num_dirrem < max_softdeps_hard / 2 &&
	4570	num_inodedep < max_softdeps_hard)
	4571	return (0);
	4572	stat_sync_limit_hit += 1;
	4573	return (1);
	4574	}
	4575
	4576	/*
	4577	* If memory utilization has gotten too high, deliberately slow things
	4578	* down and speed up the I/O processing.
	4579	*/
	4580	static int
	4581	request_cleanup(resource, islocked)
	4582	int resource;
	4583	int islocked;
	4584	{
	4585	struct thread td = curthread; / XXX */
	4586
	4587	/*
	4588	* We never hold up the filesystem syncer process.
	4589	*/
	4590	if (td == filesys_syncer)
	4591	return (0);
	4592	/*
	4593	* First check to see if the work list has gotten backlogged.
	4594	* If it has, co-opt this process to help clean up two entries.
	4595	* Because this process may hold inodes locked, we cannot
	4596	* handle any remove requests that might block on a locked
	4597	* inode as that could lead to deadlock.
	4598	*/
	4599	if (num_on_worklist > max_softdeps / 10) {
	4600	if (islocked)
	4601	FREE_LOCK(&lk);
	4602	process_worklist_item(NULL, LK_NOWAIT);
	4603	process_worklist_item(NULL, LK_NOWAIT);
	4604	stat_worklist_push += 2;
	4605	if (islocked)
	4606	ACQUIRE_LOCK(&lk);
	4607	return(1);
	4608	}
	4609
	4610	/*
	4611	* If we are resource constrained on inode dependencies, try
	4612	* flushing some dirty inodes. Otherwise, we are constrained
	4613	* by file deletions, so try accelerating flushes of directories
	4614	* with removal dependencies. We would like to do the cleanup
	4615	* here, but we probably hold an inode locked at this point and
	4616	* that might deadlock against one that we try to clean. So,
	4617	* the best that we can do is request the syncer daemon to do
	4618	* the cleanup for us.
	4619	*/
	4620	switch (resource) {
	4621
	4622	case FLUSH_INODES:
	4623	stat_ino_limit_push += 1;
	4624	req_clear_inodedeps += 1;
	4625	stat_countp = &stat_ino_limit_hit;
	4626	break;
	4627
	4628	case FLUSH_REMOVE:
	4629	stat_blk_limit_push += 1;
	4630	req_clear_remove += 1;
	4631	stat_countp = &stat_blk_limit_hit;
	4632	break;
	4633
	4634	default:
	4635	if (islocked)
	4636	FREE_LOCK(&lk);
	4637	panic("request_cleanup: unknown type");
	4638	}
	4639	/*
	4640	* Hopefully the syncer daemon will catch up and awaken us.
	4641	* We wait at most tickdelay before proceeding in any case.
	4642	*/
	4643	if (islocked == 0)
	4644	ACQUIRE_LOCK(&lk);
	4645	proc_waiting += 1;
	4646	if (handle.callout == NULL)
	4647	handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
	4648	interlocked_sleep(&lk, SLEEP, (caddr_t)&proc_waiting, PPAUSE,
	4649	"softupdate", 0);
	4650	proc_waiting -= 1;
	4651	if (islocked == 0)
	4652	FREE_LOCK(&lk);
	4653	return (1);
	4654	}
	4655
	4656	/*
	4657	* Awaken processes pausing in request_cleanup and clear proc_waiting
	4658	* to indicate that there is no longer a timer running.
	4659	*/
	4660	void
	4661	pause_timer(arg)
	4662	void *arg;
	4663	{
	4664
	4665	*stat_countp += 1;
	4666	wakeup_one(&proc_waiting);
	4667	if (proc_waiting > 0)
	4668	handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
	4669	else
	4670	handle.callout = NULL;
	4671	}
	4672
	4673	/*
	4674	* Flush out a directory with at least one removal dependency in an effort to
	4675	* reduce the number of dirrem, freefile, and freeblks dependency structures.
	4676	*/
	4677	static void
	4678	clear_remove(struct thread *td)
	4679	{
	4680	struct pagedep_hashhead *pagedephd;
	4681	struct pagedep *pagedep;
	4682	static int next = 0;
	4683	struct mount *mp;
	4684	struct vnode *vp;
	4685	int error, cnt;
	4686	ino_t ino;
	4687	struct ucred *cred;
	4688
	4689	KKASSERT(td->td_proc);
	4690	cred = td->td_proc->p_ucred;
	4691
	4692	ACQUIRE_LOCK(&lk);
	4693	for (cnt = 0; cnt < pagedep_hash; cnt++) {
	4694	pagedephd = &pagedep_hashtbl[next++];
	4695	if (next >= pagedep_hash)
	4696	next = 0;
	4697	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
	4698	if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
	4699	continue;
	4700	mp = pagedep->pd_mnt;
	4701	ino = pagedep->pd_ino;
	4702	FREE_LOCK(&lk);
	4703	if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
	4704	softdep_error("clear_remove: vget", error);
	4705	return;
	4706	}
	4707	if ((error = VOP_FSYNC(vp, cred, MNT_NOWAIT, td)))
	4708	softdep_error("clear_remove: fsync", error);
	4709	drain_output(vp, 0);
	4710	vput(vp);
	4711	return;
	4712	}
	4713	}
	4714	FREE_LOCK(&lk);
	4715	}
	4716
	4717	/*
	4718	* Clear out a block of dirty inodes in an effort to reduce
	4719	* the number of inodedep dependency structures.
	4720	*/
	4721	static void
	4722	clear_inodedeps(struct thread *td)
	4723	{
	4724	struct ucred *cred;
	4725	struct inodedep_hashhead *inodedephd;
	4726	struct inodedep *inodedep;
	4727	static int next = 0;
	4728	struct mount *mp;
	4729	struct vnode *vp;
	4730	struct fs *fs;
	4731	int error, cnt;
	4732	ino_t firstino, lastino, ino;
	4733
	4734	KKASSERT(td->td_proc);
	4735	cred = td->td_proc->p_ucred;
	4736
	4737	ACQUIRE_LOCK(&lk);
	4738	/*
	4739	* Pick a random inode dependency to be cleared.
	4740	* We will then gather up all the inodes in its block
	4741	* that have dependencies and flush them out.
	4742	*/
	4743	for (cnt = 0; cnt < inodedep_hash; cnt++) {
	4744	inodedephd = &inodedep_hashtbl[next++];
	4745	if (next >= inodedep_hash)
	4746	next = 0;
	4747	if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
	4748	break;
	4749	}
	4750	if (inodedep == NULL)
	4751	return;
	4752	/*
	4753	* Ugly code to find mount point given pointer to superblock.
	4754	*/
	4755	fs = inodedep->id_fs;
	4756	TAILQ_FOREACH(mp, &mountlist, mnt_list)
	4757	if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
	4758	break;
	4759	/*
	4760	* Find the last inode in the block with dependencies.
	4761	*/
	4762	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
	4763	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
	4764	if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
	4765	break;
	4766	/*
	4767	* Asynchronously push all but the last inode with dependencies.
	4768	* Synchronously push the last inode with dependencies to ensure
	4769	* that the inode block gets written to free up the inodedeps.
	4770	*/
	4771	for (ino = firstino; ino <= lastino; ino++) {
	4772	if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
	4773	continue;
	4774	FREE_LOCK(&lk);
	4775	if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
	4776	softdep_error("clear_inodedeps: vget", error);
	4777	return;
	4778	}
	4779	if (ino == lastino) {
	4780	if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)))
	4781	softdep_error("clear_inodedeps: fsync1", error);
	4782	} else {
	4783	if ((error = VOP_FSYNC(vp, cred, MNT_NOWAIT, td)))
	4784	softdep_error("clear_inodedeps: fsync2", error);
	4785	drain_output(vp, 0);
	4786	}
	4787	vput(vp);
	4788	ACQUIRE_LOCK(&lk);
	4789	}
	4790	FREE_LOCK(&lk);
	4791	}
	4792
	4793	/*
	4794	* Function to determine if the buffer has outstanding dependencies
	4795	* that will cause a roll-back if the buffer is written. If wantcount
	4796	* is set, return number of dependencies, otherwise just yes or no.
	4797	*/
	4798	static int
	4799	softdep_count_dependencies(bp, wantcount)
	4800	struct buf *bp;
	4801	int wantcount;
	4802	{
	4803	struct worklist *wk;
	4804	struct inodedep *inodedep;
	4805	struct indirdep *indirdep;
	4806	struct allocindir *aip;
	4807	struct pagedep *pagedep;
	4808	struct diradd *dap;
	4809	int i, retval;
	4810
	4811	retval = 0;
	4812	ACQUIRE_LOCK(&lk);
	4813	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
	4814	switch (wk->wk_type) {
	4815
	4816	case D_INODEDEP:
	4817	inodedep = WK_INODEDEP(wk);
	4818	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
	4819	/* bitmap allocation dependency */
	4820	retval += 1;
	4821	if (!wantcount)
	4822	goto out;
	4823	}
	4824	if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
	4825	/* direct block pointer dependency */
	4826	retval += 1;
	4827	if (!wantcount)
	4828	goto out;
	4829	}
	4830	continue;
	4831
	4832	case D_INDIRDEP:
	4833	indirdep = WK_INDIRDEP(wk);
	4834
	4835	LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
	4836	/* indirect block pointer dependency */
	4837	retval += 1;
	4838	if (!wantcount)
	4839	goto out;
	4840	}
	4841	continue;
	4842
	4843	case D_PAGEDEP:
	4844	pagedep = WK_PAGEDEP(wk);
	4845	for (i = 0; i < DAHASHSZ; i++) {
	4846
	4847	LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
	4848	/* directory entry dependency */
	4849	retval += 1;
	4850	if (!wantcount)
	4851	goto out;
	4852	}
	4853	}
	4854	continue;
	4855
	4856	case D_BMSAFEMAP:
	4857	case D_ALLOCDIRECT:
	4858	case D_ALLOCINDIR:
	4859	case D_MKDIR:
	4860	/* never a dependency on these blocks */
	4861	continue;
	4862
	4863	default:
	4864	FREE_LOCK(&lk);
	4865	panic("softdep_check_for_rollback: Unexpected type %s",
	4866	TYPENAME(wk->wk_type));
	4867	/* NOTREACHED */
	4868	}
	4869	}
	4870	out:
	4871	FREE_LOCK(&lk);
	4872	return retval;
	4873	}
	4874
	4875	/*
	4876	* Acquire exclusive access to a buffer.
	4877	* Must be called with splbio blocked.
	4878	* Return 1 if buffer was acquired.
	4879	*/
	4880	static int
	4881	getdirtybuf(bpp, waitfor)
	4882	struct buf **bpp;
	4883	int waitfor;
	4884	{
	4885	struct buf *bp;
	4886	int error;
	4887
	4888	for (;;) {
	4889	if ((bp = *bpp) == NULL)
	4890	return (0);
	4891	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT) == 0) {
	4892	if ((bp->b_xflags & BX_BKGRDINPROG) == 0)
	4893	break;
	4894	BUF_UNLOCK(bp);
	4895	if (waitfor != MNT_WAIT)
	4896	return (0);
	4897	bp->b_xflags \|= BX_BKGRDWAIT;
	4898	interlocked_sleep(&lk, SLEEP, &bp->b_xflags, PRIBIO,
	4899	"getbuf", 0);
	4900	continue;
	4901	}
	4902	if (waitfor != MNT_WAIT)
	4903	return (0);
	4904	error = interlocked_sleep(&lk, LOCKBUF, bp,
	4905	LK_EXCLUSIVE \| LK_SLEEPFAIL, 0, 0);
	4906	if (error != ENOLCK) {
	4907	FREE_LOCK(&lk);
	4908	panic("getdirtybuf: inconsistent lock");
	4909	}
	4910	}
	4911	if ((bp->b_flags & B_DELWRI) == 0) {
	4912	BUF_UNLOCK(bp);
	4913	return (0);
	4914	}
	4915	bremfree(bp);
	4916	return (1);
	4917	}
	4918
	4919	/*
	4920	* Wait for pending output on a vnode to complete.
	4921	* Must be called with vnode locked.
	4922	*/
	4923	static void
	4924	drain_output(vp, islocked)
	4925	struct vnode *vp;
	4926	int islocked;
	4927	{
	4928
	4929	if (!islocked)
	4930	ACQUIRE_LOCK(&lk);
	4931	while (vp->v_numoutput) {
	4932	vp->v_flag \|= VBWAIT;
	4933	interlocked_sleep(&lk, SLEEP, (caddr_t)&vp->v_numoutput,
	4934	PRIBIO + 1, "drainvp", 0);
	4935	}
	4936	if (!islocked)
	4937	FREE_LOCK(&lk);
	4938	}
	4939
	4940	/*
	4941	* Called whenever a buffer that is being invalidated or reallocated
	4942	* contains dependencies. This should only happen if an I/O error has
	4943	* occurred. The routine is called with the buffer locked.
	4944	*/
	4945	static void
	4946	softdep_deallocate_dependencies(bp)
	4947	struct buf *bp;
	4948	{
	4949
	4950	if ((bp->b_flags & B_ERROR) == 0)
	4951	panic("softdep_deallocate_dependencies: dangling deps");
	4952	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
	4953	panic("softdep_deallocate_dependencies: unrecovered I/O error");
	4954	}
	4955
	/*
	 * Report an asynchronous write error in the filesystem.
	 *
	 * func is the text printed ahead of the message (callers pass either
	 * an operation label or a mount point name); error is the error code.
	 */
	void
	softdep_error(char *func, int error)
	{
		/* XXX printing is all that happens; should do something better! */
		printf("%s: got error %d while accessing filesystem\n", func, error);
	}