gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
	3	*
	4	* The soft updates code is derived from the appendix of a University
	5	* of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
	6	* "Soft Updates: A Solution to the Metadata Update Problem in File
	7	* Systems", CSE-TR-254-95, August 1995).
	8	*
	9	* Further information about soft updates can be obtained from:
	10	*
	11	* Marshall Kirk McKusick http://www.mckusick.com/softdep/
	12	* 1614 Oxford Street mckusick@mckusick.com
	13	* Berkeley, CA 94709-1608 +1-510-843-9542
	14	* USA
	15	*
	16	* Redistribution and use in source and binary forms, with or without
	17	* modification, are permitted provided that the following conditions
	18	* are met:
	19	*
	20	* 1. Redistributions of source code must retain the above copyright
	21	* notice, this list of conditions and the following disclaimer.
	22	* 2. Redistributions in binary form must reproduce the above copyright
	23	* notice, this list of conditions and the following disclaimer in the
	24	* documentation and/or other materials provided with the distribution.
	25	*
	26	* THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
	27	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	28	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	29	* DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
	30	* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	31	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	32	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	33	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	34	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	35	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	36	* SUCH DAMAGE.
	37	*
	38	* from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
	39	* $FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.57.2.11 2002/02/05 18:46:53 dillon Exp $
	40	* $DragonFly: src/sys/vfs/ufs/ffs_softdep.c,v 1.42 2006/05/06 02:43:14 dillon Exp $
	41	*/
	42
	43	/*
	44	* For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
	45	*/
	46	#ifndef DIAGNOSTIC
	47	#define DIAGNOSTIC
	48	#endif
	49	#ifndef DEBUG
	50	#define DEBUG
	51	#endif
	52
	53	#include <sys/param.h>
	54	#include <sys/kernel.h>
	55	#include <sys/systm.h>
	56	#include <sys/buf.h>
	57	#include <sys/malloc.h>
	58	#include <sys/mount.h>
	59	#include <sys/proc.h>
	60	#include <sys/syslog.h>
	61	#include <sys/vnode.h>
	62	#include <sys/conf.h>
	63	#include <sys/buf2.h>
	64	#include <machine/inttypes.h>
	65	#include "dir.h"
	66	#include "quota.h"
	67	#include "inode.h"
	68	#include "ufsmount.h"
	69	#include "fs.h"
	70	#include "softdep.h"
	71	#include "ffs_extern.h"
	72	#include "ufs_extern.h"
	73
	74	#include <sys/thread2.h>
	75
	76	/*
	77	* These definitions need to be adapted to the system to which
	78	* this file is being ported.
	79	*/
	80	/*
	81	* malloc types defined for the softdep system.
	82	*/
	83	MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
	84	MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
	85	MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
	86	MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
	87	MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
	88	MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
	89	MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
	90	MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
	91	MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
	92	MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
	93	MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
	94	MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
	95	MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
	96
	97	#define M_SOFTDEP_FLAGS (M_WAITOK \| M_USE_RESERVE)
	98
	99	#define D_PAGEDEP 0
	100	#define D_INODEDEP 1
	101	#define D_NEWBLK 2
	102	#define D_BMSAFEMAP 3
	103	#define D_ALLOCDIRECT 4
	104	#define D_INDIRDEP 5
	105	#define D_ALLOCINDIR 6
	106	#define D_FREEFRAG 7
	107	#define D_FREEBLKS 8
	108	#define D_FREEFILE 9
	109	#define D_DIRADD 10
	110	#define D_MKDIR 11
	111	#define D_DIRREM 12
	112	#define D_LAST D_DIRREM
	113
	114	/*
	115	* translate from workitem type to memory type
	116	* MUST match the defines above, such that memtype[D_XXX] == M_XXX
	117	*/
	118	static struct malloc_type *memtype[] = {
	119	M_PAGEDEP,
	120	M_INODEDEP,
	121	M_NEWBLK,
	122	M_BMSAFEMAP,
	123	M_ALLOCDIRECT,
	124	M_INDIRDEP,
	125	M_ALLOCINDIR,
	126	M_FREEFRAG,
	127	M_FREEBLKS,
	128	M_FREEFILE,
	129	M_DIRADD,
	130	M_MKDIR,
	131	M_DIRREM
	132	};
	133
	134	#define DtoM(type) (memtype[type])
	135
	136	/*
	137	* Names of malloc types.
	138	*/
	139	#define TYPENAME(type) \
	140	((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???")
	141	/*
	142	* End system adaptation definitions.
	143	*/
	144
	145	/*
	146	* Internal function prototypes.
	147	*/
	148	static void softdep_error(char *, int);
	149	static void drain_output(struct vnode *, int);
	150	static int getdirtybuf(struct buf **, int);
	151	static void clear_remove(struct thread *);
	152	static void clear_inodedeps(struct thread *);
	153	static int flush_pagedep_deps(struct vnode , struct mount ,
	154	struct diraddhd *);
	155	static int flush_inodedep_deps(struct fs *, ino_t);
	156	static int handle_written_filepage(struct pagedep , struct buf );
	157	static void diradd_inode_written(struct diradd , struct inodedep );
	158	static int handle_written_inodeblock(struct inodedep , struct buf );
	159	static void handle_allocdirect_partdone(struct allocdirect *);
	160	static void handle_allocindir_partdone(struct allocindir *);
	161	static void initiate_write_filepage(struct pagedep , struct buf );
	162	static void handle_written_mkdir(struct mkdir *, int);
	163	static void initiate_write_inodeblock(struct inodedep , struct buf );
	164	static void handle_workitem_freefile(struct freefile *);
	165	static void handle_workitem_remove(struct dirrem *);
	166	static struct dirrem newdirrem(struct buf , struct inode *,
	167	struct inode , int, struct dirrem *);
	168	static void free_diradd(struct diradd *);
	169	static void free_allocindir(struct allocindir , struct inodedep );
	170	static int indir_trunc (struct inode , off_t, int, ufs_lbn_t, long );
	171	static void deallocate_dependencies(struct buf , struct inodedep );
	172	static void free_allocdirect(struct allocdirectlst *,
	173	struct allocdirect *, int);
	174	static int check_inode_unwritten(struct inodedep *);
	175	static int free_inodedep(struct inodedep *);
	176	static void handle_workitem_freeblocks(struct freeblks *);
	177	static void merge_inode_lists(struct inodedep *);
	178	static void setup_allocindir_phase2(struct buf , struct inode ,
	179	struct allocindir *);
	180	static struct allocindir newallocindir(struct inode , int, ufs_daddr_t,
	181	ufs_daddr_t);
	182	static void handle_workitem_freefrag(struct freefrag *);
	183	static struct freefrag newfreefrag(struct inode , ufs_daddr_t, long);
	184	static void allocdirect_merge(struct allocdirectlst *,
	185	struct allocdirect , struct allocdirect );
	186	static struct bmsafemap bmsafemap_lookup(struct buf );
	187	static int newblk_lookup(struct fs *, ufs_daddr_t, int,
	188	struct newblk **);
	189	static int inodedep_lookup(struct fs , ino_t, int, struct inodedep *);
	190	static int pagedep_lookup(struct inode *, ufs_lbn_t, int,
	191	struct pagedep **);
	192	static void pause_timer(void *);
	193	static int request_cleanup(int, int);
	194	static int process_worklist_item(struct mount *, int);
	195	static void add_to_worklist(struct worklist *);
	196
	197	/*
	198	* Exported softdep operations.
	199	*/
	200	static void softdep_disk_io_initiation(struct buf *);
	201	static void softdep_disk_write_complete(struct buf *);
	202	static void softdep_deallocate_dependencies(struct buf *);
	203	static int softdep_fsync(struct vnode *);
	204	static int softdep_process_worklist(struct mount *);
	205	static void softdep_move_dependencies(struct buf , struct buf );
	206	static int softdep_count_dependencies(struct buf *bp, int);
	207
	208	static struct bio_ops softdep_bioops = {
	209	softdep_disk_io_initiation, /* io_start */
	210	softdep_disk_write_complete, /* io_complete */
	211	softdep_deallocate_dependencies, /* io_deallocate */
	212	softdep_fsync, /* io_fsync */
	213	softdep_process_worklist, /* io_sync */
	214	softdep_move_dependencies, /* io_movedeps */
	215	softdep_count_dependencies, /* io_countdeps */
	216	};
	217
	218	/*
	219	* Locking primitives.
	220	*
	221	* For a uniprocessor, all we need to do is protect against disk
	222	* interrupts. For a multiprocessor, this lock would have to be
	223	* a mutex. A single mutex is used throughout this file, though
	224	* finer grain locking could be used if contention warranted it.
	225	*
	226	* For a multiprocessor, the sleep call would accept a lock and
	227	* release it after the sleep processing was complete. In a uniprocessor
	228	* implementation there is no such interlock, so we simply mark
	229	* the places where it needs to be done with the `interlocked' form
	230	* of the lock calls. Since the uniprocessor sleep already interlocks
	231	* the spl, there is nothing that really needs to be done.
	232	*/
	233	#ifndef /* NOT */ DEBUG
	234	static struct lockit {
	235	} lk = { 0 };
	236	#define ACQUIRE_LOCK(lk) crit_enter_id("softupdates");
	237	#define FREE_LOCK(lk) crit_exit_id("softupdates");
	238
	239	#else /* DEBUG */
	240	#define NOHOLDER ((struct thread *)-1)
	241	#define SPECIAL_FLAG ((struct thread *)-2)
	242	static struct lockit {
	243	int lkt_spl;
	244	struct thread *lkt_held;
	245	} lk = { 0, NOHOLDER };
	246	static int lockcnt;
	247
	248	static void acquire_lock(struct lockit *);
	249	static void free_lock(struct lockit *);
	250	void softdep_panic(char *);
	251
	252	#define ACQUIRE_LOCK(lk) acquire_lock(lk)
	253	#define FREE_LOCK(lk) free_lock(lk)
	254
	255	static void
	256	acquire_lock(lk)
	257	struct lockit *lk;
	258	{
	259	thread_t holder;
	260
	261	if (lk->lkt_held != NOHOLDER) {
	262	holder = lk->lkt_held;
	263	FREE_LOCK(lk);
	264	if (holder == curthread)
	265	panic("softdep_lock: locking against myself");
	266	else
	267	panic("softdep_lock: lock held by %p", holder);
	268	}
	269	crit_enter_id("softupdates");
	270	lk->lkt_held = curthread;
	271	lockcnt++;
	272	}
	273
	274	static void
	275	free_lock(lk)
	276	struct lockit *lk;
	277	{
	278
	279	if (lk->lkt_held == NOHOLDER)
	280	panic("softdep_unlock: lock not held");
	281	lk->lkt_held = NOHOLDER;
	282	crit_exit_id("softupdates");
	283	}
	284
	285	/*
	286	* Function to release soft updates lock and panic.
	287	*/
	288	void
	289	softdep_panic(msg)
	290	char *msg;
	291	{
	292
	293	if (lk.lkt_held != NOHOLDER)
	294	FREE_LOCK(&lk);
	295	panic(msg);
	296	}
	297	#endif /* DEBUG */
	298
	299	static int interlocked_sleep(struct lockit , int, void , int,
	300	const char *, int);
	301
	302	/*
	303	* When going to sleep, we must save our SPL so that it does
	304	* not get lost if some other process uses the lock while we
	305	* are sleeping. We restore it after we have slept. This routine
	306	* wraps the interlocking with functions that sleep. The list
	307	* below enumerates the available set of operations.
	308	*/
	309	#define UNKNOWN 0
	310	#define SLEEP 1
	311	#define LOCKBUF 2
	312
	313	static int
	314	interlocked_sleep(lk, op, ident, flags, wmesg, timo)
	315	struct lockit *lk;
	316	int op;
	317	void *ident;
	318	int flags;
	319	const char *wmesg;
	320	int timo;
	321	{
	322	thread_t holder;
	323	int s, retval;
	324
	325	s = lk->lkt_spl;
	326	# ifdef DEBUG
	327	if (lk->lkt_held == NOHOLDER)
	328	panic("interlocked_sleep: lock not held");
	329	lk->lkt_held = NOHOLDER;
	330	# endif /* DEBUG */
	331	switch (op) {
	332	case SLEEP:
	333	retval = tsleep(ident, flags, wmesg, timo);
	334	break;
	335	case LOCKBUF:
	336	retval = BUF_LOCK((struct buf *)ident, flags);
	337	break;
	338	default:
	339	panic("interlocked_sleep: unknown operation");
	340	}
	341	# ifdef DEBUG
	342	if (lk->lkt_held != NOHOLDER) {
	343	holder = lk->lkt_held;
	344	FREE_LOCK(lk);
	345	if (holder == curthread)
	346	panic("interlocked_sleep: locking against self");
	347	else
	348	panic("interlocked_sleep: lock held by %p", holder);
	349	}
	350	lk->lkt_held = curthread;
	351	lockcnt++;
	352	# endif /* DEBUG */
	353	lk->lkt_spl = s;
	354	return (retval);
	355	}
	356
	357	/*
	358	* Place holder for real semaphores.
	359	*/
	360	struct sema {
	361	int value;
	362	thread_t holder;
	363	char *name;
	364	int prio;
	365	int timo;
	366	};
	367	static void sema_init(struct sema , char , int, int);
	368	static int sema_get(struct sema , struct lockit );
	369	static void sema_release(struct sema *);
	370
	371	static void
	372	sema_init(semap, name, prio, timo)
	373	struct sema *semap;
	374	char *name;
	375	int prio, timo;
	376	{
	377
	378	semap->holder = NOHOLDER;
	379	semap->value = 0;
	380	semap->name = name;
	381	semap->prio = prio;
	382	semap->timo = timo;
	383	}
	384
	385	static int
	386	sema_get(semap, interlock)
	387	struct sema *semap;
	388	struct lockit *interlock;
	389	{
	390
	391	if (semap->value++ > 0) {
	392	if (interlock != NULL) {
	393	interlocked_sleep(interlock, SLEEP, (caddr_t)semap,
	394	semap->prio, semap->name, semap->timo);
	395	FREE_LOCK(interlock);
	396	} else {
	397	tsleep((caddr_t)semap, semap->prio, semap->name,
	398	semap->timo);
	399	}
	400	return (0);
	401	}
	402	semap->holder = curthread;
	403	if (interlock != NULL)
	404	FREE_LOCK(interlock);
	405	return (1);
	406	}
	407
	408	static void
	409	sema_release(semap)
	410	struct sema *semap;
	411	{
	412
	413	if (semap->value <= 0 \|\| semap->holder != curthread) {
	414	if (lk.lkt_held != NOHOLDER)
	415	FREE_LOCK(&lk);
	416	panic("sema_release: not held");
	417	}
	418	if (--semap->value > 0) {
	419	semap->value = 0;
	420	wakeup(semap);
	421	}
	422	semap->holder = NOHOLDER;
	423	}
	424
	425	/*
	426	* Worklist queue management.
	427	* These routines require that the lock be held.
	428	*/
	429	#ifndef /* NOT */ DEBUG
	430	#define WORKLIST_INSERT(head, item) do { \
	431	(item)->wk_state \|= ONWORKLIST; \
	432	LIST_INSERT_HEAD(head, item, wk_list); \
	433	} while (0)
	434	#define WORKLIST_REMOVE(item) do { \
	435	(item)->wk_state &= ~ONWORKLIST; \
	436	LIST_REMOVE(item, wk_list); \
	437	} while (0)
	438	#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))
	439
	440	#else /* DEBUG */
	441	static void worklist_insert(struct workhead , struct worklist );
	442	static void worklist_remove(struct worklist *);
	443	static void workitem_free(struct worklist *, int);
	444
	445	#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
	446	#define WORKLIST_REMOVE(item) worklist_remove(item)
	447	#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
	448
	449	static void
	450	worklist_insert(head, item)
	451	struct workhead *head;
	452	struct worklist *item;
	453	{
	454
	455	if (lk.lkt_held == NOHOLDER)
	456	panic("worklist_insert: lock not held");
	457	if (item->wk_state & ONWORKLIST) {
	458	FREE_LOCK(&lk);
	459	panic("worklist_insert: already on list");
	460	}
	461	item->wk_state \|= ONWORKLIST;
	462	LIST_INSERT_HEAD(head, item, wk_list);
	463	}
	464
	465	static void
	466	worklist_remove(item)
	467	struct worklist *item;
	468	{
	469
	470	if (lk.lkt_held == NOHOLDER)
	471	panic("worklist_remove: lock not held");
	472	if ((item->wk_state & ONWORKLIST) == 0) {
	473	FREE_LOCK(&lk);
	474	panic("worklist_remove: not on list");
	475	}
	476	item->wk_state &= ~ONWORKLIST;
	477	LIST_REMOVE(item, wk_list);
	478	}
	479
	480	static void
	481	workitem_free(item, type)
	482	struct worklist *item;
	483	int type;
	484	{
	485
	486	if (item->wk_state & ONWORKLIST) {
	487	if (lk.lkt_held != NOHOLDER)
	488	FREE_LOCK(&lk);
	489	panic("workitem_free: still on list");
	490	}
	491	if (item->wk_type != type) {
	492	if (lk.lkt_held != NOHOLDER)
	493	FREE_LOCK(&lk);
	494	panic("workitem_free: type mismatch");
	495	}
	496	FREE(item, DtoM(type));
	497	}
	498	#endif /* DEBUG */
	499
	500	/*
	501	* Workitem queue management
	502	*/
	503	static struct workhead softdep_workitem_pending;
	504	static int num_on_worklist; /* number of worklist items to be processed */
	505	static int softdep_worklist_busy; /* 1 => trying to do unmount */
	506	static int softdep_worklist_req; /* serialized waiters */
	507	static int max_softdeps; /* maximum number of structs before slowdown */
	508	static int tickdelay = 2; /* number of ticks to pause during slowdown */
	509	static int stat_countp; / statistic to count in proc_waiting timeout */
	510	static int proc_waiting; /* tracks whether we have a timeout posted */
	511	static struct callout handle; /* handle on posted proc_waiting timeout */
	512	static struct thread filesys_syncer; / proc of filesystem syncer process */
	513	static int req_clear_inodedeps; /* syncer process flush some inodedeps */
	514	#define FLUSH_INODES 1
	515	static int req_clear_remove; /* syncer process flush some freeblks */
	516	#define FLUSH_REMOVE 2
	517	/*
	518	* runtime statistics
	519	*/
	520	static int stat_worklist_push; /* number of worklist cleanups */
	521	static int stat_blk_limit_push; /* number of times block limit neared */
	522	static int stat_ino_limit_push; /* number of times inode limit neared */
	523	static int stat_blk_limit_hit; /* number of times block slowdown imposed */
	524	static int stat_ino_limit_hit; /* number of times inode slowdown imposed */
	525	static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
	526	static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
	527	static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
	528	static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
	529	static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
	530	#ifdef DEBUG
	531	#include <vm/vm.h>
	532	#include <sys/sysctl.h>
	533	SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
	534	SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
	535	SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
	536	SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
	537	SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
	538	SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
	539	SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
	540	SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
	541	SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
	542	SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
	543	SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
	544	SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
	545	#endif /* DEBUG */
	546
	547	/*
	548	* Add an item to the end of the work queue.
	549	* This routine requires that the lock be held.
	550	* This is the only routine that adds items to the list.
	551	* The following routine is the only one that removes items
	552	* and does so in order from first to last.
	553	*/
	554	static void
	555	add_to_worklist(wk)
	556	struct worklist *wk;
	557	{
	558	static struct worklist *worklist_tail;
	559
	560	if (wk->wk_state & ONWORKLIST) {
	561	if (lk.lkt_held != NOHOLDER)
	562	FREE_LOCK(&lk);
	563	panic("add_to_worklist: already on list");
	564	}
	565	wk->wk_state \|= ONWORKLIST;
	566	if (LIST_FIRST(&softdep_workitem_pending) == NULL)
	567	LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
	568	else
	569	LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
	570	worklist_tail = wk;
	571	num_on_worklist += 1;
	572	}
	573
	574	/*
	575	* Process that runs once per second to handle items in the background queue.
	576	*
	577	* Note that we ensure that everything is done in the order in which they
	578	* appear in the queue. The code below depends on this property to ensure
	579	* that blocks of a file are freed before the inode itself is freed. This
	580	* ordering ensures that no new <vfsid, inum, lbn> triples will be generated
	581	* until all the old ones have been purged from the dependency lists.
	582	*/
	583	static int
	584	softdep_process_worklist(matchmnt)
	585	struct mount *matchmnt;
	586	{
	587	thread_t td = curthread;
	588	int matchcnt, loopcount;
	589	long starttime;
	590
	591	/*
	592	* Record the process identifier of our caller so that we can give
	593	* this process preferential treatment in request_cleanup below.
	594	*/
	595	filesys_syncer = td;
	596	matchcnt = 0;
	597
	598	/*
	599	* There is no danger of having multiple processes run this
	600	* code, but we have to single-thread it when softdep_flushfiles()
	601	* is in operation to get an accurate count of the number of items
	602	* related to its mount point that are in the list.
	603	*/
	604	if (matchmnt == NULL) {
	605	if (softdep_worklist_busy < 0)
	606	return(-1);
	607	softdep_worklist_busy += 1;
	608	}
	609
	610	/*
	611	* If requested, try removing inode or removal dependencies.
	612	*/
	613	if (req_clear_inodedeps) {
	614	clear_inodedeps(td);
	615	req_clear_inodedeps -= 1;
	616	wakeup_one(&proc_waiting);
	617	}
	618	if (req_clear_remove) {
	619	clear_remove(td);
	620	req_clear_remove -= 1;
	621	wakeup_one(&proc_waiting);
	622	}
	623	loopcount = 1;
	624	starttime = time_second;
	625	while (num_on_worklist > 0) {
	626	matchcnt += process_worklist_item(matchmnt, 0);
	627
	628	/*
	629	* If a umount operation wants to run the worklist
	630	* accurately, abort.
	631	*/
	632	if (softdep_worklist_req && matchmnt == NULL) {
	633	matchcnt = -1;
	634	break;
	635	}
	636
	637	/*
	638	* If requested, try removing inode or removal dependencies.
	639	*/
	640	if (req_clear_inodedeps) {
	641	clear_inodedeps(td);
	642	req_clear_inodedeps -= 1;
	643	wakeup_one(&proc_waiting);
	644	}
	645	if (req_clear_remove) {
	646	clear_remove(td);
	647	req_clear_remove -= 1;
	648	wakeup_one(&proc_waiting);
	649	}
	650	/*
	651	* We do not generally want to stop for buffer space, but if
	652	* we are really being a buffer hog, we will stop and wait.
	653	*/
	654	if (loopcount++ % 128 == 0)
	655	bwillwrite();
	656	/*
	657	* Never allow processing to run for more than one
	658	* second. Otherwise the other syncer tasks may get
	659	* excessively backlogged.
	660	*/
	661	if (starttime != time_second && matchmnt == NULL) {
	662	matchcnt = -1;
	663	break;
	664	}
	665	}
	666	if (matchmnt == NULL) {
	667	--softdep_worklist_busy;
	668	if (softdep_worklist_req && softdep_worklist_busy == 0)
	669	wakeup(&softdep_worklist_req);
	670	}
	671	return (matchcnt);
	672	}
	673
	674	/*
	675	* Process one item on the worklist.
	676	*/
	677	static int
	678	process_worklist_item(matchmnt, flags)
	679	struct mount *matchmnt;
	680	int flags;
	681	{
	682	struct worklist *wk;
	683	struct dirrem *dirrem;
	684	struct fs *matchfs;
	685	struct vnode *vp;
	686	int matchcnt = 0;
	687
	688	matchfs = NULL;
	689	if (matchmnt != NULL)
	690	matchfs = VFSTOUFS(matchmnt)->um_fs;
	691	ACQUIRE_LOCK(&lk);
	692	/*
	693	* Normally we just process each item on the worklist in order.
	694	* However, if we are in a situation where we cannot lock any
	695	* inodes, we have to skip over any dirrem requests whose
	696	* vnodes are resident and locked.
	697	*/
	698	LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
	699	if ((flags & LK_NOWAIT) == 0 \|\| wk->wk_type != D_DIRREM)
	700	break;
	701	dirrem = WK_DIRREM(wk);
	702	vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev,
	703	dirrem->dm_oldinum);
	704	if (vp == NULL \|\| !VOP_ISLOCKED(vp, curthread))
	705	break;
	706	}
	707	if (wk == 0) {
	708	FREE_LOCK(&lk);
	709	return (0);
	710	}
	711	WORKLIST_REMOVE(wk);
	712	num_on_worklist -= 1;
	713	FREE_LOCK(&lk);
	714	switch (wk->wk_type) {
	715
	716	case D_DIRREM:
	717	/* removal of a directory entry */
	718	if (WK_DIRREM(wk)->dm_mnt == matchmnt)
	719	matchcnt += 1;
	720	handle_workitem_remove(WK_DIRREM(wk));
	721	break;
	722
	723	case D_FREEBLKS:
	724	/* releasing blocks and/or fragments from a file */
	725	if (WK_FREEBLKS(wk)->fb_fs == matchfs)
	726	matchcnt += 1;
	727	handle_workitem_freeblocks(WK_FREEBLKS(wk));
	728	break;
	729
	730	case D_FREEFRAG:
	731	/* releasing a fragment when replaced as a file grows */
	732	if (WK_FREEFRAG(wk)->ff_fs == matchfs)
	733	matchcnt += 1;
	734	handle_workitem_freefrag(WK_FREEFRAG(wk));
	735	break;
	736
	737	case D_FREEFILE:
	738	/* releasing an inode when its link count drops to 0 */
	739	if (WK_FREEFILE(wk)->fx_fs == matchfs)
	740	matchcnt += 1;
	741	handle_workitem_freefile(WK_FREEFILE(wk));
	742	break;
	743
	744	default:
	745	panic("%s_process_worklist: Unknown type %s",
	746	"softdep", TYPENAME(wk->wk_type));
	747	/* NOTREACHED */
	748	}
	749	return (matchcnt);
	750	}
	751
	752	/*
	753	* Move dependencies from one buffer to another.
	754	*/
	755	static void
	756	softdep_move_dependencies(oldbp, newbp)
	757	struct buf *oldbp;
	758	struct buf *newbp;
	759	{
	760	struct worklist wk, wktail;
	761
	762	if (LIST_FIRST(&newbp->b_dep) != NULL)
	763	panic("softdep_move_dependencies: need merge code");
	764	wktail = 0;
	765	ACQUIRE_LOCK(&lk);
	766	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
	767	LIST_REMOVE(wk, wk_list);
	768	if (wktail == 0)
	769	LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
	770	else
	771	LIST_INSERT_AFTER(wktail, wk, wk_list);
	772	wktail = wk;
	773	}
	774	FREE_LOCK(&lk);
	775	}
	776
	777	/*
	778	* Purge the work list of all items associated with a particular mount point.
	779	*/
	780	int
	781	softdep_flushfiles(struct mount oldmnt, int flags, struct thread td)
	782	{
	783	struct vnode *devvp;
	784	int error, loopcnt;
	785
	786	/*
	787	* Await our turn to clear out the queue, then serialize access.
	788	*/
	789	while (softdep_worklist_busy != 0) {
	790	softdep_worklist_req += 1;
	791	tsleep(&softdep_worklist_req, 0, "softflush", 0);
	792	softdep_worklist_req -= 1;
	793	}
	794	softdep_worklist_busy = -1;
	795
	796	if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0) {
	797	softdep_worklist_busy = 0;
	798	if (softdep_worklist_req)
	799	wakeup(&softdep_worklist_req);
	800	return (error);
	801	}
	802	/*
	803	* Alternately flush the block device associated with the mount
	804	* point and process any dependencies that the flushing
	805	* creates. In theory, this loop can happen at most twice,
	806	* but we give it a few extra just to be sure.
	807	*/
	808	devvp = VFSTOUFS(oldmnt)->um_devvp;
	809	for (loopcnt = 10; loopcnt > 0; ) {
	810	if (softdep_process_worklist(oldmnt) == 0) {
	811	loopcnt--;
	812	/*
	813	* Do another flush in case any vnodes were brought in
	814	* as part of the cleanup operations.
	815	*/
	816	if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
	817	break;
	818	/*
	819	* If we still found nothing to do, we are really done.
	820	*/
	821	if (softdep_process_worklist(oldmnt) == 0)
	822	break;
	823	}
	824	vn_lock(devvp, LK_EXCLUSIVE \| LK_RETRY);
	825	error = VOP_FSYNC(devvp, MNT_WAIT);
	826	VOP_UNLOCK(devvp, 0);
	827	if (error)
	828	break;
	829	}
	830	softdep_worklist_busy = 0;
	831	if (softdep_worklist_req)
	832	wakeup(&softdep_worklist_req);
	833
	834	/*
	835	* If we are unmounting then it is an error to fail. If we
	836	* are simply trying to downgrade to read-only, then filesystem
	837	* activity can keep us busy forever, so we just fail with EBUSY.
	838	*/
	839	if (loopcnt == 0) {
	840	if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
	841	panic("softdep_flushfiles: looping");
	842	error = EBUSY;
	843	}
	844	return (error);
	845	}
	846
	847	/*
	848	* Structure hashing.
	849	*
	850	* There are three types of structures that can be looked up:
	851	* 1) pagedep structures identified by mount point, inode number,
	852	* and logical block.
	853	* 2) inodedep structures identified by mount point and inode number.
	854	* 3) newblk structures identified by mount point and
	855	* physical block number.
	856	*
	857	* The "pagedep" and "inodedep" dependency structures are hashed
	858	* separately from the file blocks and inodes to which they correspond.
	859	* This separation helps when the in-memory copy of an inode or
	860	* file block must be replaced. It also obviates the need to access
	861	* an inode or file page when simply updating (or de-allocating)
	862	* dependency structures. Lookup of newblk structures is needed to
	863	* find newly allocated blocks when trying to associate them with
	864	* their allocdirect or allocindir structure.
	865	*
	866	* The lookup routines optionally create and hash a new instance when
	867	* an existing entry is not found.
	868	*/
	869	#define DEPALLOC 0x0001 /* allocate structure if lookup fails */
	870	#define NODELAY 0x0002 /* cannot do background work */
	871
	872	/*
	873	* Structures and routines associated with pagedep caching.
	874	*/
	875	LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
	876	u_long pagedep_hash; /* size of hash table - 1 */
	877	#define PAGEDEP_HASH(mp, inum, lbn) \
	878	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
	879	pagedep_hash])
	880	static struct sema pagedep_in_progress;
	881
	882	/*
	883	* Look up a pagedep. Return 1 if found, 0 if not found.
	884	* If not found, allocate if DEPALLOC flag is passed.
	885	* Found or allocated entry is returned in pagedeppp.
	886	* This routine must be called with splbio interrupts blocked.
	887	*/
	888	static int
	889	pagedep_lookup(ip, lbn, flags, pagedeppp)
	890	struct inode *ip;
	891	ufs_lbn_t lbn;
	892	int flags;
	893	struct pagedep **pagedeppp;
	894	{
	895	struct pagedep *pagedep;
	896	struct pagedep_hashhead *pagedephd;
	897	struct mount *mp;
	898	int i;
	899
	900	#ifdef DEBUG
	901	if (lk.lkt_held == NOHOLDER)
	902	panic("pagedep_lookup: lock not held");
	903	#endif
	904	mp = ITOV(ip)->v_mount;
	905	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
	906	top:
	907	LIST_FOREACH(pagedep, pagedephd, pd_hash)
	908	if (ip->i_number == pagedep->pd_ino &&
	909	lbn == pagedep->pd_lbn &&
	910	mp == pagedep->pd_mnt)
	911	break;
	912	if (pagedep) {
	913	*pagedeppp = pagedep;
	914	return (1);
	915	}
	916	if ((flags & DEPALLOC) == 0) {
	917	*pagedeppp = NULL;
	918	return (0);
	919	}
	920	if (sema_get(&pagedep_in_progress, &lk) == 0) {
	921	ACQUIRE_LOCK(&lk);
	922	goto top;
	923	}
	924	MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
	925	M_SOFTDEP_FLAGS);
	926	bzero(pagedep, sizeof(struct pagedep));
	927	pagedep->pd_list.wk_type = D_PAGEDEP;
	928	pagedep->pd_mnt = mp;
	929	pagedep->pd_ino = ip->i_number;
	930	pagedep->pd_lbn = lbn;
	931	LIST_INIT(&pagedep->pd_dirremhd);
	932	LIST_INIT(&pagedep->pd_pendinghd);
	933	for (i = 0; i < DAHASHSZ; i++)
	934	LIST_INIT(&pagedep->pd_diraddhd[i]);
	935	ACQUIRE_LOCK(&lk);
	936	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
	937	sema_release(&pagedep_in_progress);
	938	*pagedeppp = pagedep;
	939	return (0);
	940	}
	941
	942	/*
	943	* Structures and routines associated with inodedep caching.
	944	*/
	945	LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
	946	static u_long inodedep_hash; /* size of hash table - 1 */
	947	static long num_inodedep; /* number of inodedep allocated */
	948	#define INODEDEP_HASH(fs, inum) \
	949	(&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
	950	static struct sema inodedep_in_progress;
	951
	952	/*
	953	* Look up a inodedep. Return 1 if found, 0 if not found.
	954	* If not found, allocate if DEPALLOC flag is passed.
	955	* Found or allocated entry is returned in inodedeppp.
	956	* This routine must be called with splbio interrupts blocked.
	957	*/
	958	static int
	959	inodedep_lookup(fs, inum, flags, inodedeppp)
	960	struct fs *fs;
	961	ino_t inum;
	962	int flags;
	963	struct inodedep **inodedeppp;
	964	{
	965	struct inodedep *inodedep;
	966	struct inodedep_hashhead *inodedephd;
	967	int firsttry;
	968
	969	#ifdef DEBUG
	970	if (lk.lkt_held == NOHOLDER)
	971	panic("inodedep_lookup: lock not held");
	972	#endif
	973	firsttry = 1;
	974	inodedephd = INODEDEP_HASH(fs, inum);
	975	top:
	976	LIST_FOREACH(inodedep, inodedephd, id_hash)
	977	if (inum == inodedep->id_ino && fs == inodedep->id_fs)
	978	break;
	979	if (inodedep) {
	980	*inodedeppp = inodedep;
	981	return (1);
	982	}
	983	if ((flags & DEPALLOC) == 0) {
	984	*inodedeppp = NULL;
	985	return (0);
	986	}
	987	/*
	988	* If we are over our limit, try to improve the situation.
	989	*/
	990	if (num_inodedep > max_softdeps && firsttry &&
	991	speedup_syncer() == 0 && (flags & NODELAY) == 0 &&
	992	request_cleanup(FLUSH_INODES, 1)) {
	993	firsttry = 0;
	994	goto top;
	995	}
	996	if (sema_get(&inodedep_in_progress, &lk) == 0) {
	997	ACQUIRE_LOCK(&lk);
	998	goto top;
	999	}
	1000	num_inodedep += 1;
	1001	MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
	1002	M_INODEDEP, M_SOFTDEP_FLAGS);
	1003	inodedep->id_list.wk_type = D_INODEDEP;
	1004	inodedep->id_fs = fs;
	1005	inodedep->id_ino = inum;
	1006	inodedep->id_state = ALLCOMPLETE;
	1007	inodedep->id_nlinkdelta = 0;
	1008	inodedep->id_savedino = NULL;
	1009	inodedep->id_savedsize = -1;
	1010	inodedep->id_buf = NULL;
	1011	LIST_INIT(&inodedep->id_pendinghd);
	1012	LIST_INIT(&inodedep->id_inowait);
	1013	LIST_INIT(&inodedep->id_bufwait);
	1014	TAILQ_INIT(&inodedep->id_inoupdt);
	1015	TAILQ_INIT(&inodedep->id_newinoupdt);
	1016	ACQUIRE_LOCK(&lk);
	1017	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
	1018	sema_release(&inodedep_in_progress);
	1019	*inodedeppp = inodedep;
	1020	return (0);
	1021	}
	1022
	1023	/*
	1024	* Structures and routines associated with newblk caching.
	1025	*/
	1026	LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
	1027	u_long newblk_hash; /* size of hash table - 1 */
	1028	#define NEWBLK_HASH(fs, inum) \
	1029	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
	1030	static struct sema newblk_in_progress;
	1031
	1032	/*
	1033	* Look up a newblk. Return 1 if found, 0 if not found.
	1034	* If not found, allocate if DEPALLOC flag is passed.
	1035	* Found or allocated entry is returned in newblkpp.
	1036	*/
	1037	static int
	1038	newblk_lookup(fs, newblkno, flags, newblkpp)
	1039	struct fs *fs;
	1040	ufs_daddr_t newblkno;
	1041	int flags;
	1042	struct newblk **newblkpp;
	1043	{
	1044	struct newblk *newblk;
	1045	struct newblk_hashhead *newblkhd;
	1046
	1047	newblkhd = NEWBLK_HASH(fs, newblkno);
	1048	top:
	1049	LIST_FOREACH(newblk, newblkhd, nb_hash)
	1050	if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
	1051	break;
	1052	if (newblk) {
	1053	*newblkpp = newblk;
	1054	return (1);
	1055	}
	1056	if ((flags & DEPALLOC) == 0) {
	1057	*newblkpp = NULL;
	1058	return (0);
	1059	}
	1060	if (sema_get(&newblk_in_progress, 0) == 0)
	1061	goto top;
	1062	MALLOC(newblk, struct newblk *, sizeof(struct newblk),
	1063	M_NEWBLK, M_SOFTDEP_FLAGS);
	1064	newblk->nb_state = 0;
	1065	newblk->nb_fs = fs;
	1066	newblk->nb_newblkno = newblkno;
	1067	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
	1068	sema_release(&newblk_in_progress);
	1069	*newblkpp = newblk;
	1070	return (0);
	1071	}
	1072
	1073	/*
	1074	* Executed during filesystem system initialization before
	1075	* mounting any filesystems.
	1076	*/
	1077	void
	1078	softdep_initialize()
	1079	{
	1080	callout_init(&handle);
	1081	bioops = softdep_bioops; /* XXX hack */
	1082
	1083	LIST_INIT(&mkdirlisthd);
	1084	LIST_INIT(&softdep_workitem_pending);
	1085	max_softdeps = min(desiredvnodes * 8,
	1086	M_INODEDEP->ks_limit / (2 * sizeof(struct inodedep)));
	1087	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
	1088	&pagedep_hash);
	1089	sema_init(&pagedep_in_progress, "pagedep", 0, 0);
	1090	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
	1091	sema_init(&inodedep_in_progress, "inodedep", 0, 0);
	1092	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
	1093	sema_init(&newblk_in_progress, "newblk", 0, 0);
	1094	}
	1095
	1096	/*
	1097	* Called at mount time to notify the dependency code that a
	1098	* filesystem wishes to use it.
	1099	*/
	1100	int
	1101	softdep_mount(devvp, mp, fs)
	1102	struct vnode *devvp;
	1103	struct mount *mp;
	1104	struct fs *fs;
	1105	{
	1106	struct csum cstotal;
	1107	struct cg *cgp;
	1108	struct buf *bp;
	1109	int error, cyl;
	1110
	1111	mp->mnt_flag &= ~MNT_ASYNC;
	1112	mp->mnt_flag \|= MNT_SOFTDEP;
	1113	/*
	1114	* When doing soft updates, the counters in the
	1115	* superblock may have gotten out of sync, so we have
	1116	* to scan the cylinder groups and recalculate them.
	1117	*/
	1118	if (fs->fs_clean != 0)
	1119	return (0);
	1120	bzero(&cstotal, sizeof cstotal);
	1121	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
	1122	if ((error = bread(devvp, fsbtodoff(fs, cgtod(fs, cyl)),
	1123	fs->fs_cgsize, &bp)) != 0) {
	1124	brelse(bp);
	1125	return (error);
	1126	}
	1127	cgp = (struct cg *)bp->b_data;
	1128	cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
	1129	cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
	1130	cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
	1131	cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
	1132	fs->fs_cs(fs, cyl) = cgp->cg_cs;
	1133	brelse(bp);
	1134	}
	1135	#ifdef DEBUG
	1136	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
	1137	printf("ffs_mountfs: superblock updated for soft updates\n");
	1138	#endif
	1139	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
	1140	return (0);
	1141	}
	1142
	1143	/*
	1144	* Protecting the freemaps (or bitmaps).
	1145	*
	1146	* To eliminate the need to execute fsck before mounting a filesystem
	1147	* after a power failure, one must (conservatively) guarantee that the
	1148	* on-disk copy of the bitmaps never indicates that a live inode or block is
	1149	* free. So, when a block or inode is allocated, the bitmap should be
	1150	* updated (on disk) before any new pointers. When a block or inode is
	1151	* freed, the bitmap should not be updated until all pointers have been
	1152	* reset. The latter dependency is handled by the delayed de-allocation
	1153	* approach described below for block and inode de-allocation. The former
	1154	* dependency is handled by calling the following procedure when a block or
	1155	* inode is allocated. When an inode is allocated an "inodedep" is created
	1156	* with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
	1157	* Each "inodedep" is also inserted into the hash indexing structure so
	1158	* that any additional link additions can be made dependent on the inode
	1159	* allocation.
	1160	*
	1161	* The ufs filesystem maintains a number of free block counts (e.g., per
	1162	* cylinder group, per cylinder and per <cylinder, rotational position> pair)
	1163	* in addition to the bitmaps. These counts are used to improve efficiency
	1164	* during allocation and therefore must be consistent with the bitmaps.
	1165	* There is no convenient way to guarantee post-crash consistency of these
	1166	* counts with simple update ordering, for two main reasons: (1) The counts
	1167	* and bitmaps for a single cylinder group block are not in the same disk
	1168	* sector. If a disk write is interrupted (e.g., by power failure), one may
	1169	* be written and the other not. (2) Some of the counts are located in the
	1170	* superblock rather than the cylinder group block. So, we focus our soft
	1171	* updates implementation on protecting the bitmaps. When mounting a
	1172	* filesystem, we recompute the auxiliary counts from the bitmaps.
	1173	*/
	1174
	1175	/*
	1176	* Called just after updating the cylinder group block to allocate an inode.
	1177	*/
	1178	void
	1179	softdep_setup_inomapdep(bp, ip, newinum)
	1180	struct buf bp; / buffer for cylgroup block with inode map */
	1181	struct inode ip; / inode related to allocation */
	1182	ino_t newinum; /* new inode number being allocated */
	1183	{
	1184	struct inodedep *inodedep;
	1185	struct bmsafemap *bmsafemap;
	1186
	1187	/*
	1188	* Create a dependency for the newly allocated inode.
	1189	* Panic if it already exists as something is seriously wrong.
	1190	* Otherwise add it to the dependency list for the buffer holding
	1191	* the cylinder group map from which it was allocated.
	1192	*/
	1193	ACQUIRE_LOCK(&lk);
	1194	if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC\|NODELAY, &inodedep))) {
	1195	FREE_LOCK(&lk);
	1196	panic("softdep_setup_inomapdep: found inode");
	1197	}
	1198	inodedep->id_buf = bp;
	1199	inodedep->id_state &= ~DEPCOMPLETE;
	1200	bmsafemap = bmsafemap_lookup(bp);
	1201	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
	1202	FREE_LOCK(&lk);
	1203	}
	1204
	1205	/*
	1206	* Called just after updating the cylinder group block to
	1207	* allocate block or fragment.
	1208	*/
	1209	void
	1210	softdep_setup_blkmapdep(bp, fs, newblkno)
	1211	struct buf bp; / buffer for cylgroup block with block map */
	1212	struct fs fs; / filesystem doing allocation */
	1213	ufs_daddr_t newblkno; /* number of newly allocated block */
	1214	{
	1215	struct newblk *newblk;
	1216	struct bmsafemap *bmsafemap;
	1217
	1218	/*
	1219	* Create a dependency for the newly allocated block.
	1220	* Add it to the dependency list for the buffer holding
	1221	* the cylinder group map from which it was allocated.
	1222	*/
	1223	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
	1224	panic("softdep_setup_blkmapdep: found block");
	1225	ACQUIRE_LOCK(&lk);
	1226	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
	1227	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
	1228	FREE_LOCK(&lk);
	1229	}
	1230
	1231	/*
	1232	* Find the bmsafemap associated with a cylinder group buffer.
	1233	* If none exists, create one. The buffer must be locked when
	1234	* this routine is called and this routine must be called with
	1235	* splbio interrupts blocked.
	1236	*/
	1237	static struct bmsafemap *
	1238	bmsafemap_lookup(bp)
	1239	struct buf *bp;
	1240	{
	1241	struct bmsafemap *bmsafemap;
	1242	struct worklist *wk;
	1243
	1244	#ifdef DEBUG
	1245	if (lk.lkt_held == NOHOLDER)
	1246	panic("bmsafemap_lookup: lock not held");
	1247	#endif
	1248	LIST_FOREACH(wk, &bp->b_dep, wk_list)
	1249	if (wk->wk_type == D_BMSAFEMAP)
	1250	return (WK_BMSAFEMAP(wk));
	1251	FREE_LOCK(&lk);
	1252	MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
	1253	M_BMSAFEMAP, M_SOFTDEP_FLAGS);
	1254	bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
	1255	bmsafemap->sm_list.wk_state = 0;
	1256	bmsafemap->sm_buf = bp;
	1257	LIST_INIT(&bmsafemap->sm_allocdirecthd);
	1258	LIST_INIT(&bmsafemap->sm_allocindirhd);
	1259	LIST_INIT(&bmsafemap->sm_inodedephd);
	1260	LIST_INIT(&bmsafemap->sm_newblkhd);
	1261	ACQUIRE_LOCK(&lk);
	1262	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
	1263	return (bmsafemap);
	1264	}
	1265
	1266	/*
	1267	* Direct block allocation dependencies.
	1268	*
	1269	* When a new block is allocated, the corresponding disk locations must be
	1270	* initialized (with zeros or new data) before the on-disk inode points to
	1271	* them. Also, the freemap from which the block was allocated must be
	1272	* updated (on disk) before the inode's pointer. These two dependencies are
	1273	* independent of each other and are needed for all file blocks and indirect
	1274	* blocks that are pointed to directly by the inode. Just before the
	1275	* "in-core" version of the inode is updated with a newly allocated block
	1276	* number, a procedure (below) is called to setup allocation dependency
	1277	* structures. These structures are removed when the corresponding
	1278	* dependencies are satisfied or when the block allocation becomes obsolete
	1279	* (i.e., the file is deleted, the block is de-allocated, or the block is a
	1280	* fragment that gets upgraded). All of these cases are handled in
	1281	* procedures described later.
	1282	*
	1283	* When a file extension causes a fragment to be upgraded, either to a larger
	1284	* fragment or to a full block, the on-disk location may change (if the
	1285	* previous fragment could not simply be extended). In this case, the old
	1286	* fragment must be de-allocated, but not until after the inode's pointer has
	1287	* been updated. In most cases, this is handled by later procedures, which
	1288	* will construct a "freefrag" structure to be added to the workitem queue
	1289	* when the inode update is complete (or obsolete). The main exception to
	1290	* this is when an allocation occurs while a pending allocation dependency
	1291	* (for the same block pointer) remains. This case is handled in the main
	1292	* allocation dependency setup procedure by immediately freeing the
	1293	* unreferenced fragments.
	1294	*/
	1295	void
	1296	softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	1297	struct inode ip; / inode to which block is being added */
	1298	ufs_lbn_t lbn; /* block pointer within inode */
	1299	ufs_daddr_t newblkno; /* disk block number being added */
	1300	ufs_daddr_t oldblkno; /* previous block number, 0 unless frag */
	1301	long newsize; /* size of new block */
	1302	long oldsize; /* size of new block */
	1303	struct buf bp; / bp for allocated block */
	1304	{
	1305	struct allocdirect adp, oldadp;
	1306	struct allocdirectlst *adphead;
	1307	struct bmsafemap *bmsafemap;
	1308	struct inodedep *inodedep;
	1309	struct pagedep *pagedep;
	1310	struct newblk *newblk;
	1311
	1312	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
	1313	M_ALLOCDIRECT, M_SOFTDEP_FLAGS);
	1314	bzero(adp, sizeof(struct allocdirect));
	1315	adp->ad_list.wk_type = D_ALLOCDIRECT;
	1316	adp->ad_lbn = lbn;
	1317	adp->ad_newblkno = newblkno;
	1318	adp->ad_oldblkno = oldblkno;
	1319	adp->ad_newsize = newsize;
	1320	adp->ad_oldsize = oldsize;
	1321	adp->ad_state = ATTACHED;
	1322	if (newblkno == oldblkno)
	1323	adp->ad_freefrag = NULL;
	1324	else
	1325	adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
	1326
	1327	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
	1328	panic("softdep_setup_allocdirect: lost block");
	1329
	1330	ACQUIRE_LOCK(&lk);
	1331	inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC \| NODELAY, &inodedep);
	1332	adp->ad_inodedep = inodedep;
	1333
	1334	if (newblk->nb_state == DEPCOMPLETE) {
	1335	adp->ad_state \|= DEPCOMPLETE;
	1336	adp->ad_buf = NULL;
	1337	} else {
	1338	bmsafemap = newblk->nb_bmsafemap;
	1339	adp->ad_buf = bmsafemap->sm_buf;
	1340	LIST_REMOVE(newblk, nb_deps);
	1341	LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
	1342	}
	1343	LIST_REMOVE(newblk, nb_hash);
	1344	FREE(newblk, M_NEWBLK);
	1345
	1346	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
	1347	if (lbn >= NDADDR) {
	1348	/* allocating an indirect block */
	1349	if (oldblkno != 0) {
	1350	FREE_LOCK(&lk);
	1351	panic("softdep_setup_allocdirect: non-zero indir");
	1352	}
	1353	} else {
	1354	/*
	1355	* Allocating a direct block.
	1356	*
	1357	* If we are allocating a directory block, then we must
	1358	* allocate an associated pagedep to track additions and
	1359	* deletions.
	1360	*/
	1361	if ((ip->i_mode & IFMT) == IFDIR &&
	1362	pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
	1363	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	1364	}
	1365	/*
	1366	* The list of allocdirects must be kept in sorted and ascending
	1367	* order so that the rollback routines can quickly determine the
	1368	* first uncommitted block (the size of the file stored on disk
	1369	* ends at the end of the lowest committed fragment, or if there
	1370	* are no fragments, at the end of the highest committed block).
	1371	* Since files generally grow, the typical case is that the new
	1372	* block is to be added at the end of the list. We speed this
	1373	* special case by checking against the last allocdirect in the
	1374	* list before laboriously traversing the list looking for the
	1375	* insertion point.
	1376	*/
	1377	adphead = &inodedep->id_newinoupdt;
	1378	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	1379	if (oldadp == NULL \|\| oldadp->ad_lbn <= lbn) {
	1380	/* insert at end of list */
	1381	TAILQ_INSERT_TAIL(adphead, adp, ad_next);
	1382	if (oldadp != NULL && oldadp->ad_lbn == lbn)
	1383	allocdirect_merge(adphead, adp, oldadp);
	1384	FREE_LOCK(&lk);
	1385	return;
	1386	}
	1387	TAILQ_FOREACH(oldadp, adphead, ad_next) {
	1388	if (oldadp->ad_lbn >= lbn)
	1389	break;
	1390	}
	1391	if (oldadp == NULL) {
	1392	FREE_LOCK(&lk);
	1393	panic("softdep_setup_allocdirect: lost entry");
	1394	}
	1395	/* insert in middle of list */
	1396	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
	1397	if (oldadp->ad_lbn == lbn)
	1398	allocdirect_merge(adphead, adp, oldadp);
	1399	FREE_LOCK(&lk);
	1400	}
	1401
	1402	/*
	1403	* Replace an old allocdirect dependency with a newer one.
	1404	* This routine must be called with splbio interrupts blocked.
	1405	*/
	1406	static void
	1407	allocdirect_merge(adphead, newadp, oldadp)
	1408	struct allocdirectlst adphead; / head of list holding allocdirects */
	1409	struct allocdirect newadp; / allocdirect being added */
	1410	struct allocdirect oldadp; / existing allocdirect being checked */
	1411	{
	1412	struct freefrag *freefrag;
	1413
	1414	#ifdef DEBUG
	1415	if (lk.lkt_held == NOHOLDER)
	1416	panic("allocdirect_merge: lock not held");
	1417	#endif
	1418	if (newadp->ad_oldblkno != oldadp->ad_newblkno \|\|
	1419	newadp->ad_oldsize != oldadp->ad_newsize \|\|
	1420	newadp->ad_lbn >= NDADDR) {
	1421	FREE_LOCK(&lk);
	1422	panic("allocdirect_check: old %d != new %d \|\| lbn %ld >= %d",
	1423	newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
	1424	NDADDR);
	1425	}
	1426	newadp->ad_oldblkno = oldadp->ad_oldblkno;
	1427	newadp->ad_oldsize = oldadp->ad_oldsize;
	1428	/*
	1429	* If the old dependency had a fragment to free or had never
	1430	* previously had a block allocated, then the new dependency
	1431	* can immediately post its freefrag and adopt the old freefrag.
	1432	* This action is done by swapping the freefrag dependencies.
	1433	* The new dependency gains the old one's freefrag, and the
	1434	* old one gets the new one and then immediately puts it on
	1435	* the worklist when it is freed by free_allocdirect. It is
	1436	* not possible to do this swap when the old dependency had a
	1437	* non-zero size but no previous fragment to free. This condition
	1438	* arises when the new block is an extension of the old block.
	1439	* Here, the first part of the fragment allocated to the new
	1440	* dependency is part of the block currently claimed on disk by
	1441	* the old dependency, so cannot legitimately be freed until the
	1442	* conditions for the new dependency are fulfilled.
	1443	*/
	1444	if (oldadp->ad_freefrag != NULL \|\| oldadp->ad_oldblkno == 0) {
	1445	freefrag = newadp->ad_freefrag;
	1446	newadp->ad_freefrag = oldadp->ad_freefrag;
	1447	oldadp->ad_freefrag = freefrag;
	1448	}
	1449	free_allocdirect(adphead, oldadp, 0);
	1450	}
	1451
	1452	/*
	1453	* Allocate a new freefrag structure if needed.
	1454	*/
	1455	static struct freefrag *
	1456	newfreefrag(ip, blkno, size)
	1457	struct inode *ip;
	1458	ufs_daddr_t blkno;
	1459	long size;
	1460	{
	1461	struct freefrag *freefrag;
	1462	struct fs *fs;
	1463
	1464	if (blkno == 0)
	1465	return (NULL);
	1466	fs = ip->i_fs;
	1467	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
	1468	panic("newfreefrag: frag size");
	1469	MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
	1470	M_FREEFRAG, M_SOFTDEP_FLAGS);
	1471	freefrag->ff_list.wk_type = D_FREEFRAG;
	1472	freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */
	1473	freefrag->ff_inum = ip->i_number;
	1474	freefrag->ff_fs = fs;
	1475	freefrag->ff_devvp = ip->i_devvp;
	1476	freefrag->ff_blkno = blkno;
	1477	freefrag->ff_fragsize = size;
	1478	return (freefrag);
	1479	}
	1480
	1481	/*
	1482	* This workitem de-allocates fragments that were replaced during
	1483	* file block allocation.
	1484	*/
	1485	static void
	1486	handle_workitem_freefrag(freefrag)
	1487	struct freefrag *freefrag;
	1488	{
	1489	struct inode tip;
	1490
	1491	tip.i_fs = freefrag->ff_fs;
	1492	tip.i_devvp = freefrag->ff_devvp;
	1493	tip.i_dev = freefrag->ff_devvp->v_rdev;
	1494	tip.i_number = freefrag->ff_inum;
	1495	tip.i_uid = freefrag->ff_state & ~ONWORKLIST; /* XXX - set above */
	1496	ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
	1497	FREE(freefrag, M_FREEFRAG);
	1498	}
	1499
	1500	/*
	1501	* Indirect block allocation dependencies.
	1502	*
	1503	* The same dependencies that exist for a direct block also exist when
	1504	* a new block is allocated and pointed to by an entry in a block of
	1505	* indirect pointers. The undo/redo states described above are also
	1506	* used here. Because an indirect block contains many pointers that
	1507	* may have dependencies, a second copy of the entire in-memory indirect
	1508	* block is kept. The buffer cache copy is always completely up-to-date.
	1509	* The second copy, which is used only as a source for disk writes,
	1510	* contains only the safe pointers (i.e., those that have no remaining
	1511	* update dependencies). The second copy is freed when all pointers
	1512	* are safe. The cache is not allowed to replace indirect blocks with
	1513	* pending update dependencies. If a buffer containing an indirect
	1514	* block with dependencies is written, these routines will mark it
	1515	* dirty again. It can only be successfully written once all the
	1516	* dependencies are removed. The ffs_fsync routine in conjunction with
	1517	* softdep_sync_metadata work together to get all the dependencies
	1518	* removed so that a file can be successfully written to disk. Three
	1519	* procedures are used when setting up indirect block pointer
	1520	* dependencies. The division is necessary because of the organization
	1521	* of the "balloc" routine and because of the distinction between file
	1522	* pages and file metadata blocks.
	1523	*/
	1524
	1525	/*
	1526	* Allocate a new allocindir structure.
	1527	*/
	1528	static struct allocindir *
	1529	newallocindir(ip, ptrno, newblkno, oldblkno)
	1530	struct inode ip; / inode for file being extended */
	1531	int ptrno; /* offset of pointer in indirect block */
	1532	ufs_daddr_t newblkno; /* disk block number being added */
	1533	ufs_daddr_t oldblkno; /* previous block number, 0 if none */
	1534	{
	1535	struct allocindir *aip;
	1536
	1537	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
	1538	M_ALLOCINDIR, M_SOFTDEP_FLAGS);
	1539	bzero(aip, sizeof(struct allocindir));
	1540	aip->ai_list.wk_type = D_ALLOCINDIR;
	1541	aip->ai_state = ATTACHED;
	1542	aip->ai_offset = ptrno;
	1543	aip->ai_newblkno = newblkno;
	1544	aip->ai_oldblkno = oldblkno;
	1545	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
	1546	return (aip);
	1547	}
	1548
	1549	/*
	1550	* Called just before setting an indirect block pointer
	1551	* to a newly allocated file page.
	1552	*/
	1553	void
	1554	softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
	1555	struct inode ip; / inode for file being extended */
	1556	ufs_lbn_t lbn; /* allocated block number within file */
	1557	struct buf bp; / buffer with indirect blk referencing page */
	1558	int ptrno; /* offset of pointer in indirect block */
	1559	ufs_daddr_t newblkno; /* disk block number being added */
	1560	ufs_daddr_t oldblkno; /* previous block number, 0 if none */
	1561	struct buf nbp; / buffer holding allocated page */
	1562	{
	1563	struct allocindir *aip;
	1564	struct pagedep *pagedep;
	1565
	1566	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
	1567	ACQUIRE_LOCK(&lk);
	1568	/*
	1569	* If we are allocating a directory page, then we must
	1570	* allocate an associated pagedep to track additions and
	1571	* deletions.
	1572	*/
	1573	if ((ip->i_mode & IFMT) == IFDIR &&
	1574	pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
	1575	WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
	1576	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	1577	FREE_LOCK(&lk);
	1578	setup_allocindir_phase2(bp, ip, aip);
	1579	}
	1580
	1581	/*
	1582	* Called just before setting an indirect block pointer to a
	1583	* newly allocated indirect block.
	1584	*/
	1585	void
	1586	softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
	1587	struct buf nbp; / newly allocated indirect block */
	1588	struct inode ip; / inode for file being extended */
	1589	struct buf bp; / indirect block referencing allocated block */
	1590	int ptrno; /* offset of pointer in indirect block */
	1591	ufs_daddr_t newblkno; /* disk block number being added */
	1592	{
	1593	struct allocindir *aip;
	1594
	1595	aip = newallocindir(ip, ptrno, newblkno, 0);
	1596	ACQUIRE_LOCK(&lk);
	1597	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	1598	FREE_LOCK(&lk);
	1599	setup_allocindir_phase2(bp, ip, aip);
	1600	}
	1601
	1602	/*
	1603	* Called to finish the allocation of the "aip" allocated
	1604	* by one of the two routines above.
	1605	*/
	1606	static void
	1607	setup_allocindir_phase2(bp, ip, aip)
	1608	struct buf bp; / in-memory copy of the indirect block */
	1609	struct inode ip; / inode for file being extended */
	1610	struct allocindir aip; / allocindir allocated by the above routines */
	1611	{
	1612	struct worklist *wk;
	1613	struct indirdep indirdep, newindirdep;
	1614	struct bmsafemap *bmsafemap;
	1615	struct allocindir *oldaip;
	1616	struct freefrag *freefrag;
	1617	struct newblk *newblk;
	1618
	1619	if (bp->b_loffset >= 0)
	1620	panic("setup_allocindir_phase2: not indir blk");
	1621	for (indirdep = NULL, newindirdep = NULL; ; ) {
	1622	ACQUIRE_LOCK(&lk);
	1623	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
	1624	if (wk->wk_type != D_INDIRDEP)
	1625	continue;
	1626	indirdep = WK_INDIRDEP(wk);
	1627	break;
	1628	}
	1629	if (indirdep == NULL && newindirdep) {
	1630	indirdep = newindirdep;
	1631	WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
	1632	newindirdep = NULL;
	1633	}
	1634	FREE_LOCK(&lk);
	1635	if (indirdep) {
	1636	if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
	1637	&newblk) == 0)
	1638	panic("setup_allocindir: lost block");
	1639	ACQUIRE_LOCK(&lk);
	1640	if (newblk->nb_state == DEPCOMPLETE) {
	1641	aip->ai_state \|= DEPCOMPLETE;
	1642	aip->ai_buf = NULL;
	1643	} else {
	1644	bmsafemap = newblk->nb_bmsafemap;
	1645	aip->ai_buf = bmsafemap->sm_buf;
	1646	LIST_REMOVE(newblk, nb_deps);
	1647	LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
	1648	aip, ai_deps);
	1649	}
	1650	LIST_REMOVE(newblk, nb_hash);
	1651	FREE(newblk, M_NEWBLK);
	1652	aip->ai_indirdep = indirdep;
	1653	/*
	1654	* Check to see if there is an existing dependency
	1655	* for this block. If there is, merge the old
	1656	* dependency into the new one.
	1657	*/
	1658	if (aip->ai_oldblkno == 0)
	1659	oldaip = NULL;
	1660	else
	1661
	1662	LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
	1663	if (oldaip->ai_offset == aip->ai_offset)
	1664	break;
	1665	if (oldaip != NULL) {
	1666	if (oldaip->ai_newblkno != aip->ai_oldblkno) {
	1667	FREE_LOCK(&lk);
	1668	panic("setup_allocindir_phase2: blkno");
	1669	}
	1670	aip->ai_oldblkno = oldaip->ai_oldblkno;
	1671	freefrag = oldaip->ai_freefrag;
	1672	oldaip->ai_freefrag = aip->ai_freefrag;
	1673	aip->ai_freefrag = freefrag;
	1674	free_allocindir(oldaip, NULL);
	1675	}
	1676	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
	1677	((ufs_daddr_t *)indirdep->ir_savebp->b_data)
	1678	[aip->ai_offset] = aip->ai_oldblkno;
	1679	FREE_LOCK(&lk);
	1680	}
	1681	if (newindirdep) {
	1682	/*
	1683	* Avoid any possibility of data corruption by
	1684	* ensuring that our old version is thrown away.
	1685	*/
	1686	newindirdep->ir_savebp->b_flags \|= B_INVAL \| B_NOCACHE;
	1687	brelse(newindirdep->ir_savebp);
	1688	WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
	1689	}
	1690	if (indirdep)
	1691	break;
	1692	MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
	1693	M_INDIRDEP, M_SOFTDEP_FLAGS);
	1694	newindirdep->ir_list.wk_type = D_INDIRDEP;
	1695	newindirdep->ir_state = ATTACHED;
	1696	LIST_INIT(&newindirdep->ir_deplisthd);
	1697	LIST_INIT(&newindirdep->ir_donehd);
	1698	if (bp->b_bio2.bio_offset == NOOFFSET) {
	1699	VOP_BMAP(bp->b_vp, bp->b_bio1.bio_offset,
	1700	NULL, &bp->b_bio2.bio_offset,
	1701	NULL, NULL);
	1702	}
	1703	KKASSERT(bp->b_bio2.bio_offset != NOOFFSET);
	1704	newindirdep->ir_savebp = getblk(ip->i_devvp,
	1705	bp->b_bio2.bio_offset,
	1706	bp->b_bcount, 0, 0);
	1707	BUF_KERNPROC(newindirdep->ir_savebp);
	1708	bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
	1709	}
	1710	}
	1711
	1712	/*
	1713	* Block de-allocation dependencies.
	1714	*
	1715	* When blocks are de-allocated, the on-disk pointers must be nullified before
	1716	* the blocks are made available for use by other files. (The true
	1717	* requirement is that old pointers must be nullified before new on-disk
	1718	* pointers are set. We chose this slightly more stringent requirement to
	1719	* reduce complexity.) Our implementation handles this dependency by updating
	1720	* the inode (or indirect block) appropriately but delaying the actual block
	1721	* de-allocation (i.e., freemap and free space count manipulation) until
	1722	* after the updated versions reach stable storage. After the disk is
	1723	* updated, the blocks can be safely de-allocated whenever it is convenient.
	1724	* This implementation handles only the common case of reducing a file's
	1725	* length to zero. Other cases are handled by the conventional synchronous
	1726	* write approach.
	1727	*
	1728	* The ffs implementation with which we worked double-checks
	1729	* the state of the block pointers and file size as it reduces
	1730	* a file's length. Some of this code is replicated here in our
	1731	* soft updates implementation. The freeblks->fb_chkcnt field is
	1732	* used to transfer a part of this information to the procedure
	1733	* that eventually de-allocates the blocks.
	1734	*
	1735	* This routine should be called from the routine that shortens
	1736	* a file's length, before the inode's size or block pointers
	1737	* are modified. It will save the block pointer information for
	1738	* later release and zero the inode so that the calling routine
	1739	* can release it.
	1740	*/
	/*
	 * Context filled in by softdep_setup_freeblocks(): the filesystem and
	 * the inode being truncated.
	 */
	struct softdep_setup_freeblocks_info {
		struct fs *fs;		/* filesystem of the inode */
		struct inode *ip;	/* inode whose blocks are being freed */
	};

	/* Per-buffer helper for softdep_setup_freeblocks(). */
	static int softdep_setup_freeblocks_bp(struct buf *bp, void *data);
	1747
	1748	void
	1749	softdep_setup_freeblocks(ip, length)
	1750	struct inode ip; / The inode whose length is to be reduced */
	1751	off_t length; /* The new length for the file */
	1752	{
	1753	struct softdep_setup_freeblocks_info info;
	1754	struct freeblks *freeblks;
	1755	struct inodedep *inodedep;
	1756	struct allocdirect *adp;
	1757	struct vnode *vp;
	1758	struct buf *bp;
	1759	struct fs *fs;
	1760	int i, error, delay;
	1761	int count;
	1762
	1763	fs = ip->i_fs;
	1764	if (length != 0)
	1765	panic("softde_setup_freeblocks: non-zero length");
	1766	MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
	1767	M_FREEBLKS, M_SOFTDEP_FLAGS);
	1768	bzero(freeblks, sizeof(struct freeblks));
	1769	freeblks->fb_list.wk_type = D_FREEBLKS;
	1770	freeblks->fb_state = ATTACHED;
	1771	freeblks->fb_uid = ip->i_uid;
	1772	freeblks->fb_previousinum = ip->i_number;
	1773	freeblks->fb_devvp = ip->i_devvp;
	1774	freeblks->fb_fs = fs;
	1775	freeblks->fb_oldsize = ip->i_size;
	1776	freeblks->fb_newsize = length;
	1777	freeblks->fb_chkcnt = ip->i_blocks;
	1778	for (i = 0; i < NDADDR; i++) {
	1779	freeblks->fb_dblks[i] = ip->i_db[i];
	1780	ip->i_db[i] = 0;
	1781	}
	1782	for (i = 0; i < NIADDR; i++) {
	1783	freeblks->fb_iblks[i] = ip->i_ib[i];
	1784	ip->i_ib[i] = 0;
	1785	}
	1786	ip->i_blocks = 0;
	1787	ip->i_size = 0;
	1788	/*
	1789	* Push the zero'ed inode to its disk buffer so that we are free
	1790	* to delete its dependencies below. Once the dependencies are gone
	1791	* the buffer can be safely released.
	1792	*/
	1793	if ((error = bread(ip->i_devvp,
	1794	fsbtodoff(fs, ino_to_fsba(fs, ip->i_number)),
	1795	(int)fs->fs_bsize, &bp)) != 0)
	1796	softdep_error("softdep_setup_freeblocks", error);
	1797	((struct ufs1_dinode )bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
	1798	ip->i_din;
	1799	/*
	1800	* Find and eliminate any inode dependencies.
	1801	*/
	1802	ACQUIRE_LOCK(&lk);
	1803	(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
	1804	if ((inodedep->id_state & IOSTARTED) != 0) {
	1805	FREE_LOCK(&lk);
	1806	panic("softdep_setup_freeblocks: inode busy");
	1807	}
	1808	/*
	1809	* Add the freeblks structure to the list of operations that
	1810	* must await the zero'ed inode being written to disk. If we
	1811	* still have a bitmap dependency (delay == 0), then the inode
	1812	* has never been written to disk, so we can process the
	1813	* freeblks below once we have deleted the dependencies.
	1814	*/
	1815	delay = (inodedep->id_state & DEPCOMPLETE);
	1816	if (delay)
	1817	WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
	1818	/*
	1819	* Because the file length has been truncated to zero, any
	1820	* pending block allocation dependency structures associated
	1821	* with this inode are obsolete and can simply be de-allocated.
	1822	* We must first merge the two dependency lists to get rid of
	1823	* any duplicate freefrag structures, then purge the merged list.
	1824	*/
	1825	merge_inode_lists(inodedep);
	1826	while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
	1827	free_allocdirect(&inodedep->id_inoupdt, adp, 1);
	1828	FREE_LOCK(&lk);
	1829	bdwrite(bp);
	1830	/*
	1831	* We must wait for any I/O in progress to finish so that
	1832	* all potential buffers on the dirty list will be visible.
	1833	* Once they are all there, walk the list and get rid of
	1834	* any dependencies.
	1835	*/
	1836	vp = ITOV(ip);
	1837	ACQUIRE_LOCK(&lk);
	1838	drain_output(vp, 1);
	1839
	1840	info.fs = fs;
	1841	info.ip = ip;
	1842	crit_enter();
	1843	do {
	1844	count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
	1845	softdep_setup_freeblocks_bp, &info);
	1846	} while (count != 0);
	1847	crit_exit();
	1848	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
	1849	(void)free_inodedep(inodedep);
	1850
	1851	if (delay) {
	1852	freeblks->fb_state \|= DEPCOMPLETE;
	1853	/*
	1854	* If the inode with zeroed block pointers is now on disk
	1855	* we can start freeing blocks. Add freeblks to the worklist
	1856	* instead of calling handle_workitem_freeblocks directly as
	1857	* it is more likely that additional IO is needed to complete
	1858	* the request here than in the !delay case.
	1859	*/
	1860	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
	1861	add_to_worklist(&freeblks->fb_list);
	1862	}
	1863
	1864	FREE_LOCK(&lk);
	1865	/*
	1866	* If the inode has never been written to disk (delay == 0),
	1867	* then we can process the freeblks now that we have deleted
	1868	* the dependencies.
	1869	*/
	1870	if (!delay)
	1871	handle_workitem_freeblocks(freeblks);
	1872	}
	1873
	1874	static int
	1875	softdep_setup_freeblocks_bp(struct buf bp, void data)
	1876	{
	1877	struct softdep_setup_freeblocks_info *info = data;
	1878	struct inodedep *inodedep;
	1879
	1880	if (getdirtybuf(&bp, MNT_WAIT) == 0) {
	1881	printf("softdep_setup_freeblocks_bp(1): caught bp %p going away\n", bp);
	1882	return(-1);
	1883	}
	1884	if (bp->b_vp != ITOV(info->ip) \|\| (bp->b_flags & B_DELWRI) == 0) {
	1885	printf("softdep_setup_freeblocks_bp(2): caught bp %p going away\n", bp);
	1886	BUF_UNLOCK(bp);
	1887	return(-1);
	1888	}
	1889	(void) inodedep_lookup(info->fs, info->ip->i_number, 0, &inodedep);
	1890	deallocate_dependencies(bp, inodedep);
	1891	bp->b_flags \|= B_INVAL \| B_NOCACHE;
	1892	FREE_LOCK(&lk);
	1893	brelse(bp);
	1894	ACQUIRE_LOCK(&lk);
	1895	return(1);
	1896	}
	1897
	1898	/*
	1899	* Reclaim any dependency structures from a buffer that is about to
	1900	* be reallocated to a new vnode. The buffer must be locked, thus,
	1901	* no I/O completion operations can occur while we are manipulating
	1902	* its associated dependencies. The mutex is held so that other I/O's
	1903	* associated with related dependencies do not occur.
	1904	*/
	1905	static void
	1906	deallocate_dependencies(bp, inodedep)
	1907	struct buf *bp;
	1908	struct inodedep *inodedep;
	1909	{
	1910	struct worklist *wk;
	1911	struct indirdep *indirdep;
	1912	struct allocindir *aip;
	1913	struct pagedep *pagedep;
	1914	struct dirrem *dirrem;
	1915	struct diradd *dap;
	1916	int i;
	1917
	1918	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
	1919	switch (wk->wk_type) {
	1920
	1921	case D_INDIRDEP:
	1922	indirdep = WK_INDIRDEP(wk);
	1923	/*
	1924	* None of the indirect pointers will ever be visible,
	1925	* so they can simply be tossed. GOINGAWAY ensures
	1926	* that allocated pointers will be saved in the buffer
	1927	* cache until they are freed. Note that they will
	1928	* only be able to be found by their physical address
	1929	* since the inode mapping the logical address will
	1930	* be gone. The save buffer used for the safe copy
	1931	* was allocated in setup_allocindir_phase2 using
	1932	* the physical address so it could be used for this
	1933	* purpose. Hence we swap the safe copy with the real
	1934	* copy, allowing the safe copy to be freed and holding
	1935	* on to the real copy for later use in indir_trunc.
	1936	*
	1937	* NOTE: ir_savebp is relative to the block device
	1938	* so b_bio1 contains the device block number.
	1939	*/
	1940	if (indirdep->ir_state & GOINGAWAY) {
	1941	FREE_LOCK(&lk);
	1942	panic("deallocate_dependencies: already gone");
	1943	}
	1944	indirdep->ir_state \|= GOINGAWAY;
	1945	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
	1946	free_allocindir(aip, inodedep);
	1947	if (bp->b_bio1.bio_offset >= 0 \|\|
	1948	bp->b_bio2.bio_offset != indirdep->ir_savebp->b_bio1.bio_offset) {
	1949	FREE_LOCK(&lk);
	1950	panic("deallocate_dependencies: not indir");
	1951	}
	1952	bcopy(bp->b_data, indirdep->ir_savebp->b_data,
	1953	bp->b_bcount);
	1954	WORKLIST_REMOVE(wk);
	1955	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
	1956	continue;
	1957
	1958	case D_PAGEDEP:
	1959	pagedep = WK_PAGEDEP(wk);
	1960	/*
	1961	* None of the directory additions will ever be
	1962	* visible, so they can simply be tossed.
	1963	*/
	1964	for (i = 0; i < DAHASHSZ; i++)
	1965	while ((dap =
	1966	LIST_FIRST(&pagedep->pd_diraddhd[i])))
	1967	free_diradd(dap);
	1968	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
	1969	free_diradd(dap);
	1970	/*
	1971	* Copy any directory remove dependencies to the list
	1972	* to be processed after the zero'ed inode is written.
	1973	* If the inode has already been written, then they
	1974	* can be dumped directly onto the work list.
	1975	*/
	1976	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
	1977	LIST_REMOVE(dirrem, dm_next);
	1978	dirrem->dm_dirinum = pagedep->pd_ino;
	1979	if (inodedep == NULL \|\|
	1980	(inodedep->id_state & ALLCOMPLETE) ==
	1981	ALLCOMPLETE)
	1982	add_to_worklist(&dirrem->dm_list);
	1983	else
	1984	WORKLIST_INSERT(&inodedep->id_bufwait,
	1985	&dirrem->dm_list);
	1986	}
	1987	WORKLIST_REMOVE(&pagedep->pd_list);
	1988	LIST_REMOVE(pagedep, pd_hash);
	1989	WORKITEM_FREE(pagedep, D_PAGEDEP);
	1990	continue;
	1991
	1992	case D_ALLOCINDIR:
	1993	free_allocindir(WK_ALLOCINDIR(wk), inodedep);
	1994	continue;
	1995
	1996	case D_ALLOCDIRECT:
	1997	case D_INODEDEP:
	1998	FREE_LOCK(&lk);
	1999	panic("deallocate_dependencies: Unexpected type %s",
	2000	TYPENAME(wk->wk_type));
	2001	/* NOTREACHED */
	2002
	2003	default:
	2004	FREE_LOCK(&lk);
	2005	panic("deallocate_dependencies: Unknown type %s",
	2006	TYPENAME(wk->wk_type));
	2007	/* NOTREACHED */
	2008	}
	2009	}
	2010	}
	2011
	2012	/*
	2013	* Free an allocdirect. Generate a new freefrag work request if appropriate.
	2014	* This routine must be called with splbio interrupts blocked.
	2015	*/
	2016	static void
	2017	free_allocdirect(adphead, adp, delay)
	2018	struct allocdirectlst *adphead;
	2019	struct allocdirect *adp;
	2020	int delay;
	2021	{
	2022
	2023	#ifdef DEBUG
	2024	if (lk.lkt_held == NOHOLDER)
	2025	panic("free_allocdirect: lock not held");
	2026	#endif
	2027	if ((adp->ad_state & DEPCOMPLETE) == 0)
	2028	LIST_REMOVE(adp, ad_deps);
	2029	TAILQ_REMOVE(adphead, adp, ad_next);
	2030	if ((adp->ad_state & COMPLETE) == 0)
	2031	WORKLIST_REMOVE(&adp->ad_list);
	2032	if (adp->ad_freefrag != NULL) {
	2033	if (delay)
	2034	WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
	2035	&adp->ad_freefrag->ff_list);
	2036	else
	2037	add_to_worklist(&adp->ad_freefrag->ff_list);
	2038	}
	2039	WORKITEM_FREE(adp, D_ALLOCDIRECT);
	2040	}
	2041
	2042	/*
	2043	* Prepare an inode to be freed. The actual free operation is not
	2044	* done until the zero'ed inode has been written to disk.
	2045	*/
	2046	void
	2047	softdep_freefile(pvp, ino, mode)
	2048	struct vnode *pvp;
	2049	ino_t ino;
	2050	int mode;
	2051	{
	2052	struct inode *ip = VTOI(pvp);
	2053	struct inodedep *inodedep;
	2054	struct freefile *freefile;
	2055
	2056	/*
	2057	* This sets up the inode de-allocation dependency.
	2058	*/
	2059	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
	2060	M_FREEFILE, M_SOFTDEP_FLAGS);
	2061	freefile->fx_list.wk_type = D_FREEFILE;
	2062	freefile->fx_list.wk_state = 0;
	2063	freefile->fx_mode = mode;
	2064	freefile->fx_oldinum = ino;
	2065	freefile->fx_devvp = ip->i_devvp;
	2066	freefile->fx_fs = ip->i_fs;
	2067
	2068	/*
	2069	* If the inodedep does not exist, then the zero'ed inode has
	2070	* been written to disk. If the allocated inode has never been
	2071	* written to disk, then the on-disk inode is zero'ed. In either
	2072	* case we can free the file immediately.
	2073	*/
	2074	ACQUIRE_LOCK(&lk);
	2075	if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 \|\|
	2076	check_inode_unwritten(inodedep)) {
	2077	FREE_LOCK(&lk);
	2078	handle_workitem_freefile(freefile);
	2079	return;
	2080	}
	2081	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
	2082	FREE_LOCK(&lk);
	2083	}
	2084
	2085	/*
	2086	* Check to see if an inode has never been written to disk. If
	2087	* so free the inodedep and return success, otherwise return failure.
	2088	* This routine must be called with splbio interrupts blocked.
	2089	*
	2090	* If we still have a bitmap dependency, then the inode has never
	2091	* been written to disk. Drop the dependency as it is no longer
	2092	* necessary since the inode is being deallocated. We set the
	2093	* ALLCOMPLETE flags since the bitmap now properly shows that the
	2094	* inode is not allocated. Even if the inode is actively being
	2095	* written, it has been rolled back to its zero'ed state, so we
	2096	* are ensured that a zero inode is what is on the disk. For short
	2097	* lived files, this change will usually result in removing all the
	2098	* dependencies from the inode so that it can be freed immediately.
	2099	*/
	2100	static int
	2101	check_inode_unwritten(inodedep)
	2102	struct inodedep *inodedep;
	2103	{
	2104
	2105	if ((inodedep->id_state & DEPCOMPLETE) != 0 \|\|
	2106	LIST_FIRST(&inodedep->id_pendinghd) != NULL \|\|
	2107	LIST_FIRST(&inodedep->id_bufwait) != NULL \|\|
	2108	LIST_FIRST(&inodedep->id_inowait) != NULL \|\|
	2109	TAILQ_FIRST(&inodedep->id_inoupdt) != NULL \|\|
	2110	TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL \|\|
	2111	inodedep->id_nlinkdelta != 0)
	2112	return (0);
	2113
	2114	/*
	2115	* Another process might be in initiate_write_inodeblock
	2116	* trying to allocate memory without holding "Softdep Lock".
	2117	*/
	2118	if ((inodedep->id_state & IOSTARTED) != 0 &&
	2119	inodedep->id_savedino == NULL)
	2120	return(0);
	2121
	2122	inodedep->id_state \|= ALLCOMPLETE;
	2123	LIST_REMOVE(inodedep, id_deps);
	2124	inodedep->id_buf = NULL;
	2125	if (inodedep->id_state & ONWORKLIST)
	2126	WORKLIST_REMOVE(&inodedep->id_list);
	2127	if (inodedep->id_savedino != NULL) {
	2128	FREE(inodedep->id_savedino, M_INODEDEP);
	2129	inodedep->id_savedino = NULL;
	2130	}
	2131	if (free_inodedep(inodedep) == 0) {
	2132	FREE_LOCK(&lk);
	2133	panic("check_inode_unwritten: busy inode");
	2134	}
	2135	return (1);
	2136	}
	2137
	2138	/*
	2139	* Try to free an inodedep structure. Return 1 if it could be freed.
	2140	*/
	2141	static int
	2142	free_inodedep(inodedep)
	2143	struct inodedep *inodedep;
	2144	{
	2145
	2146	if ((inodedep->id_state & ONWORKLIST) != 0 \|\|
	2147	(inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE \|\|
	2148	LIST_FIRST(&inodedep->id_pendinghd) != NULL \|\|
	2149	LIST_FIRST(&inodedep->id_bufwait) != NULL \|\|
	2150	LIST_FIRST(&inodedep->id_inowait) != NULL \|\|
	2151	TAILQ_FIRST(&inodedep->id_inoupdt) != NULL \|\|
	2152	TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL \|\|
	2153	inodedep->id_nlinkdelta != 0 \|\| inodedep->id_savedino != NULL)
	2154	return (0);
	2155	LIST_REMOVE(inodedep, id_hash);
	2156	WORKITEM_FREE(inodedep, D_INODEDEP);
	2157	num_inodedep -= 1;
	2158	return (1);
	2159	}
	2160
	2161	/*
	2162	* This workitem routine performs the block de-allocation.
	2163	* The workitem is added to the pending list after the updated
	2164	* inode block has been written to disk. As mentioned above,
	2165	* checks regarding the number of blocks de-allocated (compared
	2166	* to the number of blocks allocated for the file) are also
	2167	* performed in this function.
	2168	*/
	2169	static void
	2170	handle_workitem_freeblocks(freeblks)
	2171	struct freeblks *freeblks;
	2172	{
	2173	struct inode tip;
	2174	ufs_daddr_t bn;
	2175	struct fs *fs;
	2176	int i, level, bsize;
	2177	long nblocks, blocksreleased = 0;
	2178	int error, allerror = 0;
	2179	ufs_lbn_t baselbns[NIADDR], tmpval;
	2180
	2181	tip.i_number = freeblks->fb_previousinum;
	2182	tip.i_devvp = freeblks->fb_devvp;
	2183	tip.i_dev = freeblks->fb_devvp->v_rdev;
	2184	tip.i_fs = freeblks->fb_fs;
	2185	tip.i_size = freeblks->fb_oldsize;
	2186	tip.i_uid = freeblks->fb_uid;
	2187	fs = freeblks->fb_fs;
	2188	tmpval = 1;
	2189	baselbns[0] = NDADDR;
	2190	for (i = 1; i < NIADDR; i++) {
	2191	tmpval *= NINDIR(fs);
	2192	baselbns[i] = baselbns[i - 1] + tmpval;
	2193	}
	2194	nblocks = btodb(fs->fs_bsize);
	2195	blocksreleased = 0;
	2196	/*
	2197	* Indirect blocks first.
	2198	*/
	2199	for (level = (NIADDR - 1); level >= 0; level--) {
	2200	if ((bn = freeblks->fb_iblks[level]) == 0)
	2201	continue;
	2202	if ((error = indir_trunc(&tip, fsbtodoff(fs, bn), level,
	2203	baselbns[level], &blocksreleased)) == 0)
	2204	allerror = error;
	2205	ffs_blkfree(&tip, bn, fs->fs_bsize);
	2206	blocksreleased += nblocks;
	2207	}
	2208	/*
	2209	* All direct blocks or frags.
	2210	*/
	2211	for (i = (NDADDR - 1); i >= 0; i--) {
	2212	if ((bn = freeblks->fb_dblks[i]) == 0)
	2213	continue;
	2214	bsize = blksize(fs, &tip, i);
	2215	ffs_blkfree(&tip, bn, bsize);
	2216	blocksreleased += btodb(bsize);
	2217	}
	2218
	2219	#ifdef DIAGNOSTIC
	2220	if (freeblks->fb_chkcnt != blocksreleased)
	2221	printf("handle_workitem_freeblocks: block count\n");
	2222	if (allerror)
	2223	softdep_error("handle_workitem_freeblks", allerror);
	2224	#endif /* DIAGNOSTIC */
	2225	WORKITEM_FREE(freeblks, D_FREEBLKS);
	2226	}
	2227
	2228	/*
	2229	* Release blocks associated with the inode ip and stored in the indirect
	2230	* block at doffset. If level is greater than SINGLE, the block is an
	2231	* indirect block and recursive calls to indirtrunc must be used to
	2232	* cleanse other indirect blocks.
	2233	*/
	2234	static int
	2235	indir_trunc(ip, doffset, level, lbn, countp)
	2236	struct inode *ip;
	2237	off_t doffset;
	2238	int level;
	2239	ufs_lbn_t lbn;
	2240	long *countp;
	2241	{
	2242	struct buf *bp;
	2243	ufs_daddr_t *bap;
	2244	ufs_daddr_t nb;
	2245	struct fs *fs;
	2246	struct worklist *wk;
	2247	struct indirdep *indirdep;
	2248	int i, lbnadd, nblocks;
	2249	int error, allerror = 0;
	2250
	2251	fs = ip->i_fs;
	2252	lbnadd = 1;
	2253	for (i = level; i > 0; i--)
	2254	lbnadd *= NINDIR(fs);
	2255	/*
	2256	* Get buffer of block pointers to be freed. This routine is not
	2257	* called until the zero'ed inode has been written, so it is safe
	2258	* to free blocks as they are encountered. Because the inode has
	2259	* been zero'ed, calls to bmap on these blocks will fail. So, we
	2260	* have to use the on-disk address and the block device for the
	2261	* filesystem to look them up. If the file was deleted before its
	2262	* indirect blocks were all written to disk, the routine that set
	2263	* us up (deallocate_dependencies) will have arranged to leave
	2264	* a complete copy of the indirect block in memory for our use.
	2265	* Otherwise we have to read the blocks in from the disk.
	2266	*/
	2267	ACQUIRE_LOCK(&lk);
	2268	if ((bp = findblk(ip->i_devvp, doffset)) != NULL &&
	2269	(wk = LIST_FIRST(&bp->b_dep)) != NULL) {
	2270	/*
	2271	* bp must be ir_savebp, which is held locked for our use.
	2272	*/
	2273	if (wk->wk_type != D_INDIRDEP \|\|
	2274	(indirdep = WK_INDIRDEP(wk))->ir_savebp != bp \|\|
	2275	(indirdep->ir_state & GOINGAWAY) == 0) {
	2276	FREE_LOCK(&lk);
	2277	panic("indir_trunc: lost indirdep");
	2278	}
	2279	WORKLIST_REMOVE(wk);
	2280	WORKITEM_FREE(indirdep, D_INDIRDEP);
	2281	if (LIST_FIRST(&bp->b_dep) != NULL) {
	2282	FREE_LOCK(&lk);
	2283	panic("indir_trunc: dangling dep");
	2284	}
	2285	FREE_LOCK(&lk);
	2286	} else {
	2287	FREE_LOCK(&lk);
	2288	error = bread(ip->i_devvp, doffset, (int)fs->fs_bsize, &bp);
	2289	if (error)
	2290	return (error);
	2291	}
	2292	/*
	2293	* Recursively free indirect blocks.
	2294	*/
	2295	bap = (ufs_daddr_t *)bp->b_data;
	2296	nblocks = btodb(fs->fs_bsize);
	2297	for (i = NINDIR(fs) - 1; i >= 0; i--) {
	2298	if ((nb = bap[i]) == 0)
	2299	continue;
	2300	if (level != 0) {
	2301	if ((error = indir_trunc(ip, fsbtodoff(fs, nb),
	2302	level - 1, lbn + (i * lbnadd), countp)) != 0)
	2303	allerror = error;
	2304	}
	2305	ffs_blkfree(ip, nb, fs->fs_bsize);
	2306	*countp += nblocks;
	2307	}
	2308	bp->b_flags \|= B_INVAL \| B_NOCACHE;
	2309	brelse(bp);
	2310	return (allerror);
	2311	}
	2312
	2313	/*
	2314	* Free an allocindir.
	2315	* This routine must be called with splbio interrupts blocked.
	2316	*/
	2317	static void
	2318	free_allocindir(aip, inodedep)
	2319	struct allocindir *aip;
	2320	struct inodedep *inodedep;
	2321	{
	2322	struct freefrag *freefrag;
	2323
	2324	#ifdef DEBUG
	2325	if (lk.lkt_held == NOHOLDER)
	2326	panic("free_allocindir: lock not held");
	2327	#endif
	2328	if ((aip->ai_state & DEPCOMPLETE) == 0)
	2329	LIST_REMOVE(aip, ai_deps);
	2330	if (aip->ai_state & ONWORKLIST)
	2331	WORKLIST_REMOVE(&aip->ai_list);
	2332	LIST_REMOVE(aip, ai_next);
	2333	if ((freefrag = aip->ai_freefrag) != NULL) {
	2334	if (inodedep == NULL)
	2335	add_to_worklist(&freefrag->ff_list);
	2336	else
	2337	WORKLIST_INSERT(&inodedep->id_bufwait,
	2338	&freefrag->ff_list);
	2339	}
	2340	WORKITEM_FREE(aip, D_ALLOCINDIR);
	2341	}
	2342
	2343	/*
	2344	* Directory entry addition dependencies.
	2345	*
	2346	* When adding a new directory entry, the inode (with its incremented link
	2347	* count) must be written to disk before the directory entry's pointer to it.
	2348	* Also, if the inode is newly allocated, the corresponding freemap must be
	2349	* updated (on disk) before the directory entry's pointer. These requirements
	2350	* are met via undo/redo on the directory entry's pointer, which consists
	2351	* simply of the inode number.
	2352	*
	2353	* As directory entries are added and deleted, the free space within a
	2354	* directory block can become fragmented. The ufs filesystem will compact
	2355	* a fragmented directory block to make space for a new entry. When this
	2356	* occurs, the offsets of previously added entries change. Any "diradd"
	2357	* dependency structures corresponding to these entries must be updated with
	2358	* the new offsets.
	2359	*/
	2360
	2361	/*
	2362	* This routine is called after the in-memory inode's link
	2363	* count has been incremented, but before the directory entry's
	2364	* pointer to the inode has been set.
	2365	*/
	2366	void
	2367	softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
	2368	struct buf bp; / buffer containing directory block */
	2369	struct inode dp; / inode for directory */
	2370	off_t diroffset; /* offset of new entry in directory */
	2371	ino_t newinum; /* inode referenced by new directory entry */
	2372	struct buf newdirbp; / non-NULL => contents of new mkdir */
	2373	{
	2374	int offset; /* offset of new entry within directory block */
	2375	ufs_lbn_t lbn; /* block in directory containing new entry */
	2376	struct fs *fs;
	2377	struct diradd *dap;
	2378	struct pagedep *pagedep;
	2379	struct inodedep *inodedep;
	2380	struct mkdir mkdir1, mkdir2;
	2381
	2382	/*
	2383	* Whiteouts have no dependencies.
	2384	*/
	2385	if (newinum == WINO) {
	2386	if (newdirbp != NULL)
	2387	bdwrite(newdirbp);
	2388	return;
	2389	}
	2390
	2391	fs = dp->i_fs;
	2392	lbn = lblkno(fs, diroffset);
	2393	offset = blkoff(fs, diroffset);
	2394	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
	2395	M_SOFTDEP_FLAGS);
	2396	bzero(dap, sizeof(struct diradd));
	2397	dap->da_list.wk_type = D_DIRADD;
	2398	dap->da_offset = offset;
	2399	dap->da_newinum = newinum;
	2400	dap->da_state = ATTACHED;
	2401	if (newdirbp == NULL) {
	2402	dap->da_state \|= DEPCOMPLETE;
	2403	ACQUIRE_LOCK(&lk);
	2404	} else {
	2405	dap->da_state \|= MKDIR_BODY \| MKDIR_PARENT;
	2406	MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
	2407	M_SOFTDEP_FLAGS);
	2408	mkdir1->md_list.wk_type = D_MKDIR;
	2409	mkdir1->md_state = MKDIR_BODY;
	2410	mkdir1->md_diradd = dap;
	2411	MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
	2412	M_SOFTDEP_FLAGS);
	2413	mkdir2->md_list.wk_type = D_MKDIR;
	2414	mkdir2->md_state = MKDIR_PARENT;
	2415	mkdir2->md_diradd = dap;
	2416	/*
	2417	* Dependency on "." and ".." being written to disk.
	2418	*/
	2419	mkdir1->md_buf = newdirbp;
	2420	ACQUIRE_LOCK(&lk);
	2421	LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
	2422	WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
	2423	FREE_LOCK(&lk);
	2424	bdwrite(newdirbp);
	2425	/*
	2426	* Dependency on link count increase for parent directory
	2427	*/
	2428	ACQUIRE_LOCK(&lk);
	2429	if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
	2430	\|\| (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
	2431	dap->da_state &= ~MKDIR_PARENT;
	2432	WORKITEM_FREE(mkdir2, D_MKDIR);
	2433	} else {
	2434	LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
	2435	WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
	2436	}
	2437	}
	2438	/*
	2439	* Link into parent directory pagedep to await its being written.
	2440	*/
	2441	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
	2442	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	2443	dap->da_pagedep = pagedep;
	2444	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
	2445	da_pdlist);
	2446	/*
	2447	* Link into its inodedep. Put it on the id_bufwait list if the inode
	2448	* is not yet written. If it is written, do the post-inode write
	2449	* processing to put it on the id_pendinghd list.
	2450	*/
	2451	(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
	2452	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
	2453	diradd_inode_written(dap, inodedep);
	2454	else
	2455	WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	2456	FREE_LOCK(&lk);
	2457	}
	2458
	2459	/*
	2460	* This procedure is called to change the offset of a directory
	2461	* entry when compacting a directory block which must be owned
	2462	* exclusively by the caller. Note that the actual entry movement
	2463	* must be done in this procedure to ensure that no I/O completions
	2464	* occur while the move is in progress.
	2465	*/
	2466	void
	2467	softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
	2468	struct inode dp; / inode for directory */
	2469	caddr_t base; /* address of dp->i_offset */
	2470	caddr_t oldloc; /* address of old directory location */
	2471	caddr_t newloc; /* address of new directory location */
	2472	int entrysize; /* size of directory entry */
	2473	{
	2474	int offset, oldoffset, newoffset;
	2475	struct pagedep *pagedep;
	2476	struct diradd *dap;
	2477	ufs_lbn_t lbn;
	2478
	2479	ACQUIRE_LOCK(&lk);
	2480	lbn = lblkno(dp->i_fs, dp->i_offset);
	2481	offset = blkoff(dp->i_fs, dp->i_offset);
	2482	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
	2483	goto done;
	2484	oldoffset = offset + (oldloc - base);
	2485	newoffset = offset + (newloc - base);
	2486
	2487	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
	2488	if (dap->da_offset != oldoffset)
	2489	continue;
	2490	dap->da_offset = newoffset;
	2491	if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
	2492	break;
	2493	LIST_REMOVE(dap, da_pdlist);
	2494	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
	2495	dap, da_pdlist);
	2496	break;
	2497	}
	2498	if (dap == NULL) {
	2499
	2500	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
	2501	if (dap->da_offset == oldoffset) {
	2502	dap->da_offset = newoffset;
	2503	break;
	2504	}
	2505	}
	2506	}
	2507	done:
	2508	bcopy(oldloc, newloc, entrysize);
	2509	FREE_LOCK(&lk);
	2510	}
	2511
	2512	/*
	2513	* Free a diradd dependency structure. This routine must be called
	2514	* with splbio interrupts blocked.
	2515	*/
	2516	static void
	2517	free_diradd(dap)
	2518	struct diradd *dap;
	2519	{
	2520	struct dirrem *dirrem;
	2521	struct pagedep *pagedep;
	2522	struct inodedep *inodedep;
	2523	struct mkdir mkdir, nextmd;
	2524
	2525	#ifdef DEBUG
	2526	if (lk.lkt_held == NOHOLDER)
	2527	panic("free_diradd: lock not held");
	2528	#endif
	2529	WORKLIST_REMOVE(&dap->da_list);
	2530	LIST_REMOVE(dap, da_pdlist);
	2531	if ((dap->da_state & DIRCHG) == 0) {
	2532	pagedep = dap->da_pagedep;
	2533	} else {
	2534	dirrem = dap->da_previous;
	2535	pagedep = dirrem->dm_pagedep;
	2536	dirrem->dm_dirinum = pagedep->pd_ino;
	2537	add_to_worklist(&dirrem->dm_list);
	2538	}
	2539	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
	2540	0, &inodedep) != 0)
	2541	(void) free_inodedep(inodedep);
	2542	if ((dap->da_state & (MKDIR_PARENT \| MKDIR_BODY)) != 0) {
	2543	for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
	2544	nextmd = LIST_NEXT(mkdir, md_mkdirs);
	2545	if (mkdir->md_diradd != dap)
	2546	continue;
	2547	dap->da_state &= ~mkdir->md_state;
	2548	WORKLIST_REMOVE(&mkdir->md_list);
	2549	LIST_REMOVE(mkdir, md_mkdirs);
	2550	WORKITEM_FREE(mkdir, D_MKDIR);
	2551	}
	2552	if ((dap->da_state & (MKDIR_PARENT \| MKDIR_BODY)) != 0) {
	2553	FREE_LOCK(&lk);
	2554	panic("free_diradd: unfound ref");
	2555	}
	2556	}
	2557	WORKITEM_FREE(dap, D_DIRADD);
	2558	}
	2559
	2560	/*
	2561	* Directory entry removal dependencies.
	2562	*
	2563	* When removing a directory entry, the entry's inode pointer must be
	2564	* zero'ed on disk before the corresponding inode's link count is decremented
	2565	* (possibly freeing the inode for re-use). This dependency is handled by
	2566	* updating the directory entry but delaying the inode count reduction until
	2567	* after the directory block has been written to disk. After this point, the
	2568	* inode count can be decremented whenever it is convenient.
	2569	*/
	2570
	2571	/*
	2572	* This routine should be called immediately after removing
	2573	* a directory entry. The inode's link count should not be
	2574	* decremented by the calling procedure -- the soft updates
	2575	* code will do this task when it is safe.
	2576	*/
	2577	void
	2578	softdep_setup_remove(bp, dp, ip, isrmdir)
	2579	struct buf bp; / buffer containing directory block */
	2580	struct inode dp; / inode for the directory being modified */
	2581	struct inode ip; / inode for directory entry being removed */
	2582	int isrmdir; /* indicates if doing RMDIR */
	2583	{
	2584	struct dirrem dirrem, prevdirrem;
	2585
	2586	/*
	2587	* Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
	2588	*/
	2589	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
	2590
	2591	/*
	2592	* If the COMPLETE flag is clear, then there were no active
	2593	* entries and we want to roll back to a zeroed entry until
	2594	* the new inode is committed to disk. If the COMPLETE flag is
	2595	* set then we have deleted an entry that never made it to
	2596	* disk. If the entry we deleted resulted from a name change,
	2597	* then the old name still resides on disk. We cannot delete
	2598	* its inode (returned to us in prevdirrem) until the zeroed
	2599	* directory entry gets to disk. The new inode has never been
	2600	* referenced on the disk, so can be deleted immediately.
	2601	*/
	2602	if ((dirrem->dm_state & COMPLETE) == 0) {
	2603	LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
	2604	dm_next);
	2605	FREE_LOCK(&lk);
	2606	} else {
	2607	if (prevdirrem != NULL)
	2608	LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
	2609	prevdirrem, dm_next);
	2610	dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
	2611	FREE_LOCK(&lk);
	2612	handle_workitem_remove(dirrem);
	2613	}
	2614	}
	2615
	2616	/*
	2617	* Allocate a new dirrem if appropriate and return it along with
	2618	* its associated pagedep. Called without a lock, returns with lock.
	2619	*/
	2620	static long num_dirrem; /* number of dirrem allocated */
	2621	static struct dirrem *
	2622	newdirrem(bp, dp, ip, isrmdir, prevdirremp)
	2623	struct buf bp; / buffer containing directory block */
	2624	struct inode dp; / inode for the directory being modified */
	2625	struct inode ip; / inode for directory entry being removed */
	2626	int isrmdir; /* indicates if doing RMDIR */
	2627	struct dirrem *prevdirremp; / previously referenced inode, if any */
	2628	{
	2629	int offset;
	2630	ufs_lbn_t lbn;
	2631	struct diradd *dap;
	2632	struct dirrem *dirrem;
	2633	struct pagedep *pagedep;
	2634
	2635	/*
	2636	* Whiteouts have no deletion dependencies.
	2637	*/
	2638	if (ip == NULL)
	2639	panic("newdirrem: whiteout");
	2640	/*
	2641	* If we are over our limit, try to improve the situation.
	2642	* Limiting the number of dirrem structures will also limit
	2643	* the number of freefile and freeblks structures.
	2644	*/
	2645	if (num_dirrem > max_softdeps / 2 && speedup_syncer() == 0)
	2646	(void) request_cleanup(FLUSH_REMOVE, 0);
	2647	num_dirrem += 1;
	2648	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
	2649	M_DIRREM, M_SOFTDEP_FLAGS);
	2650	bzero(dirrem, sizeof(struct dirrem));
	2651	dirrem->dm_list.wk_type = D_DIRREM;
	2652	dirrem->dm_state = isrmdir ? RMDIR : 0;
	2653	dirrem->dm_mnt = ITOV(ip)->v_mount;
	2654	dirrem->dm_oldinum = ip->i_number;
	2655	*prevdirremp = NULL;
	2656
	2657	ACQUIRE_LOCK(&lk);
	2658	lbn = lblkno(dp->i_fs, dp->i_offset);
	2659	offset = blkoff(dp->i_fs, dp->i_offset);
	2660	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
	2661	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	2662	dirrem->dm_pagedep = pagedep;
	2663	/*
	2664	* Check for a diradd dependency for the same directory entry.
	2665	* If present, then both dependencies become obsolete and can
	2666	* be de-allocated. Check for an entry on both the pd_dirraddhd
	2667	* list and the pd_pendinghd list.
	2668	*/
	2669
	2670	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
	2671	if (dap->da_offset == offset)
	2672	break;
	2673	if (dap == NULL) {
	2674
	2675	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
	2676	if (dap->da_offset == offset)
	2677	break;
	2678	if (dap == NULL)
	2679	return (dirrem);
	2680	}
	2681	/*
	2682	* Must be ATTACHED at this point.
	2683	*/
	2684	if ((dap->da_state & ATTACHED) == 0) {
	2685	FREE_LOCK(&lk);
	2686	panic("newdirrem: not ATTACHED");
	2687	}
	2688	if (dap->da_newinum != ip->i_number) {
	2689	FREE_LOCK(&lk);
	2690	panic("newdirrem: inum %"PRId64" should be %"PRId64,
	2691	ip->i_number, dap->da_newinum);
	2692	}
	2693	/*
	2694	* If we are deleting a changed name that never made it to disk,
	2695	* then return the dirrem describing the previous inode (which
	2696	* represents the inode currently referenced from this entry on disk).
	2697	*/
	2698	if ((dap->da_state & DIRCHG) != 0) {
	2699	*prevdirremp = dap->da_previous;
	2700	dap->da_state &= ~DIRCHG;
	2701	dap->da_pagedep = pagedep;
	2702	}
	2703	/*
	2704	* We are deleting an entry that never made it to disk.
	2705	* Mark it COMPLETE so we can delete its inode immediately.
	2706	*/
	2707	dirrem->dm_state \|= COMPLETE;
	2708	free_diradd(dap);
	2709	return (dirrem);
	2710	}
	2711
	2712	/*
	2713	* Directory entry change dependencies.
	2714	*
	2715	* Changing an existing directory entry requires that an add operation
	2716	* be completed first followed by a deletion. The semantics for the addition
	2717	* are identical to the description of adding a new entry above except
	2718	* that the rollback is to the old inode number rather than zero. Once
	2719	* the addition dependency is completed, the removal is done as described
	2720	* in the removal routine above.
	2721	*/
	2722
	2723	/*
	2724	* This routine should be called immediately after changing
	2725	* a directory entry. The inode's link count should not be
	2726	* decremented by the calling procedure -- the soft updates
	2727	* code will perform this task when it is safe.
	2728	*/
	2729	void
	2730	softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
	2731	struct buf bp; / buffer containing directory block */
	2732	struct inode dp; / inode for the directory being modified */
	2733	struct inode ip; / inode for directory entry being removed */
	2734	ino_t newinum; /* new inode number for changed entry */
	2735	int isrmdir; /* indicates if doing RMDIR */
	2736	{
	2737	int offset;
	2738	struct diradd *dap = NULL;
	2739	struct dirrem dirrem, prevdirrem;
	2740	struct pagedep *pagedep;
	2741	struct inodedep *inodedep;
	2742
	2743	offset = blkoff(dp->i_fs, dp->i_offset);
	2744
	2745	/*
	2746	* Whiteouts do not need diradd dependencies.
	2747	*/
	2748	if (newinum != WINO) {
	2749	MALLOC(dap, struct diradd *, sizeof(struct diradd),
	2750	M_DIRADD, M_SOFTDEP_FLAGS);
	2751	bzero(dap, sizeof(struct diradd));
	2752	dap->da_list.wk_type = D_DIRADD;
	2753	dap->da_state = DIRCHG \| ATTACHED \| DEPCOMPLETE;
	2754	dap->da_offset = offset;
	2755	dap->da_newinum = newinum;
	2756	}
	2757
	2758	/*
	2759	* Allocate a new dirrem and ACQUIRE_LOCK.
	2760	*/
	2761	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
	2762	pagedep = dirrem->dm_pagedep;
	2763	/*
	2764	* The possible values for isrmdir:
	2765	* 0 - non-directory file rename
	2766	* 1 - directory rename within same directory
	2767	* inum - directory rename to new directory of given inode number
	2768	* When renaming to a new directory, we are both deleting and
	2769	* creating a new directory entry, so the link count on the new
	2770	* directory should not change. Thus we do not need the followup
	2771	* dirrem which is usually done in handle_workitem_remove. We set
	2772	* the DIRCHG flag to tell handle_workitem_remove to skip the
	2773	* followup dirrem.
	2774	*/
	2775	if (isrmdir > 1)
	2776	dirrem->dm_state \|= DIRCHG;
	2777
	2778	/*
	2779	* Whiteouts have no additional dependencies,
	2780	* so just put the dirrem on the correct list.
	2781	*/
	2782	if (newinum == WINO) {
	2783	if ((dirrem->dm_state & COMPLETE) == 0) {
	2784	LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
	2785	dm_next);
	2786	} else {
	2787	dirrem->dm_dirinum = pagedep->pd_ino;
	2788	add_to_worklist(&dirrem->dm_list);
	2789	}
	2790	FREE_LOCK(&lk);
	2791	return;
	2792	}
	2793
	2794	/*
	2795	* If the COMPLETE flag is clear, then there were no active
	2796	* entries and we want to roll back to the previous inode until
	2797	* the new inode is committed to disk. If the COMPLETE flag is
	2798	* set, then we have deleted an entry that never made it to disk.
	2799	* If the entry we deleted resulted from a name change, then the old
	2800	* inode reference still resides on disk. Any rollback that we do
	2801	* needs to be to that old inode (returned to us in prevdirrem). If
	2802	* the entry we deleted resulted from a create, then there is
	2803	* no entry on the disk, so we want to roll back to zero rather
	2804	* than the uncommitted inode. In either of the COMPLETE cases we
	2805	* want to immediately free the unwritten and unreferenced inode.
	2806	*/
	2807	if ((dirrem->dm_state & COMPLETE) == 0) {
	2808	dap->da_previous = dirrem;
	2809	} else {
	2810	if (prevdirrem != NULL) {
	2811	dap->da_previous = prevdirrem;
	2812	} else {
	2813	dap->da_state &= ~DIRCHG;
	2814	dap->da_pagedep = pagedep;
	2815	}
	2816	dirrem->dm_dirinum = pagedep->pd_ino;
	2817	add_to_worklist(&dirrem->dm_list);
	2818	}
	2819	/*
	2820	* Link into its inodedep. Put it on the id_bufwait list if the inode
	2821	* is not yet written. If it is written, do the post-inode write
	2822	* processing to put it on the id_pendinghd list.
	2823	*/
	2824	if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 \|\|
	2825	(inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
	2826	dap->da_state \|= COMPLETE;
	2827	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	2828	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
	2829	} else {
	2830	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
	2831	dap, da_pdlist);
	2832	WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	2833	}
	2834	FREE_LOCK(&lk);
	2835	}
	2836
	2837	/*
	2838	* Called whenever the link count on an inode is changed.
	2839	* It creates an inode dependency so that the new reference(s)
	2840	* to the inode cannot be committed to disk until the updated
	2841	* inode has been written.
	2842	*/
	2843	void
	2844	softdep_change_linkcnt(ip)
	2845	struct inode ip; / the inode with the increased link count */
	2846	{
	2847	struct inodedep *inodedep;
	2848
	2849	ACQUIRE_LOCK(&lk);
	2850	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
	2851	if (ip->i_nlink < ip->i_effnlink) {
	2852	FREE_LOCK(&lk);
	2853	panic("softdep_change_linkcnt: bad delta");
	2854	}
	2855	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	2856	FREE_LOCK(&lk);
	2857	}
	2858
	2859	/*
	2860	* This workitem decrements the inode's link count.
	2861	* If the link count reaches zero, the file is removed.
	2862	*/
	2863	static void
	2864	handle_workitem_remove(dirrem)
	2865	struct dirrem *dirrem;
	2866	{
	2867	struct inodedep *inodedep;
	2868	struct vnode *vp;
	2869	struct inode *ip;
	2870	ino_t oldinum;
	2871	int error;
	2872
	2873	if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
	2874	softdep_error("handle_workitem_remove: vget", error);
	2875	return;
	2876	}
	2877	ip = VTOI(vp);
	2878	ACQUIRE_LOCK(&lk);
	2879	if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0){
	2880	FREE_LOCK(&lk);
	2881	panic("handle_workitem_remove: lost inodedep");
	2882	}
	2883	/*
	2884	* Normal file deletion.
	2885	*/
	2886	if ((dirrem->dm_state & RMDIR) == 0) {
	2887	ip->i_nlink--;
	2888	ip->i_flag \|= IN_CHANGE;
	2889	if (ip->i_nlink < ip->i_effnlink) {
	2890	FREE_LOCK(&lk);
	2891	panic("handle_workitem_remove: bad file delta");
	2892	}
	2893	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	2894	FREE_LOCK(&lk);
	2895	vput(vp);
	2896	num_dirrem -= 1;
	2897	WORKITEM_FREE(dirrem, D_DIRREM);
	2898	return;
	2899	}
	2900	/*
	2901	* Directory deletion. Decrement reference count for both the
	2902	* just deleted parent directory entry and the reference for ".".
	2903	* Next truncate the directory to length zero. When the
	2904	* truncation completes, arrange to have the reference count on
	2905	* the parent decremented to account for the loss of "..".
	2906	*/
	2907	ip->i_nlink -= 2;
	2908	ip->i_flag \|= IN_CHANGE;
	2909	if (ip->i_nlink < ip->i_effnlink) {
	2910	FREE_LOCK(&lk);
	2911	panic("handle_workitem_remove: bad dir delta");
	2912	}
	2913	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	2914	FREE_LOCK(&lk);
	2915	if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, proc0.p_ucred)) != 0)
	2916	softdep_error("handle_workitem_remove: truncate", error);
	2917	/*
	2918	* Rename a directory to a new parent. Since, we are both deleting
	2919	* and creating a new directory entry, the link count on the new
	2920	* directory should not change. Thus we skip the followup dirrem.
	2921	*/
	2922	if (dirrem->dm_state & DIRCHG) {
	2923	vput(vp);
	2924	num_dirrem -= 1;
	2925	WORKITEM_FREE(dirrem, D_DIRREM);
	2926	return;
	2927	}
	2928	/*
	2929	* If the inodedep does not exist, then the zero'ed inode has
	2930	* been written to disk. If the allocated inode has never been
	2931	* written to disk, then the on-disk inode is zero'ed. In either
	2932	* case we can remove the file immediately.
	2933	*/
	2934	ACQUIRE_LOCK(&lk);
	2935	dirrem->dm_state = 0;
	2936	oldinum = dirrem->dm_oldinum;
	2937	dirrem->dm_oldinum = dirrem->dm_dirinum;
	2938	if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 \|\|
	2939	check_inode_unwritten(inodedep)) {
	2940	FREE_LOCK(&lk);
	2941	vput(vp);
	2942	handle_workitem_remove(dirrem);
	2943	return;
	2944	}
	2945	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
	2946	FREE_LOCK(&lk);
	2947	ip->i_flag \|= IN_CHANGE;
	2948	ffs_update(vp, 0);
	2949	vput(vp);
	2950	}
	2951
	2952	/*
	2953	* Inode de-allocation dependencies.
	2954	*
	2955	* When an inode's link count is reduced to zero, it can be de-allocated. We
	2956	* found it convenient to postpone de-allocation until after the inode is
	2957	* written to disk with its new link count (zero). At this point, all of the
	2958	* on-disk inode's block pointers are nullified and, with careful dependency
	2959	* list ordering, all dependencies related to the inode will be satisfied and
	2960	* the corresponding dependency structures de-allocated. So, if/when the
	2961	* inode is reused, there will be no mixing of old dependencies with new
	2962	* ones. This artificial dependency is set up by the block de-allocation
	2963	* procedure above (softdep_setup_freeblocks) and completed by the
	2964	* following procedure.
	2965	*/
	2966	static void
	2967	handle_workitem_freefile(freefile)
	2968	struct freefile *freefile;
	2969	{
	2970	struct vnode vp;
	2971	struct inode tip;
	2972	struct inodedep *idp;
	2973	int error;
	2974
	2975	#ifdef DEBUG
	2976	ACQUIRE_LOCK(&lk);
	2977	error = inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp);
	2978	FREE_LOCK(&lk);
	2979	if (error)
	2980	panic("handle_workitem_freefile: inodedep survived");
	2981	#endif
	2982	tip.i_devvp = freefile->fx_devvp;
	2983	tip.i_dev = freefile->fx_devvp->v_rdev;
	2984	tip.i_fs = freefile->fx_fs;
	2985	vp.v_data = &tip;
	2986	if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
	2987	softdep_error("handle_workitem_freefile", error);
	2988	WORKITEM_FREE(freefile, D_FREEFILE);
	2989	}
	2990
	2991	/*
	2992	* Helper function which unlinks marker element from work list and returns
	2993	* the next element on the list.
	2994	*/
	2995	static __inline struct worklist *
	2996	markernext(struct worklist *marker)
	2997	{
	2998	struct worklist *next;
	2999
	3000	next = LIST_NEXT(marker, wk_list);
	3001	LIST_REMOVE(marker, wk_list);
	3002	return next;
	3003	}
	3004
	3005	/*
	3006	* Disk writes.
	3007	*
	3008	* The dependency structures constructed above are most actively used when file
	3009	* system blocks are written to disk. No constraints are placed on when a
	3010	* block can be written, but unsatisfied update dependencies are made safe by
	3011	* modifying (or replacing) the source memory for the duration of the disk
	3012	* write. When the disk write completes, the memory block is again brought
	3013	* up-to-date.
	3014	*
	3015	* In-core inode structure reclamation.
	3016	*
	3017	* Because there are a finite number of "in-core" inode structures, they are
	3018	* reused regularly. By transferring all inode-related dependencies to the
	3019	* in-memory inode block and indexing them separately (via "inodedep"s), we
	3020	* can allow "in-core" inode structures to be reused at any time and avoid
	3021	* any increase in contention.
	3022	*
	3023	* Called just before entering the device driver to initiate a new disk I/O.
	3024	* The buffer must be locked, thus, no I/O completion operations can occur
	3025	* while we are manipulating its associated dependencies.
	3026	*/
	3027	static void
	3028	softdep_disk_io_initiation(bp)
	3029	struct buf bp; / structure describing disk write to occur */
	3030	{
	3031	struct worklist *wk;
	3032	struct worklist marker;
	3033	struct indirdep *indirdep;
	3034
	3035	/*
	3036	* We only care about write operations. There should never
	3037	* be dependencies for reads.
	3038	*/
	3039	if (bp->b_cmd == BUF_CMD_READ)
	3040	panic("softdep_disk_io_initiation: read");
	3041
	3042	marker.wk_type = D_LAST + 1; /* Not a normal workitem */
	3043
	3044	/*
	3045	* Do any necessary pre-I/O processing.
	3046	*/
	3047	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = markernext(&marker)) {
	3048	LIST_INSERT_AFTER(wk, &marker, wk_list);
	3049
	3050	switch (wk->wk_type) {
	3051
	3052	case D_PAGEDEP:
	3053	initiate_write_filepage(WK_PAGEDEP(wk), bp);
	3054	continue;
	3055
	3056	case D_INODEDEP:
	3057	initiate_write_inodeblock(WK_INODEDEP(wk), bp);
	3058	continue;
	3059
	3060	case D_INDIRDEP:
	3061	indirdep = WK_INDIRDEP(wk);
	3062	if (indirdep->ir_state & GOINGAWAY)
	3063	panic("disk_io_initiation: indirdep gone");
	3064	/*
	3065	* If there are no remaining dependencies, this
	3066	* will be writing the real pointers, so the
	3067	* dependency can be freed.
	3068	*/
	3069	if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
	3070	indirdep->ir_savebp->b_flags \|= B_INVAL \| B_NOCACHE;
	3071	brelse(indirdep->ir_savebp);
	3072	/* inline expand WORKLIST_REMOVE(wk); */
	3073	wk->wk_state &= ~ONWORKLIST;
	3074	LIST_REMOVE(wk, wk_list);
	3075	WORKITEM_FREE(indirdep, D_INDIRDEP);
	3076	continue;
	3077	}
	3078	/*
	3079	* Replace up-to-date version with safe version.
	3080	*/
	3081	MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
	3082	M_INDIRDEP, M_SOFTDEP_FLAGS);
	3083	ACQUIRE_LOCK(&lk);
	3084	indirdep->ir_state &= ~ATTACHED;
	3085	indirdep->ir_state \|= UNDONE;
	3086	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
	3087	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
	3088	bp->b_bcount);
	3089	FREE_LOCK(&lk);
	3090	continue;
	3091
	3092	case D_MKDIR:
	3093	case D_BMSAFEMAP:
	3094	case D_ALLOCDIRECT:
	3095	case D_ALLOCINDIR:
	3096	continue;
	3097
	3098	default:
	3099	panic("handle_disk_io_initiation: Unexpected type %s",
	3100	TYPENAME(wk->wk_type));
	3101	/* NOTREACHED */
	3102	}
	3103	}
	3104	}
	3105
	3106	/*
	3107	* Called from within the procedure above to deal with unsatisfied
	3108	* allocation dependencies in a directory. The buffer must be locked,
	3109	* thus, no I/O completion operations can occur while we are
	3110	* manipulating its associated dependencies.
	3111	*/
	3112	static void
	3113	initiate_write_filepage(pagedep, bp)
	3114	struct pagedep *pagedep;
	3115	struct buf *bp;
	3116	{
	3117	struct diradd *dap;
	3118	struct direct *ep;
	3119	int i;
	3120
	3121	if (pagedep->pd_state & IOSTARTED) {
	3122	/*
	3123	* This can only happen if there is a driver that does not
	3124	* understand chaining. Here biodone will reissue the call
	3125	* to strategy for the incomplete buffers.
	3126	*/
	3127	printf("initiate_write_filepage: already started\n");
	3128	return;
	3129	}
	3130	pagedep->pd_state \|= IOSTARTED;
	3131	ACQUIRE_LOCK(&lk);
	3132	for (i = 0; i < DAHASHSZ; i++) {
	3133	LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
	3134	ep = (struct direct *)
	3135	((char *)bp->b_data + dap->da_offset);
	3136	if (ep->d_ino != dap->da_newinum) {
	3137	FREE_LOCK(&lk);
	3138	panic("%s: dir inum %d != new %"PRId64,
	3139	"initiate_write_filepage",
	3140	ep->d_ino, dap->da_newinum);
	3141	}
	3142	if (dap->da_state & DIRCHG)
	3143	ep->d_ino = dap->da_previous->dm_oldinum;
	3144	else
	3145	ep->d_ino = 0;
	3146	dap->da_state &= ~ATTACHED;
	3147	dap->da_state \|= UNDONE;
	3148	}
	3149	}
	3150	FREE_LOCK(&lk);
	3151	}
	3152
	3153	/*
	3154	* Called from within the procedure above to deal with unsatisfied
	3155	* allocation dependencies in an inodeblock. The buffer must be
	3156	* locked, thus, no I/O completion operations can occur while we
	3157	* are manipulating its associated dependencies.
	3158	*/
	3159	static void
	3160	initiate_write_inodeblock(inodedep, bp)
	3161	struct inodedep *inodedep;
	3162	struct buf bp; / The inode block */
	3163	{
	3164	struct allocdirect adp, lastadp;
	3165	struct ufs1_dinode *dp;
	3166	struct ufs1_dinode *sip;
	3167	struct fs *fs;
	3168	ufs_lbn_t prevlbn = 0;
	3169	int i, deplist;
	3170
	3171	if (inodedep->id_state & IOSTARTED)
	3172	panic("initiate_write_inodeblock: already started");
	3173	inodedep->id_state \|= IOSTARTED;
	3174	fs = inodedep->id_fs;
	3175	dp = (struct ufs1_dinode *)bp->b_data +
	3176	ino_to_fsbo(fs, inodedep->id_ino);
	3177	/*
	3178	* If the bitmap is not yet written, then the allocated
	3179	* inode cannot be written to disk.
	3180	*/
	3181	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
	3182	if (inodedep->id_savedino != NULL)
	3183	panic("initiate_write_inodeblock: already doing I/O");
	3184	MALLOC(sip, struct ufs1_dinode *,
	3185	sizeof(struct ufs1_dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
	3186	inodedep->id_savedino = sip;
	3187	inodedep->id_savedino = dp;
	3188	bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
	3189	dp->di_gen = inodedep->id_savedino->di_gen;
	3190	return;
	3191	}
	3192	/*
	3193	* If no dependencies, then there is nothing to roll back.
	3194	*/
	3195	inodedep->id_savedsize = dp->di_size;
	3196	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
	3197	return;
	3198	/*
	3199	* Set the dependencies to busy.
	3200	*/
	3201	ACQUIRE_LOCK(&lk);
	3202	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	3203	adp = TAILQ_NEXT(adp, ad_next)) {
	3204	#ifdef DIAGNOSTIC
	3205	if (deplist != 0 && prevlbn >= adp->ad_lbn) {
	3206	FREE_LOCK(&lk);
	3207	panic("softdep_write_inodeblock: lbn order");
	3208	}
	3209	prevlbn = adp->ad_lbn;
	3210	if (adp->ad_lbn < NDADDR &&
	3211	dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
	3212	FREE_LOCK(&lk);
	3213	panic("%s: direct pointer #%ld mismatch %d != %d",
	3214	"softdep_write_inodeblock", adp->ad_lbn,
	3215	dp->di_db[adp->ad_lbn], adp->ad_newblkno);
	3216	}
	3217	if (adp->ad_lbn >= NDADDR &&
	3218	dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
	3219	FREE_LOCK(&lk);
	3220	panic("%s: indirect pointer #%ld mismatch %d != %d",
	3221	"softdep_write_inodeblock", adp->ad_lbn - NDADDR,
	3222	dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
	3223	}
	3224	deplist \|= 1 << adp->ad_lbn;
	3225	if ((adp->ad_state & ATTACHED) == 0) {
	3226	FREE_LOCK(&lk);
	3227	panic("softdep_write_inodeblock: Unknown state 0x%x",
	3228	adp->ad_state);
	3229	}
	3230	#endif /* DIAGNOSTIC */
	3231	adp->ad_state &= ~ATTACHED;
	3232	adp->ad_state \|= UNDONE;
	3233	}
	3234	/*
	3235	* The on-disk inode cannot claim to be any larger than the last
	3236	* fragment that has been written. Otherwise, the on-disk inode
	3237	* might have fragments that were not the last block in the file
	3238	* which would corrupt the filesystem.
	3239	*/
	3240	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	3241	lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
	3242	if (adp->ad_lbn >= NDADDR)
	3243	break;
	3244	dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
	3245	/* keep going until hitting a rollback to a frag */
	3246	if (adp->ad_oldsize == 0 \|\| adp->ad_oldsize == fs->fs_bsize)
	3247	continue;
	3248	dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
	3249	for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
	3250	#ifdef DIAGNOSTIC
	3251	if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
	3252	FREE_LOCK(&lk);
	3253	panic("softdep_write_inodeblock: lost dep1");
	3254	}
	3255	#endif /* DIAGNOSTIC */
	3256	dp->di_db[i] = 0;
	3257	}
	3258	for (i = 0; i < NIADDR; i++) {
	3259	#ifdef DIAGNOSTIC
	3260	if (dp->di_ib[i] != 0 &&
	3261	(deplist & ((1 << NDADDR) << i)) == 0) {
	3262	FREE_LOCK(&lk);
	3263	panic("softdep_write_inodeblock: lost dep2");
	3264	}
	3265	#endif /* DIAGNOSTIC */
	3266	dp->di_ib[i] = 0;
	3267	}
	3268	FREE_LOCK(&lk);
	3269	return;
	3270	}
	3271	/*
	3272	* If we have zero'ed out the last allocated block of the file,
	3273	* roll back the size to the last currently allocated block.
	3274	* We know that this last allocated block is a full-sized as
	3275	* we already checked for fragments in the loop above.
	3276	*/
	3277	if (lastadp != NULL &&
	3278	dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
	3279	for (i = lastadp->ad_lbn; i >= 0; i--)
	3280	if (dp->di_db[i] != 0)
	3281	break;
	3282	dp->di_size = (i + 1) * fs->fs_bsize;
	3283	}
	3284	/*
	3285	* The only dependencies are for indirect blocks.
	3286	*
	3287	* The file size for indirect block additions is not guaranteed.
	3288	* Such a guarantee would be non-trivial to achieve. The conventional
	3289	* synchronous write implementation also does not make this guarantee.
	3290	* Fsck should catch and fix discrepancies. Arguably, the file size
	3291	* can be over-estimated without destroying integrity when the file
	3292	* moves into the indirect blocks (i.e., is large). If we want to
	3293	* postpone fsck, we are stuck with this argument.
	3294	*/
	3295	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
	3296	dp->di_ib[adp->ad_lbn - NDADDR] = 0;
	3297	FREE_LOCK(&lk);
	3298	}
	3299
	3300	/*
	3301	* This routine is called during the completion interrupt
	3302	* service routine for a disk write (from the procedure called
	3303	* by the device driver to inform the filesystem caches of
	3304	* a request completion). It should be called early in this
	3305	* procedure, before the block is made available to other
	3306	* processes or other routines are called.
	3307	*/
	3308	static void
	3309	softdep_disk_write_complete(bp)
	3310	struct buf bp; / describes the completed disk write */
	3311	{
	3312	struct worklist *wk;
	3313	struct workhead reattach;
	3314	struct newblk *newblk;
	3315	struct allocindir *aip;
	3316	struct allocdirect *adp;
	3317	struct indirdep *indirdep;
	3318	struct inodedep *inodedep;
	3319	struct bmsafemap *bmsafemap;
	3320
	3321	#ifdef DEBUG
	3322	if (lk.lkt_held != NOHOLDER)
	3323	panic("softdep_disk_write_complete: lock is held");
	3324	lk.lkt_held = SPECIAL_FLAG;
	3325	#endif
	3326	LIST_INIT(&reattach);
	3327	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
	3328	WORKLIST_REMOVE(wk);
	3329	switch (wk->wk_type) {
	3330
	3331	case D_PAGEDEP:
	3332	if (handle_written_filepage(WK_PAGEDEP(wk), bp))
	3333	WORKLIST_INSERT(&reattach, wk);
	3334	continue;
	3335
	3336	case D_INODEDEP:
	3337	if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
	3338	WORKLIST_INSERT(&reattach, wk);
	3339	continue;
	3340
	3341	case D_BMSAFEMAP:
	3342	bmsafemap = WK_BMSAFEMAP(wk);
	3343	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
	3344	newblk->nb_state \|= DEPCOMPLETE;
	3345	newblk->nb_bmsafemap = NULL;
	3346	LIST_REMOVE(newblk, nb_deps);
	3347	}
	3348	while ((adp =
	3349	LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
	3350	adp->ad_state \|= DEPCOMPLETE;
	3351	adp->ad_buf = NULL;
	3352	LIST_REMOVE(adp, ad_deps);
	3353	handle_allocdirect_partdone(adp);
	3354	}
	3355	while ((aip =
	3356	LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
	3357	aip->ai_state \|= DEPCOMPLETE;
	3358	aip->ai_buf = NULL;
	3359	LIST_REMOVE(aip, ai_deps);
	3360	handle_allocindir_partdone(aip);
	3361	}
	3362	while ((inodedep =
	3363	LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
	3364	inodedep->id_state \|= DEPCOMPLETE;
	3365	LIST_REMOVE(inodedep, id_deps);
	3366	inodedep->id_buf = NULL;
	3367	}
	3368	WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
	3369	continue;
	3370
	3371	case D_MKDIR:
	3372	handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
	3373	continue;
	3374
	3375	case D_ALLOCDIRECT:
	3376	adp = WK_ALLOCDIRECT(wk);
	3377	adp->ad_state \|= COMPLETE;
	3378	handle_allocdirect_partdone(adp);
	3379	continue;
	3380
	3381	case D_ALLOCINDIR:
	3382	aip = WK_ALLOCINDIR(wk);
	3383	aip->ai_state \|= COMPLETE;
	3384	handle_allocindir_partdone(aip);
	3385	continue;
	3386
	3387	case D_INDIRDEP:
	3388	indirdep = WK_INDIRDEP(wk);
	3389	if (indirdep->ir_state & GOINGAWAY) {
	3390	lk.lkt_held = NOHOLDER;
	3391	panic("disk_write_complete: indirdep gone");
	3392	}
	3393	bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
	3394	FREE(indirdep->ir_saveddata, M_INDIRDEP);
	3395	indirdep->ir_saveddata = 0;
	3396	indirdep->ir_state &= ~UNDONE;
	3397	indirdep->ir_state \|= ATTACHED;
	3398	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
	3399	handle_allocindir_partdone(aip);
	3400	if (aip == LIST_FIRST(&indirdep->ir_donehd)) {
	3401	lk.lkt_held = NOHOLDER;
	3402	panic("disk_write_complete: not gone");
	3403	}
	3404	}
	3405	WORKLIST_INSERT(&reattach, wk);
	3406	if ((bp->b_flags & B_DELWRI) == 0)
	3407	stat_indir_blk_ptrs++;
	3408	bdirty(bp);
	3409	continue;
	3410
	3411	default:
	3412	lk.lkt_held = NOHOLDER;
	3413	panic("handle_disk_write_complete: Unknown type %s",
	3414	TYPENAME(wk->wk_type));
	3415	/* NOTREACHED */
	3416	}
	3417	}
	3418	/*
	3419	* Reattach any requests that must be redone.
	3420	*/
	3421	while ((wk = LIST_FIRST(&reattach)) != NULL) {
	3422	WORKLIST_REMOVE(wk);
	3423	WORKLIST_INSERT(&bp->b_dep, wk);
	3424	}
	3425	#ifdef DEBUG
	3426	if (lk.lkt_held != SPECIAL_FLAG)
	3427	panic("softdep_disk_write_complete: lock lost");
	3428	lk.lkt_held = NOHOLDER;
	3429	#endif
	3430	}
	3431
	3432	/*
	3433	* Called from within softdep_disk_write_complete above. Note that
	3434	* this routine is always called from interrupt level with further
	3435	* splbio interrupts blocked.
	3436	*/
	3437	static void
	3438	handle_allocdirect_partdone(adp)
	3439	struct allocdirect adp; / the completed allocdirect */
	3440	{
	3441	struct allocdirect *listadp;
	3442	struct inodedep *inodedep;
	3443	long bsize;
	3444
	3445	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
	3446	return;
	3447	if (adp->ad_buf != NULL) {
	3448	lk.lkt_held = NOHOLDER;
	3449	panic("handle_allocdirect_partdone: dangling dep");
	3450	}
	3451	/*
	3452	* The on-disk inode cannot claim to be any larger than the last
	3453	* fragment that has been written. Otherwise, the on-disk inode
	3454	* might have fragments that were not the last block in the file
	3455	* which would corrupt the filesystem. Thus, we cannot free any
	3456	* allocdirects after one whose ad_oldblkno claims a fragment as
	3457	* these blocks must be rolled back to zero before writing the inode.
	3458	* We check the currently active set of allocdirects in id_inoupdt.
	3459	*/
	3460	inodedep = adp->ad_inodedep;
	3461	bsize = inodedep->id_fs->fs_bsize;
	3462	TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) {
	3463	/* found our block */
	3464	if (listadp == adp)
	3465	break;
	3466	/* continue if ad_oldlbn is not a fragment */
	3467	if (listadp->ad_oldsize == 0 \|\|
	3468	listadp->ad_oldsize == bsize)
	3469	continue;
	3470	/* hit a fragment */
	3471	return;
	3472	}
	3473	/*
	3474	* If we have reached the end of the current list without
	3475	* finding the just finished dependency, then it must be
	3476	* on the future dependency list. Future dependencies cannot
	3477	* be freed until they are moved to the current list.
	3478	*/
	3479	if (listadp == NULL) {
	3480	#ifdef DEBUG
	3481	TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next)
	3482	/* found our block */
	3483	if (listadp == adp)
	3484	break;
	3485	if (listadp == NULL) {
	3486	lk.lkt_held = NOHOLDER;
	3487	panic("handle_allocdirect_partdone: lost dep");
	3488	}
	3489	#endif /* DEBUG */
	3490	return;
	3491	}
	3492	/*
	3493	* If we have found the just finished dependency, then free
	3494	* it along with anything that follows it that is complete.
	3495	*/
	3496	for (; adp; adp = listadp) {
	3497	listadp = TAILQ_NEXT(adp, ad_next);
	3498	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
	3499	return;
	3500	free_allocdirect(&inodedep->id_inoupdt, adp, 1);
	3501	}
	3502	}
	3503
	3504	/*
	3505	* Called from within softdep_disk_write_complete above. Note that
	3506	* this routine is always called from interrupt level with further
	3507	* splbio interrupts blocked.
	3508	*/
	3509	static void
	3510	handle_allocindir_partdone(aip)
	3511	struct allocindir aip; / the completed allocindir */
	3512	{
	3513	struct indirdep *indirdep;
	3514
	3515	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
	3516	return;
	3517	if (aip->ai_buf != NULL) {
	3518	lk.lkt_held = NOHOLDER;
	3519	panic("handle_allocindir_partdone: dangling dependency");
	3520	}
	3521	indirdep = aip->ai_indirdep;
	3522	if (indirdep->ir_state & UNDONE) {
	3523	LIST_REMOVE(aip, ai_next);
	3524	LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
	3525	return;
	3526	}
	3527	((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
	3528	aip->ai_newblkno;
	3529	LIST_REMOVE(aip, ai_next);
	3530	if (aip->ai_freefrag != NULL)
	3531	add_to_worklist(&aip->ai_freefrag->ff_list);
	3532	WORKITEM_FREE(aip, D_ALLOCINDIR);
	3533	}
	3534
	3535	/*
	3536	* Called from within softdep_disk_write_complete above to restore
	3537	* in-memory inode block contents to their most up-to-date state. Note
	3538	* that this routine is always called from interrupt level with further
	3539	* splbio interrupts blocked.
	3540	*/
	3541	static int
	3542	handle_written_inodeblock(inodedep, bp)
	3543	struct inodedep *inodedep;
	3544	struct buf bp; / buffer containing the inode block */
	3545	{
	3546	struct worklist wk, filefree;
	3547	struct allocdirect adp, nextadp;
	3548	struct ufs1_dinode *dp;
	3549	int hadchanges;
	3550
	3551	if ((inodedep->id_state & IOSTARTED) == 0) {
	3552	lk.lkt_held = NOHOLDER;
	3553	panic("handle_written_inodeblock: not started");
	3554	}
	3555	inodedep->id_state &= ~IOSTARTED;
	3556	dp = (struct ufs1_dinode *)bp->b_data +
	3557	ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
	3558	/*
	3559	* If we had to rollback the inode allocation because of
	3560	* bitmaps being incomplete, then simply restore it.
	3561	* Keep the block dirty so that it will not be reclaimed until
	3562	* all associated dependencies have been cleared and the
	3563	* corresponding updates written to disk.
	3564	*/
	3565	if (inodedep->id_savedino != NULL) {
	3566	dp = inodedep->id_savedino;
	3567	FREE(inodedep->id_savedino, M_INODEDEP);
	3568	inodedep->id_savedino = NULL;
	3569	if ((bp->b_flags & B_DELWRI) == 0)
	3570	stat_inode_bitmap++;
	3571	bdirty(bp);
	3572	return (1);
	3573	}
	3574	inodedep->id_state \|= COMPLETE;
	3575	/*
	3576	* Roll forward anything that had to be rolled back before
	3577	* the inode could be updated.
	3578	*/
	3579	hadchanges = 0;
	3580	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
	3581	nextadp = TAILQ_NEXT(adp, ad_next);
	3582	if (adp->ad_state & ATTACHED) {
	3583	lk.lkt_held = NOHOLDER;
	3584	panic("handle_written_inodeblock: new entry");
	3585	}
	3586	if (adp->ad_lbn < NDADDR) {
	3587	if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno) {
	3588	lk.lkt_held = NOHOLDER;
	3589	panic("%s: %s #%ld mismatch %d != %d",
	3590	"handle_written_inodeblock",
	3591	"direct pointer", adp->ad_lbn,
	3592	dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
	3593	}
	3594	dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
	3595	} else {
	3596	if (dp->di_ib[adp->ad_lbn - NDADDR] != 0) {
	3597	lk.lkt_held = NOHOLDER;
	3598	panic("%s: %s #%ld allocated as %d",
	3599	"handle_written_inodeblock",
	3600	"indirect pointer", adp->ad_lbn - NDADDR,
	3601	dp->di_ib[adp->ad_lbn - NDADDR]);
	3602	}
	3603	dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
	3604	}
	3605	adp->ad_state &= ~UNDONE;
	3606	adp->ad_state \|= ATTACHED;
	3607	hadchanges = 1;
	3608	}
	3609	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
	3610	stat_direct_blk_ptrs++;
	3611	/*
	3612	* Reset the file size to its most up-to-date value.
	3613	*/
	3614	if (inodedep->id_savedsize == -1) {
	3615	lk.lkt_held = NOHOLDER;
	3616	panic("handle_written_inodeblock: bad size");
	3617	}
	3618	if (dp->di_size != inodedep->id_savedsize) {
	3619	dp->di_size = inodedep->id_savedsize;
	3620	hadchanges = 1;
	3621	}
	3622	inodedep->id_savedsize = -1;
	3623	/*
	3624	* If there were any rollbacks in the inode block, then it must be
	3625	* marked dirty so that it will eventually get written back in
	3626	* its correct form.
	3627	*/
	3628	if (hadchanges)
	3629	bdirty(bp);
	3630	/*
	3631	* Process any allocdirects that completed during the update.
	3632	*/
	3633	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
	3634	handle_allocdirect_partdone(adp);
	3635	/*
	3636	* Process deallocations that were held pending until the
	3637	* inode had been written to disk. Freeing of the inode
	3638	* is delayed until after all blocks have been freed to
	3639	* avoid creation of new <vfsid, inum, lbn> triples
	3640	* before the old ones have been deleted.
	3641	*/
	3642	filefree = NULL;
	3643	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
	3644	WORKLIST_REMOVE(wk);
	3645	switch (wk->wk_type) {
	3646
	3647	case D_FREEFILE:
	3648	/*
	3649	* We defer adding filefree to the worklist until
	3650	* all other additions have been made to ensure
	3651	* that it will be done after all the old blocks
	3652	* have been freed.
	3653	*/
	3654	if (filefree != NULL) {
	3655	lk.lkt_held = NOHOLDER;
	3656	panic("handle_written_inodeblock: filefree");
	3657	}
	3658	filefree = wk;
	3659	continue;
	3660
	3661	case D_MKDIR:
	3662	handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
	3663	continue;
	3664
	3665	case D_DIRADD:
	3666	diradd_inode_written(WK_DIRADD(wk), inodedep);
	3667	continue;
	3668
	3669	case D_FREEBLKS:
	3670	wk->wk_state \|= COMPLETE;
	3671	if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE)
	3672	continue;
	3673	/* -- fall through -- */
	3674	case D_FREEFRAG:
	3675	case D_DIRREM:
	3676	add_to_worklist(wk);
	3677	continue;
	3678
	3679	default:
	3680	lk.lkt_held = NOHOLDER;
	3681	panic("handle_written_inodeblock: Unknown type %s",
	3682	TYPENAME(wk->wk_type));
	3683	/* NOTREACHED */
	3684	}
	3685	}
	3686	if (filefree != NULL) {
	3687	if (free_inodedep(inodedep) == 0) {
	3688	lk.lkt_held = NOHOLDER;
	3689	panic("handle_written_inodeblock: live inodedep");
	3690	}
	3691	add_to_worklist(filefree);
	3692	return (0);
	3693	}
	3694
	3695	/*
	3696	* If no outstanding dependencies, free it.
	3697	*/
	3698	if (free_inodedep(inodedep) \|\| TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
	3699	return (0);
	3700	return (hadchanges);
	3701	}
	3702
	3703	/*
	3704	* Process a diradd entry after its dependent inode has been written.
	3705	* This routine must be called with splbio interrupts blocked.
	3706	*/
	3707	static void
	3708	diradd_inode_written(dap, inodedep)
	3709	struct diradd *dap;
	3710	struct inodedep *inodedep;
	3711	{
	3712	struct pagedep *pagedep;
	3713
	3714	dap->da_state \|= COMPLETE;
	3715	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
	3716	if (dap->da_state & DIRCHG)
	3717	pagedep = dap->da_previous->dm_pagedep;
	3718	else
	3719	pagedep = dap->da_pagedep;
	3720	LIST_REMOVE(dap, da_pdlist);
	3721	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	3722	}
	3723	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
	3724	}
	3725
	3726	/*
	3727	* Handle the completion of a mkdir dependency.
	3728	*/
	3729	static void
	3730	handle_written_mkdir(mkdir, type)
	3731	struct mkdir *mkdir;
	3732	int type;
	3733	{
	3734	struct diradd *dap;
	3735	struct pagedep *pagedep;
	3736
	3737	if (mkdir->md_state != type) {
	3738	lk.lkt_held = NOHOLDER;
	3739	panic("handle_written_mkdir: bad type");
	3740	}
	3741	dap = mkdir->md_diradd;
	3742	dap->da_state &= ~type;
	3743	if ((dap->da_state & (MKDIR_PARENT \| MKDIR_BODY)) == 0)
	3744	dap->da_state \|= DEPCOMPLETE;
	3745	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
	3746	if (dap->da_state & DIRCHG)
	3747	pagedep = dap->da_previous->dm_pagedep;
	3748	else
	3749	pagedep = dap->da_pagedep;
	3750	LIST_REMOVE(dap, da_pdlist);
	3751	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	3752	}
	3753	LIST_REMOVE(mkdir, md_mkdirs);
	3754	WORKITEM_FREE(mkdir, D_MKDIR);
	3755	}
	3756
	3757	/*
	3758	* Called from within softdep_disk_write_complete above.
	3759	* A write operation was just completed. Removed inodes can
	3760	* now be freed and associated block pointers may be committed.
	3761	* Note that this routine is always called from interrupt level
	3762	* with further splbio interrupts blocked.
	3763	*/
	3764	static int
	3765	handle_written_filepage(pagedep, bp)
	3766	struct pagedep *pagedep;
	3767	struct buf bp; / buffer containing the written page */
	3768	{
	3769	struct dirrem *dirrem;
	3770	struct diradd dap, nextdap;
	3771	struct direct *ep;
	3772	int i, chgs;
	3773
	3774	if ((pagedep->pd_state & IOSTARTED) == 0) {
	3775	lk.lkt_held = NOHOLDER;
	3776	panic("handle_written_filepage: not started");
	3777	}
	3778	pagedep->pd_state &= ~IOSTARTED;
	3779	/*
	3780	* Process any directory removals that have been committed.
	3781	*/
	3782	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
	3783	LIST_REMOVE(dirrem, dm_next);
	3784	dirrem->dm_dirinum = pagedep->pd_ino;
	3785	add_to_worklist(&dirrem->dm_list);
	3786	}
	3787	/*
	3788	* Free any directory additions that have been committed.
	3789	*/
	3790	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
	3791	free_diradd(dap);
	3792	/*
	3793	* Uncommitted directory entries must be restored.
	3794	*/
	3795	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
	3796	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
	3797	dap = nextdap) {
	3798	nextdap = LIST_NEXT(dap, da_pdlist);
	3799	if (dap->da_state & ATTACHED) {
	3800	lk.lkt_held = NOHOLDER;
	3801	panic("handle_written_filepage: attached");
	3802	}
	3803	ep = (struct direct *)
	3804	((char *)bp->b_data + dap->da_offset);
	3805	ep->d_ino = dap->da_newinum;
	3806	dap->da_state &= ~UNDONE;
	3807	dap->da_state \|= ATTACHED;
	3808	chgs = 1;
	3809	/*
	3810	* If the inode referenced by the directory has
	3811	* been written out, then the dependency can be
	3812	* moved to the pending list.
	3813	*/
	3814	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
	3815	LIST_REMOVE(dap, da_pdlist);
	3816	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
	3817	da_pdlist);
	3818	}
	3819	}
	3820	}
	3821	/*
	3822	* If there were any rollbacks in the directory, then it must be
	3823	* marked dirty so that its will eventually get written back in
	3824	* its correct form.
	3825	*/
	3826	if (chgs) {
	3827	if ((bp->b_flags & B_DELWRI) == 0)
	3828	stat_dir_entry++;
	3829	bdirty(bp);
	3830	}
	3831	/*
	3832	* If no dependencies remain, the pagedep will be freed.
	3833	* Otherwise it will remain to update the page before it
	3834	* is written back to disk.
	3835	*/
	3836	if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
	3837	for (i = 0; i < DAHASHSZ; i++)
	3838	if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
	3839	break;
	3840	if (i == DAHASHSZ) {
	3841	LIST_REMOVE(pagedep, pd_hash);
	3842	WORKITEM_FREE(pagedep, D_PAGEDEP);
	3843	return (0);
	3844	}
	3845	}
	3846	return (1);
	3847	}
	3848
	3849	/*
	3850	* Writing back in-core inode structures.
	3851	*
	3852	* The filesystem only accesses an inode's contents when it occupies an
	3853	* "in-core" inode structure. These "in-core" structures are separate from
	3854	* the page frames used to cache inode blocks. Only the latter are
	3855	* transferred to/from the disk. So, when the updated contents of the
	3856	* "in-core" inode structure are copied to the corresponding in-memory inode
	3857	* block, the dependencies are also transferred. The following procedure is
	3858	* called when copying a dirty "in-core" inode to a cached inode block.
	3859	*/
	3860
	3861	/*
	3862	* Called when an inode is loaded from disk. If the effective link count
	3863	* differed from the actual link count when it was last flushed, then we
	3864	* need to ensure that the correct effective link count is put back.
	3865	*/
	3866	void
	3867	softdep_load_inodeblock(ip)
	3868	struct inode ip; / the "in_core" copy of the inode */
	3869	{
	3870	struct inodedep *inodedep;
	3871
	3872	/*
	3873	* Check for alternate nlink count.
	3874	*/
	3875	ip->i_effnlink = ip->i_nlink;
	3876	ACQUIRE_LOCK(&lk);
	3877	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
	3878	FREE_LOCK(&lk);
	3879	return;
	3880	}
	3881	ip->i_effnlink -= inodedep->id_nlinkdelta;
	3882	FREE_LOCK(&lk);
	3883	}
	3884
	3885	/*
	3886	* This routine is called just before the "in-core" inode
	3887	* information is to be copied to the in-memory inode block.
	3888	* Recall that an inode block contains several inodes. If
	3889	* the force flag is set, then the dependencies will be
	3890	* cleared so that the update can always be made. Note that
	3891	* the buffer is locked when this routine is called, so we
	3892	* will never be in the middle of writing the inode block
	3893	* to disk.
	3894	*/
	3895	void
	3896	softdep_update_inodeblock(ip, bp, waitfor)
	3897	struct inode ip; / the "in_core" copy of the inode */
	3898	struct buf bp; / the buffer containing the inode block */
	3899	int waitfor; /* nonzero => update must be allowed */
	3900	{
	3901	struct inodedep *inodedep;
	3902	struct worklist *wk;
	3903	int error, gotit;
	3904
	3905	/*
	3906	* If the effective link count is not equal to the actual link
	3907	* count, then we must track the difference in an inodedep while
	3908	* the inode is (potentially) tossed out of the cache. Otherwise,
	3909	* if there is no existing inodedep, then there are no dependencies
	3910	* to track.
	3911	*/
	3912	ACQUIRE_LOCK(&lk);
	3913	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
	3914	FREE_LOCK(&lk);
	3915	if (ip->i_effnlink != ip->i_nlink)
	3916	panic("softdep_update_inodeblock: bad link count");
	3917	return;
	3918	}
	3919	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) {
	3920	FREE_LOCK(&lk);
	3921	panic("softdep_update_inodeblock: bad delta");
	3922	}
	3923	/*
	3924	* Changes have been initiated. Anything depending on these
	3925	* changes cannot occur until this inode has been written.
	3926	*/
	3927	inodedep->id_state &= ~COMPLETE;
	3928	if ((inodedep->id_state & ONWORKLIST) == 0)
	3929	WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
	3930	/*
	3931	* Any new dependencies associated with the incore inode must
	3932	* now be moved to the list associated with the buffer holding
	3933	* the in-memory copy of the inode. Once merged process any
	3934	* allocdirects that are completed by the merger.
	3935	*/
	3936	merge_inode_lists(inodedep);
	3937	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
	3938	handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
	3939	/*
	3940	* Now that the inode has been pushed into the buffer, the
	3941	* operations dependent on the inode being written to disk
	3942	* can be moved to the id_bufwait so that they will be
	3943	* processed when the buffer I/O completes.
	3944	*/
	3945	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
	3946	WORKLIST_REMOVE(wk);
	3947	WORKLIST_INSERT(&inodedep->id_bufwait, wk);
	3948	}
	3949	/*
	3950	* Newly allocated inodes cannot be written until the bitmap
	3951	* that allocates them have been written (indicated by
	3952	* DEPCOMPLETE being set in id_state). If we are doing a
	3953	* forced sync (e.g., an fsync on a file), we force the bitmap
	3954	* to be written so that the update can be done.
	3955	*/
	3956	if ((inodedep->id_state & DEPCOMPLETE) != 0 \|\| waitfor == 0) {
	3957	FREE_LOCK(&lk);
	3958	return;
	3959	}
	3960	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
	3961	FREE_LOCK(&lk);
	3962	if (gotit &&
	3963	(error = bwrite(inodedep->id_buf)) != 0)
	3964	softdep_error("softdep_update_inodeblock: bwrite", error);
	3965	if ((inodedep->id_state & DEPCOMPLETE) == 0)
	3966	panic("softdep_update_inodeblock: update failed");
	3967	}
	3968
	3969	/*
	3970	* Merge the new inode dependency list (id_newinoupdt) into the old
	3971	* inode dependency list (id_inoupdt). This routine must be called
	3972	* with splbio interrupts blocked.
	3973	*/
	3974	static void
	3975	merge_inode_lists(inodedep)
	3976	struct inodedep *inodedep;
	3977	{
	3978	struct allocdirect listadp, newadp;
	3979
	3980	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
	3981	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
	3982	if (listadp->ad_lbn < newadp->ad_lbn) {
	3983	listadp = TAILQ_NEXT(listadp, ad_next);
	3984	continue;
	3985	}
	3986	TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
	3987	TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
	3988	if (listadp->ad_lbn == newadp->ad_lbn) {
	3989	allocdirect_merge(&inodedep->id_inoupdt, newadp,
	3990	listadp);
	3991	listadp = newadp;
	3992	}
	3993	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
	3994	}
	3995	while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
	3996	TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
	3997	TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
	3998	}
	3999	}
	4000
	4001	/*
	4002	* If we are doing an fsync, then we must ensure that any directory
	4003	* entries for the inode have been written after the inode gets to disk.
	4004	*/
	4005	static int
	4006	softdep_fsync(vp)
	4007	struct vnode vp; / the "in_core" copy of the inode */
	4008	{
	4009	struct inodedep *inodedep;
	4010	struct pagedep *pagedep;
	4011	struct worklist *wk;
	4012	struct diradd *dap;
	4013	struct mount *mnt;
	4014	struct vnode *pvp;
	4015	struct inode *ip;
	4016	struct buf *bp;
	4017	struct fs *fs;
	4018	int error, flushparent;
	4019	ino_t parentino;
	4020	ufs_lbn_t lbn;
	4021
	4022	ip = VTOI(vp);
	4023	fs = ip->i_fs;
	4024	ACQUIRE_LOCK(&lk);
	4025	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
	4026	FREE_LOCK(&lk);
	4027	return (0);
	4028	}
	4029	if (LIST_FIRST(&inodedep->id_inowait) != NULL \|\|
	4030	LIST_FIRST(&inodedep->id_bufwait) != NULL \|\|
	4031	TAILQ_FIRST(&inodedep->id_inoupdt) != NULL \|\|
	4032	TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
	4033	FREE_LOCK(&lk);
	4034	panic("softdep_fsync: pending ops");
	4035	}
	4036	for (error = 0, flushparent = 0; ; ) {
	4037	if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
	4038	break;
	4039	if (wk->wk_type != D_DIRADD) {
	4040	FREE_LOCK(&lk);
	4041	panic("softdep_fsync: Unexpected type %s",
	4042	TYPENAME(wk->wk_type));
	4043	}
	4044	dap = WK_DIRADD(wk);
	4045	/*
	4046	* Flush our parent if this directory entry
	4047	* has a MKDIR_PARENT dependency.
	4048	*/
	4049	if (dap->da_state & DIRCHG)
	4050	pagedep = dap->da_previous->dm_pagedep;
	4051	else
	4052	pagedep = dap->da_pagedep;
	4053	mnt = pagedep->pd_mnt;
	4054	parentino = pagedep->pd_ino;
	4055	lbn = pagedep->pd_lbn;
	4056	if ((dap->da_state & (MKDIR_BODY \| COMPLETE)) != COMPLETE) {
	4057	FREE_LOCK(&lk);
	4058	panic("softdep_fsync: dirty");
	4059	}
	4060	flushparent = dap->da_state & MKDIR_PARENT;
	4061	/*
	4062	* If we are being fsync'ed as part of vgone'ing this vnode,
	4063	* then we will not be able to release and recover the
	4064	* vnode below, so we just have to give up on writing its
	4065	* directory entry out. It will eventually be written, just
	4066	* not now, but then the user was not asking to have it
	4067	* written, so we are not breaking any promises.
	4068	*/
	4069	if (vp->v_flag & VRECLAIMED)
	4070	break;
	4071	/*
	4072	* We prevent deadlock by always fetching inodes from the
	4073	* root, moving down the directory tree. Thus, when fetching
	4074	* our parent directory, we must unlock ourselves before
	4075	* requesting the lock on our parent. See the comment in
	4076	* ufs_lookup for details on possible races.
	4077	*/
	4078	FREE_LOCK(&lk);
	4079	VOP_UNLOCK(vp, 0);
	4080	error = VFS_VGET(mnt, parentino, &pvp);
	4081	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
	4082	if (error != 0)
	4083	return (error);
	4084	if (flushparent) {
	4085	if ((error = UFS_UPDATE(pvp, 1)) != 0) {
	4086	vput(pvp);
	4087	return (error);
	4088	}
	4089	}
	4090	/*
	4091	* Flush directory page containing the inode's name.
	4092	*/
	4093	error = bread(pvp, lblktodoff(fs, lbn), blksize(fs, VTOI(pvp), lbn), &bp);
	4094	if (error == 0)
	4095	error = bwrite(bp);
	4096	vput(pvp);
	4097	if (error != 0)
	4098	return (error);
	4099	ACQUIRE_LOCK(&lk);
	4100	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
	4101	break;
	4102	}
	4103	FREE_LOCK(&lk);
	4104	return (0);
	4105	}
	4106
	4107	/*
	4108	* Flush all the dirty bitmaps associated with the block device
	4109	* before flushing the rest of the dirty blocks so as to reduce
	4110	* the number of dependencies that will have to be rolled back.
	4111	*/
	4112	static int softdep_fsync_mountdev_bp(struct buf bp, void data);
	4113
	4114	void
	4115	softdep_fsync_mountdev(vp)
	4116	struct vnode *vp;
	4117	{
	4118	if (!vn_isdisk(vp, NULL))
	4119	panic("softdep_fsync_mountdev: vnode not a disk");
	4120	ACQUIRE_LOCK(&lk);
	4121	crit_enter();
	4122	RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
	4123	softdep_fsync_mountdev_bp, vp);
	4124	crit_exit();
	4125	drain_output(vp, 1);
	4126	FREE_LOCK(&lk);
	4127	}
	4128
	4129	static int
	4130	softdep_fsync_mountdev_bp(struct buf bp, void data)
	4131	{
	4132	struct worklist *wk;
	4133	struct vnode *vp = data;
	4134
	4135	/*
	4136	* If it is already scheduled, skip to the next buffer.
	4137	*/
	4138	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT))
	4139	return(0);
	4140	if (bp->b_vp != vp \|\| (bp->b_flags & B_DELWRI) == 0) {
	4141	BUF_UNLOCK(bp);
	4142	printf("softdep_fsync_mountdev_bp: warning, buffer %p ripped out from under vnode %p\n", bp, vp);
	4143	return(0);
	4144	}
	4145	/*
	4146	* We are only interested in bitmaps with outstanding
	4147	* dependencies.
	4148	*/
	4149	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL \|\|
	4150	wk->wk_type != D_BMSAFEMAP) {
	4151	BUF_UNLOCK(bp);
	4152	return(0);
	4153	}
	4154	bremfree(bp);
	4155	FREE_LOCK(&lk);
	4156	(void) bawrite(bp);
	4157	ACQUIRE_LOCK(&lk);
	4158	return(0);
	4159	}
	4160
	4161	/*
	4162	* This routine is called when we are trying to synchronously flush a
	4163	* file. This routine must eliminate any filesystem metadata dependencies
	4164	* so that the syncing routine can succeed by pushing the dirty blocks
	4165	* associated with the file. If any I/O errors occur, they are returned.
	4166	*/
	4167	struct softdep_sync_metadata_info {
	4168	struct vnode *vp;
	4169	int waitfor;
	4170	};
	4171
	4172	static int softdep_sync_metadata_bp(struct buf bp, void data);
	4173
	4174	int
	4175	softdep_sync_metadata(struct vnode vp, struct thread td)
	4176	{
	4177	struct softdep_sync_metadata_info info;
	4178	int error, waitfor;
	4179
	4180	/*
	4181	* Check whether this vnode is involved in a filesystem
	4182	* that is doing soft dependency processing.
	4183	*/
	4184	if (!vn_isdisk(vp, NULL)) {
	4185	if (!DOINGSOFTDEP(vp))
	4186	return (0);
	4187	} else
	4188	if (vp->v_rdev->si_mountpoint == NULL \|\|
	4189	(vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP) == 0)
	4190	return (0);
	4191	/*
	4192	* Ensure that any direct block dependencies have been cleared.
	4193	*/
	4194	ACQUIRE_LOCK(&lk);
	4195	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
	4196	FREE_LOCK(&lk);
	4197	return (error);
	4198	}
	4199	/*
	4200	* For most files, the only metadata dependencies are the
	4201	* cylinder group maps that allocate their inode or blocks.
	4202	* The block allocation dependencies can be found by traversing
	4203	* the dependency lists for any buffers that remain on their
	4204	* dirty buffer list. The inode allocation dependency will
	4205	* be resolved when the inode is updated with MNT_WAIT.
	4206	* This work is done in two passes. The first pass grabs most
	4207	* of the buffers and begins asynchronously writing them. The
	4208	* only way to wait for these asynchronous writes is to sleep
	4209	* on the filesystem vnode which may stay busy for a long time
	4210	* if the filesystem is active. So, instead, we make a second
	4211	* pass over the dependencies blocking on each write. In the
	4212	* usual case we will be blocking against a write that we
	4213	* initiated, so when it is done the dependency will have been
	4214	* resolved. Thus the second pass is expected to end quickly.
	4215	*/
	4216	waitfor = MNT_NOWAIT;
	4217	top:
	4218	/*
	4219	* We must wait for any I/O in progress to finish so that
	4220	* all potential buffers on the dirty list will be visible.
	4221	*/
	4222	drain_output(vp, 1);
	4223	info.vp = vp;
	4224	info.waitfor = waitfor;
	4225	crit_enter();
	4226	error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
	4227	softdep_sync_metadata_bp, &info);
	4228	crit_exit();
	4229	if (error < 0) {
	4230	FREE_LOCK(&lk);
	4231	return(-error); /* error code */
	4232	}
	4233
	4234	/*
	4235	* The brief unlock is to allow any pent up dependency
	4236	* processing to be done. Then proceed with the second pass.
	4237	*/
	4238	if (waitfor == MNT_NOWAIT) {
	4239	waitfor = MNT_WAIT;
	4240	FREE_LOCK(&lk);
	4241	ACQUIRE_LOCK(&lk);
	4242	goto top;
	4243	}
	4244
	4245	/*
	4246	* If we have managed to get rid of all the dirty buffers,
	4247	* then we are done. For certain directories and block
	4248	* devices, we may need to do further work.
	4249	*
	4250	* We must wait for any I/O in progress to finish so that
	4251	* all potential buffers on the dirty list will be visible.
	4252	*/
	4253	drain_output(vp, 1);
	4254	if (RB_EMPTY(&vp->v_rbdirty_tree)) {
	4255	FREE_LOCK(&lk);
	4256	return (0);
	4257	}
	4258
	4259	FREE_LOCK(&lk);
	4260	/*
	4261	* If we are trying to sync a block device, some of its buffers may
	4262	* contain metadata that cannot be written until the contents of some
	4263	* partially written files have been written to disk. The only easy
	4264	* way to accomplish this is to sync the entire filesystem (luckily
	4265	* this happens rarely).
	4266	*/
	4267	if (vn_isdisk(vp, NULL) &&
	4268	vp->v_rdev &&
	4269	vp->v_rdev->si_mountpoint && !VOP_ISLOCKED(vp, NULL) &&
	4270	(error = VFS_SYNC(vp->v_rdev->si_mountpoint, MNT_WAIT)) != 0)
	4271	return (error);
	4272	return (0);
	4273	}
	4274
	4275	static int
	4276	softdep_sync_metadata_bp(struct buf bp, void data)
	4277	{
	4278	struct softdep_sync_metadata_info *info = data;
	4279	struct pagedep *pagedep;
	4280	struct allocdirect *adp;
	4281	struct allocindir *aip;
	4282	struct worklist *wk;
	4283	struct buf *nbp;
	4284	int error;
	4285	int i;
	4286
	4287	if (getdirtybuf(&bp, MNT_WAIT) == 0) {
	4288	printf("softdep_sync_metadata_bp(1): caught buf %p going away\n", bp);
	4289	return (1);
	4290	}
	4291	if (bp->b_vp != info->vp \|\| (bp->b_flags & B_DELWRI) == 0) {
	4292	printf("softdep_sync_metadata_bp(2): caught buf %p going away vp %p\n", bp, info->vp);
	4293	BUF_UNLOCK(bp);
	4294	return(1);
	4295	}
	4296
	4297	/*
	4298	* As we hold the buffer locked, none of its dependencies
	4299	* will disappear.
	4300	*/
	4301	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
	4302	switch (wk->wk_type) {
	4303
	4304	case D_ALLOCDIRECT:
	4305	adp = WK_ALLOCDIRECT(wk);
	4306	if (adp->ad_state & DEPCOMPLETE)
	4307	break;
	4308	nbp = adp->ad_buf;
	4309	if (getdirtybuf(&nbp, info->waitfor) == 0)
	4310	break;
	4311	FREE_LOCK(&lk);
	4312	if (info->waitfor == MNT_NOWAIT) {
	4313	bawrite(nbp);
	4314	} else if ((error = bwrite(nbp)) != 0) {
	4315	bawrite(bp);
	4316	ACQUIRE_LOCK(&lk);
	4317	return (-error);
	4318	}
	4319	ACQUIRE_LOCK(&lk);
	4320	break;
	4321
	4322	case D_ALLOCINDIR:
	4323	aip = WK_ALLOCINDIR(wk);
	4324	if (aip->ai_state & DEPCOMPLETE)
	4325	break;
	4326	nbp = aip->ai_buf;
	4327	if (getdirtybuf(&nbp, info->waitfor) == 0)
	4328	break;
	4329	FREE_LOCK(&lk);
	4330	if (info->waitfor == MNT_NOWAIT) {
	4331	bawrite(nbp);
	4332	} else if ((error = bwrite(nbp)) != 0) {
	4333	bawrite(bp);
	4334	ACQUIRE_LOCK(&lk);
	4335	return (-error);
	4336	}
	4337	ACQUIRE_LOCK(&lk);
	4338	break;
	4339
	4340	case D_INDIRDEP:
	4341	restart:
	4342
	4343	LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
	4344	if (aip->ai_state & DEPCOMPLETE)
	4345	continue;
	4346	nbp = aip->ai_buf;
	4347	if (getdirtybuf(&nbp, MNT_WAIT) == 0)
	4348	goto restart;
	4349	FREE_LOCK(&lk);
	4350	if ((error = bwrite(nbp)) != 0) {
	4351	bawrite(bp);
	4352	ACQUIRE_LOCK(&lk);
	4353	return (-error);
	4354	}
	4355	ACQUIRE_LOCK(&lk);
	4356	goto restart;
	4357	}
	4358	break;
	4359
	4360	case D_INODEDEP:
	4361	if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
	4362	WK_INODEDEP(wk)->id_ino)) != 0) {
	4363	FREE_LOCK(&lk);
	4364	bawrite(bp);
	4365	ACQUIRE_LOCK(&lk);
	4366	return (-error);
	4367	}
	4368	break;
	4369
	4370	case D_PAGEDEP:
	4371	/*
	4372	* We are trying to sync a directory that may
	4373	* have dependencies on both its own metadata
	4374	* and/or dependencies on the inodes of any
	4375	* recently allocated files. We walk its diradd
	4376	* lists pushing out the associated inode.
	4377	*/
	4378	pagedep = WK_PAGEDEP(wk);
	4379	for (i = 0; i < DAHASHSZ; i++) {
	4380	if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
	4381	continue;
	4382	if ((error =
	4383	flush_pagedep_deps(info->vp,
	4384	pagedep->pd_mnt,
	4385	&pagedep->pd_diraddhd[i]))) {
	4386	FREE_LOCK(&lk);
	4387	bawrite(bp);
	4388	ACQUIRE_LOCK(&lk);
	4389	return (-error);
	4390	}
	4391	}
	4392	break;
	4393
	4394	case D_MKDIR:
	4395	/*
	4396	* This case should never happen if the vnode has
	4397	* been properly sync'ed. However, if this function
	4398	* is used at a place where the vnode has not yet
	4399	* been sync'ed, this dependency can show up. So,
	4400	* rather than panic, just flush it.
	4401	*/
	4402	nbp = WK_MKDIR(wk)->md_buf;
	4403	if (getdirtybuf(&nbp, info->waitfor) == 0)
	4404	break;
	4405	FREE_LOCK(&lk);
	4406	if (info->waitfor == MNT_NOWAIT) {
	4407	bawrite(nbp);
	4408	} else if ((error = bwrite(nbp)) != 0) {
	4409	bawrite(bp);
	4410	ACQUIRE_LOCK(&lk);
	4411	return (-error);
	4412	}
	4413	ACQUIRE_LOCK(&lk);
	4414	break;
	4415
	4416	case D_BMSAFEMAP:
	4417	/*
	4418	* This case should never happen if the vnode has
	4419	* been properly sync'ed. However, if this function
	4420	* is used at a place where the vnode has not yet
	4421	* been sync'ed, this dependency can show up. So,
	4422	* rather than panic, just flush it.
	4423	*
	4424	* nbp can wind up == bp if a device node for the
	4425	* same filesystem is being fsynced at the same time,
	4426	* leading to a panic if we don't catch the case.
	4427	*/
	4428	nbp = WK_BMSAFEMAP(wk)->sm_buf;
	4429	if (nbp == bp)
	4430	break;
	4431	if (getdirtybuf(&nbp, info->waitfor) == 0)
	4432	break;
	4433	FREE_LOCK(&lk);
	4434	if (info->waitfor == MNT_NOWAIT) {
	4435	bawrite(nbp);
	4436	} else if ((error = bwrite(nbp)) != 0) {
	4437	bawrite(bp);
	4438	ACQUIRE_LOCK(&lk);
	4439	return (-error);
	4440	}
	4441	ACQUIRE_LOCK(&lk);
	4442	break;
	4443
	4444	default:
	4445	FREE_LOCK(&lk);
	4446	panic("softdep_sync_metadata: Unknown type %s",
	4447	TYPENAME(wk->wk_type));
	4448	/* NOTREACHED */
	4449	}
	4450	}
	4451	FREE_LOCK(&lk);
	4452	bawrite(bp);
	4453	ACQUIRE_LOCK(&lk);
	4454	return(0);
	4455	}
	4456
	4457	/*
	4458	* Flush the dependencies associated with an inodedep.
	4459	* Called with splbio blocked.
	4460	*/
	4461	static int
	4462	flush_inodedep_deps(fs, ino)
	4463	struct fs *fs;
	4464	ino_t ino;
	4465	{
	4466	struct inodedep *inodedep;
	4467	struct allocdirect *adp;
	4468	int error, waitfor;
	4469	struct buf *bp;
	4470
	4471	/*
	4472	* This work is done in two passes. The first pass grabs most
	4473	* of the buffers and begins asynchronously writing them. The
	4474	* only way to wait for these asynchronous writes is to sleep
	4475	* on the filesystem vnode which may stay busy for a long time
	4476	* if the filesystem is active. So, instead, we make a second
	4477	* pass over the dependencies blocking on each write. In the
	4478	* usual case we will be blocking against a write that we
	4479	* initiated, so when it is done the dependency will have been
	4480	* resolved. Thus the second pass is expected to end quickly.
	4481	* We give a brief window at the top of the loop to allow
	4482	* any pending I/O to complete.
	4483	*/
	4484	for (waitfor = MNT_NOWAIT; ; ) {
	4485	FREE_LOCK(&lk);
	4486	ACQUIRE_LOCK(&lk);
	4487	if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
	4488	return (0);
	4489	TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) {
	4490	if (adp->ad_state & DEPCOMPLETE)
	4491	continue;
	4492	bp = adp->ad_buf;
	4493	if (getdirtybuf(&bp, waitfor) == 0) {
	4494	if (waitfor == MNT_NOWAIT)
	4495	continue;
	4496	break;
	4497	}
	4498	FREE_LOCK(&lk);
	4499	if (waitfor == MNT_NOWAIT) {
	4500	bawrite(bp);
	4501	} else if ((error = bwrite(bp)) != 0) {
	4502	ACQUIRE_LOCK(&lk);
	4503	return (error);
	4504	}
	4505	ACQUIRE_LOCK(&lk);
	4506	break;
	4507	}
	4508	if (adp != NULL)
	4509	continue;
	4510	TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) {
	4511	if (adp->ad_state & DEPCOMPLETE)
	4512	continue;
	4513	bp = adp->ad_buf;
	4514	if (getdirtybuf(&bp, waitfor) == 0) {
	4515	if (waitfor == MNT_NOWAIT)
	4516	continue;
	4517	break;
	4518	}
	4519	FREE_LOCK(&lk);
	4520	if (waitfor == MNT_NOWAIT) {
	4521	bawrite(bp);
	4522	} else if ((error = bwrite(bp)) != 0) {
	4523	ACQUIRE_LOCK(&lk);
	4524	return (error);
	4525	}
	4526	ACQUIRE_LOCK(&lk);
	4527	break;
	4528	}
	4529	if (adp != NULL)
	4530	continue;
	4531	/*
	4532	* If pass2, we are done, otherwise do pass 2.
	4533	*/
	4534	if (waitfor == MNT_WAIT)
	4535	break;
	4536	waitfor = MNT_WAIT;
	4537	}
	4538	/*
	4539	* Try freeing inodedep in case all dependencies have been removed.
	4540	*/
	4541	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
	4542	(void) free_inodedep(inodedep);
	4543	return (0);
	4544	}
	4545
	4546	/*
	4547	* Eliminate a pagedep dependency by flushing out all its diradd dependencies.
	4548	* Called with splbio blocked.
	4549	*/
	4550	static int
	4551	flush_pagedep_deps(pvp, mp, diraddhdp)
	4552	struct vnode *pvp;
	4553	struct mount *mp;
	4554	struct diraddhd *diraddhdp;
	4555	{
	4556	struct inodedep *inodedep;
	4557	struct ufsmount *ump;
	4558	struct diradd *dap;
	4559	struct vnode *vp;
	4560	int gotit, error = 0;
	4561	struct buf *bp;
	4562	ino_t inum;
	4563
	4564	ump = VFSTOUFS(mp);
	4565	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
	4566	/*
	4567	* Flush ourselves if this directory entry
	4568	* has a MKDIR_PARENT dependency.
	4569	*/
	4570	if (dap->da_state & MKDIR_PARENT) {
	4571	FREE_LOCK(&lk);
	4572	if ((error = UFS_UPDATE(pvp, 1)) != 0)
	4573	break;
	4574	ACQUIRE_LOCK(&lk);
	4575	/*
	4576	* If that cleared dependencies, go on to next.
	4577	*/
	4578	if (dap != LIST_FIRST(diraddhdp))
	4579	continue;
	4580	if (dap->da_state & MKDIR_PARENT) {
	4581	FREE_LOCK(&lk);
	4582	panic("flush_pagedep_deps: MKDIR_PARENT");
	4583	}
	4584	}
	4585	/*
	4586	* A newly allocated directory must have its "." and
	4587	* ".." entries written out before its name can be
	4588	* committed in its parent. We do not want or need
	4589	* the full semantics of a synchronous VOP_FSYNC as
	4590	* that may end up here again, once for each directory
	4591	* level in the filesystem. Instead, we push the blocks
	4592	* and wait for them to clear. We have to fsync twice
	4593	* because the first call may choose to defer blocks
	4594	* that still have dependencies, but deferral will
	4595	* happen at most once.
	4596	*/
	4597	inum = dap->da_newinum;
	4598	if (dap->da_state & MKDIR_BODY) {
	4599	FREE_LOCK(&lk);
	4600	if ((error = VFS_VGET(mp, inum, &vp)) != 0)
	4601	break;
	4602	if ((error=VOP_FSYNC(vp, MNT_NOWAIT)) \|\|
	4603	(error=VOP_FSYNC(vp, MNT_NOWAIT))) {
	4604	vput(vp);
	4605	break;
	4606	}
	4607	drain_output(vp, 0);
	4608	vput(vp);
	4609	ACQUIRE_LOCK(&lk);
	4610	/*
	4611	* If that cleared dependencies, go on to next.
	4612	*/
	4613	if (dap != LIST_FIRST(diraddhdp))
	4614	continue;
	4615	if (dap->da_state & MKDIR_BODY) {
	4616	FREE_LOCK(&lk);
	4617	panic("flush_pagedep_deps: MKDIR_BODY");
	4618	}
	4619	}
	4620	/*
	4621	* Flush the inode on which the directory entry depends.
	4622	* Having accounted for MKDIR_PARENT and MKDIR_BODY above,
	4623	* the only remaining dependency is that the updated inode
	4624	* count must get pushed to disk. The inode has already
	4625	* been pushed into its inode buffer (via VOP_UPDATE) at
	4626	* the time of the reference count change. So we need only
	4627	* locate that buffer, ensure that there will be no rollback
	4628	* caused by a bitmap dependency, then write the inode buffer.
	4629	*/
	4630	if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
	4631	FREE_LOCK(&lk);
	4632	panic("flush_pagedep_deps: lost inode");
	4633	}
	4634	/*
	4635	* If the inode still has bitmap dependencies,
	4636	* push them to disk.
	4637	*/
	4638	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
	4639	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
	4640	FREE_LOCK(&lk);
	4641	if (gotit && (error = bwrite(inodedep->id_buf)) != 0)
	4642	break;
	4643	ACQUIRE_LOCK(&lk);
	4644	if (dap != LIST_FIRST(diraddhdp))
	4645	continue;
	4646	}
	4647	/*
	4648	* If the inode is still sitting in a buffer waiting
	4649	* to be written, push it to disk.
	4650	*/
	4651	FREE_LOCK(&lk);
	4652	if ((error = bread(ump->um_devvp,
	4653	fsbtodoff(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
	4654	(int)ump->um_fs->fs_bsize, &bp)) != 0)
	4655	break;
	4656	if ((error = bwrite(bp)) != 0)
	4657	break;
	4658	ACQUIRE_LOCK(&lk);
	4659	/*
	4660	* If we have failed to get rid of all the dependencies
	4661	* then something is seriously wrong.
	4662	*/
	4663	if (dap == LIST_FIRST(diraddhdp)) {
	4664	FREE_LOCK(&lk);
	4665	panic("flush_pagedep_deps: flush failed");
	4666	}
	4667	}
	4668	if (error)
	4669	ACQUIRE_LOCK(&lk);
	4670	return (error);
	4671	}
	4672
	4673	/*
	4674	* A large burst of file addition or deletion activity can drive the
	4675	* memory load excessively high. First attempt to slow things down
	4676	* using the techniques below. If that fails, this routine requests
	4677	* the offending operations to fall back to running synchronously
	4678	* until the memory load returns to a reasonable level.
	4679	*/
	4680	int
	4681	softdep_slowdown(vp)
	4682	struct vnode *vp;
	4683	{
	4684	int max_softdeps_hard;
	4685
	4686	max_softdeps_hard = max_softdeps * 11 / 10;
	4687	if (num_dirrem < max_softdeps_hard / 2 &&
	4688	num_inodedep < max_softdeps_hard)
	4689	return (0);
	4690	stat_sync_limit_hit += 1;
	4691	return (1);
	4692	}
	4693
	4694	/*
	4695	* If memory utilization has gotten too high, deliberately slow things
	4696	* down and speed up the I/O processing.
	4697	*/
	4698	static int
	4699	request_cleanup(resource, islocked)
	4700	int resource;
	4701	int islocked;
	4702	{
	4703	struct thread td = curthread; / XXX */
	4704
	4705	/*
	4706	* We never hold up the filesystem syncer process.
	4707	*/
	4708	if (td == filesys_syncer)
	4709	return (0);
	4710	/*
	4711	* First check to see if the work list has gotten backlogged.
	4712	* If it has, co-opt this process to help clean up two entries.
	4713	* Because this process may hold inodes locked, we cannot
	4714	* handle any remove requests that might block on a locked
	4715	* inode as that could lead to deadlock.
	4716	*/
	4717	if (num_on_worklist > max_softdeps / 10) {
	4718	if (islocked)
	4719	FREE_LOCK(&lk);
	4720	process_worklist_item(NULL, LK_NOWAIT);
	4721	process_worklist_item(NULL, LK_NOWAIT);
	4722	stat_worklist_push += 2;
	4723	if (islocked)
	4724	ACQUIRE_LOCK(&lk);
	4725	return(1);
	4726	}
	4727
	4728	/*
	4729	* If we are resource constrained on inode dependencies, try
	4730	* flushing some dirty inodes. Otherwise, we are constrained
	4731	* by file deletions, so try accelerating flushes of directories
	4732	* with removal dependencies. We would like to do the cleanup
	4733	* here, but we probably hold an inode locked at this point and
	4734	* that might deadlock against one that we try to clean. So,
	4735	* the best that we can do is request the syncer daemon to do
	4736	* the cleanup for us.
	4737	*/
	4738	switch (resource) {
	4739
	4740	case FLUSH_INODES:
	4741	stat_ino_limit_push += 1;
	4742	req_clear_inodedeps += 1;
	4743	stat_countp = &stat_ino_limit_hit;
	4744	break;
	4745
	4746	case FLUSH_REMOVE:
	4747	stat_blk_limit_push += 1;
	4748	req_clear_remove += 1;
	4749	stat_countp = &stat_blk_limit_hit;
	4750	break;
	4751
	4752	default:
	4753	if (islocked)
	4754	FREE_LOCK(&lk);
	4755	panic("request_cleanup: unknown type");
	4756	}
	4757	/*
	4758	* Hopefully the syncer daemon will catch up and awaken us.
	4759	* We wait at most tickdelay before proceeding in any case.
	4760	*/
	4761	if (islocked == 0)
	4762	ACQUIRE_LOCK(&lk);
	4763	crit_enter();
	4764	proc_waiting += 1;
	4765	if (!callout_active(&handle))
	4766	callout_reset(&handle, tickdelay > 2 ? tickdelay : 2,
	4767	pause_timer, NULL);
	4768	interlocked_sleep(&lk, SLEEP, (caddr_t)&proc_waiting, 0,
	4769	"softupdate", 0);
	4770	proc_waiting -= 1;
	4771	crit_exit();
	4772	if (islocked == 0)
	4773	FREE_LOCK(&lk);
	4774	return (1);
	4775	}
	4776
	4777	/*
	4778	* Awaken processes pausing in request_cleanup and clear proc_waiting
	4779	* to indicate that there is no longer a timer running.
	4780	*/
	4781	void
	4782	pause_timer(arg)
	4783	void *arg;
	4784	{
	4785	*stat_countp += 1;
	4786	wakeup_one(&proc_waiting);
	4787	if (proc_waiting > 0)
	4788	callout_reset(&handle, tickdelay > 2 ? tickdelay : 2,
	4789	pause_timer, NULL);
	4790	else
	4791	callout_deactivate(&handle);
	4792	}
	4793
	4794	/*
	4795	* Flush out a directory with at least one removal dependency in an effort to
	4796	* reduce the number of dirrem, freefile, and freeblks dependency structures.
	4797	*/
	4798	static void
	4799	clear_remove(struct thread *td)
	4800	{
	4801	struct pagedep_hashhead *pagedephd;
	4802	struct pagedep *pagedep;
	4803	static int next = 0;
	4804	struct mount *mp;
	4805	struct vnode *vp;
	4806	int error, cnt;
	4807	ino_t ino;
	4808
	4809	ACQUIRE_LOCK(&lk);
	4810	for (cnt = 0; cnt < pagedep_hash; cnt++) {
	4811	pagedephd = &pagedep_hashtbl[next++];
	4812	if (next >= pagedep_hash)
	4813	next = 0;
	4814	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
	4815	if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
	4816	continue;
	4817	mp = pagedep->pd_mnt;
	4818	ino = pagedep->pd_ino;
	4819	FREE_LOCK(&lk);
	4820	if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
	4821	softdep_error("clear_remove: vget", error);
	4822	return;
	4823	}
	4824	if ((error = VOP_FSYNC(vp, MNT_NOWAIT)))
	4825	softdep_error("clear_remove: fsync", error);
	4826	drain_output(vp, 0);
	4827	vput(vp);
	4828	return;
	4829	}
	4830	}
	4831	FREE_LOCK(&lk);
	4832	}
	4833
	4834	/*
	4835	* Clear out a block of dirty inodes in an effort to reduce
	4836	* the number of inodedep dependency structures.
	4837	*/
	struct clear_inodedeps_info {
		struct fs	*fs;	/* in: superblock whose mount is being looked up */
		struct mount	*mp;	/* out: matching mount, filled in by the scan callback */
	};
	4842
	4843	static int
	4844	clear_inodedeps_mountlist_callback(struct mount mp, void data)
	4845	{
	4846	struct clear_inodedeps_info *info = data;
	4847
	4848	if ((mp->mnt_flag & MNT_SOFTDEP) && info->fs == VFSTOUFS(mp)->um_fs) {
	4849	info->mp = mp;
	4850	return(-1);
	4851	}
	4852	return(0);
	4853	}
	4854
	4855	static void
	4856	clear_inodedeps(struct thread *td)
	4857	{
	4858	struct clear_inodedeps_info info;
	4859	struct inodedep_hashhead *inodedephd;
	4860	struct inodedep *inodedep;
	4861	static int next = 0;
	4862	struct vnode *vp;
	4863	struct fs *fs;
	4864	int error, cnt;
	4865	ino_t firstino, lastino, ino;
	4866
	4867	ACQUIRE_LOCK(&lk);
	4868	/*
	4869	* Pick a random inode dependency to be cleared.
	4870	* We will then gather up all the inodes in its block
	4871	* that have dependencies and flush them out.
	4872	*/
	4873	for (cnt = 0; cnt < inodedep_hash; cnt++) {
	4874	inodedephd = &inodedep_hashtbl[next++];
	4875	if (next >= inodedep_hash)
	4876	next = 0;
	4877	if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
	4878	break;
	4879	}
	4880	if (inodedep == NULL) {
	4881	FREE_LOCK(&lk);
	4882	return;
	4883	}
	4884	/*
	4885	* Ugly code to find mount point given pointer to superblock.
	4886	*/
	4887	fs = inodedep->id_fs;
	4888	info.mp = NULL;
	4889	info.fs = fs;
	4890	mountlist_scan(clear_inodedeps_mountlist_callback,
	4891	&info, MNTSCAN_FORWARD\|MNTSCAN_NOBUSY);
	4892	/*
	4893	* Find the last inode in the block with dependencies.
	4894	*/
	4895	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
	4896	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
	4897	if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
	4898	break;
	4899	/*
	4900	* Asynchronously push all but the last inode with dependencies.
	4901	* Synchronously push the last inode with dependencies to ensure
	4902	* that the inode block gets written to free up the inodedeps.
	4903	*/
	4904	for (ino = firstino; ino <= lastino; ino++) {
	4905	if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
	4906	continue;
	4907	FREE_LOCK(&lk);
	4908	if ((error = VFS_VGET(info.mp, ino, &vp)) != 0) {
	4909	softdep_error("clear_inodedeps: vget", error);
	4910	return;
	4911	}
	4912	if (ino == lastino) {
	4913	if ((error = VOP_FSYNC(vp, MNT_WAIT)))
	4914	softdep_error("clear_inodedeps: fsync1", error);
	4915	} else {
	4916	if ((error = VOP_FSYNC(vp, MNT_NOWAIT)))
	4917	softdep_error("clear_inodedeps: fsync2", error);
	4918	drain_output(vp, 0);
	4919	}
	4920	vput(vp);
	4921	ACQUIRE_LOCK(&lk);
	4922	}
	4923	FREE_LOCK(&lk);
	4924	}
	4925
	4926	/*
	4927	* Function to determine if the buffer has outstanding dependencies
	4928	* that will cause a roll-back if the buffer is written. If wantcount
	4929	* is set, return number of dependencies, otherwise just yes or no.
	4930	*/
	4931	static int
	4932	softdep_count_dependencies(bp, wantcount)
	4933	struct buf *bp;
	4934	int wantcount;
	4935	{
	4936	struct worklist *wk;
	4937	struct inodedep *inodedep;
	4938	struct indirdep *indirdep;
	4939	struct allocindir *aip;
	4940	struct pagedep *pagedep;
	4941	struct diradd *dap;
	4942	int i, retval;
	4943
	4944	retval = 0;
	4945	ACQUIRE_LOCK(&lk);
	4946	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
	4947	switch (wk->wk_type) {
	4948
	4949	case D_INODEDEP:
	4950	inodedep = WK_INODEDEP(wk);
	4951	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
	4952	/* bitmap allocation dependency */
	4953	retval += 1;
	4954	if (!wantcount)
	4955	goto out;
	4956	}
	4957	if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
	4958	/* direct block pointer dependency */
	4959	retval += 1;
	4960	if (!wantcount)
	4961	goto out;
	4962	}
	4963	continue;
	4964
	4965	case D_INDIRDEP:
	4966	indirdep = WK_INDIRDEP(wk);
	4967
	4968	LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
	4969	/* indirect block pointer dependency */
	4970	retval += 1;
	4971	if (!wantcount)
	4972	goto out;
	4973	}
	4974	continue;
	4975
	4976	case D_PAGEDEP:
	4977	pagedep = WK_PAGEDEP(wk);
	4978	for (i = 0; i < DAHASHSZ; i++) {
	4979
	4980	LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
	4981	/* directory entry dependency */
	4982	retval += 1;
	4983	if (!wantcount)
	4984	goto out;
	4985	}
	4986	}
	4987	continue;
	4988
	4989	case D_BMSAFEMAP:
	4990	case D_ALLOCDIRECT:
	4991	case D_ALLOCINDIR:
	4992	case D_MKDIR:
	4993	/* never a dependency on these blocks */
	4994	continue;
	4995
	4996	default:
	4997	FREE_LOCK(&lk);
	4998	panic("softdep_check_for_rollback: Unexpected type %s",
	4999	TYPENAME(wk->wk_type));
	5000	/* NOTREACHED */
	5001	}
	5002	}
	5003	out:
	5004	FREE_LOCK(&lk);
	5005	return retval;
	5006	}
	5007
	5008	/*
	5009	* Acquire exclusive access to a buffer.
	5010	* Must be called with splbio blocked.
	5011	* Return 1 if buffer was acquired.
	5012	*/
	5013	static int
	5014	getdirtybuf(bpp, waitfor)
	5015	struct buf **bpp;
	5016	int waitfor;
	5017	{
	5018	struct buf *bp;
	5019	int error;
	5020
	5021	for (;;) {
	5022	if ((bp = *bpp) == NULL)
	5023	return (0);
	5024	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT) == 0)
	5025	break;
	5026	if (waitfor != MNT_WAIT)
	5027	return (0);
	5028	error = interlocked_sleep(&lk, LOCKBUF, bp,
	5029	LK_EXCLUSIVE \| LK_SLEEPFAIL, 0, 0);
	5030	if (error != ENOLCK) {
	5031	FREE_LOCK(&lk);
	5032	panic("getdirtybuf: inconsistent lock");
	5033	}
	5034	}
	5035	if ((bp->b_flags & B_DELWRI) == 0) {
	5036	BUF_UNLOCK(bp);
	5037	return (0);
	5038	}
	5039	bremfree(bp);
	5040	return (1);
	5041	}
	5042
	5043	/*
	5044	* Wait for pending output on a vnode to complete.
	5045	* Must be called with vnode locked.
	5046	*/
	5047	static void
	5048	drain_output(vp, islocked)
	5049	struct vnode *vp;
	5050	int islocked;
	5051	{
	5052
	5053	if (!islocked)
	5054	ACQUIRE_LOCK(&lk);
	5055	while (vp->v_track_write.bk_active) {
	5056	vp->v_track_write.bk_waitflag = 1;
	5057	interlocked_sleep(&lk, SLEEP, &vp->v_track_write,
	5058	0, "drainvp", 0);
	5059	}
	5060	if (!islocked)
	5061	FREE_LOCK(&lk);
	5062	}
	5063
	5064	/*
	5065	* Called whenever a buffer that is being invalidated or reallocated
	5066	* contains dependencies. This should only happen if an I/O error has
	5067	* occurred. The routine is called with the buffer locked.
	5068	*/
	5069	static void
	5070	softdep_deallocate_dependencies(bp)
	5071	struct buf *bp;
	5072	{
	5073
	5074	if ((bp->b_flags & B_ERROR) == 0)
	5075	panic("softdep_deallocate_dependencies: dangling deps");
	5076	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntfromname, bp->b_error);
	5077	panic("softdep_deallocate_dependencies: unrecovered I/O error");
	5078	}
	5079
	/*
	 * Report an asynchronous write error in the filesystem.
	 *
	 * func:	text printed as the message prefix; callers in this file
	 *		pass either a "function: operation" tag or a filesystem name.
	 * error:	error number to print.
	 *
	 * The error is only logged; no recovery is attempted.
	 */
	void
	softdep_error(func, error)
		char *func;
		int error;
	{
		/* XXX should do something better! */
		printf("%s: got error %d while accessing filesystem\n",
		    func, error);
	}