gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
	3	*
	4	* The soft updates code is derived from the appendix of a University
	5	* of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
	6	* "Soft Updates: A Solution to the Metadata Update Problem in File
	7	* Systems", CSE-TR-254-95, August 1995).
	8	*
	9	* Further information about soft updates can be obtained from:
	10	*
	11	* Marshall Kirk McKusick http://www.mckusick.com/softdep/
	12	* 1614 Oxford Street mckusick@mckusick.com
	13	* Berkeley, CA 94709-1608 +1-510-843-9542
	14	* USA
	15	*
	16	* Redistribution and use in source and binary forms, with or without
	17	* modification, are permitted provided that the following conditions
	18	* are met:
	19	*
	20	* 1. Redistributions of source code must retain the above copyright
	21	* notice, this list of conditions and the following disclaimer.
	22	* 2. Redistributions in binary form must reproduce the above copyright
	23	* notice, this list of conditions and the following disclaimer in the
	24	* documentation and/or other materials provided with the distribution.
	25	*
	26	* THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
	27	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	28	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	29	* DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
	30	* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	31	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	32	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	33	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	34	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	35	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	36	* SUCH DAMAGE.
	37	*
	38	* from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
	39	* $FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.57.2.11 2002/02/05 18:46:53 dillon Exp $
	40	*/
	41
	42	/*
	43	* For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
	44	*/
	45	#ifndef DIAGNOSTIC
	46	#define DIAGNOSTIC
	47	#endif
	48	#ifndef DEBUG
	49	#define DEBUG
	50	#endif
	51
	52	#include <sys/param.h>
	53	#include <sys/kernel.h>
	54	#include <sys/systm.h>
	55	#include <sys/buf.h>
	56	#include <sys/malloc.h>
	57	#include <sys/mount.h>
	58	#include <sys/proc.h>
	59	#include <sys/syslog.h>
	60	#include <sys/vnode.h>
	61	#include <sys/conf.h>
	62	#include <machine/inttypes.h>
	63	#include "dir.h"
	64	#include "quota.h"
	65	#include "inode.h"
	66	#include "ufsmount.h"
	67	#include "fs.h"
	68	#include "softdep.h"
	69	#include "ffs_extern.h"
	70	#include "ufs_extern.h"
	71
	72	#include <sys/buf2.h>
	73	#include <sys/thread2.h>
	74	#include <sys/lock.h>
	75
	76	/*
	77	* These definitions need to be adapted to the system to which
	78	* this file is being ported.
	79	*/
	80	/*
	81	* malloc types defined for the softdep system.
	82	*/
	83	MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
	84	MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
	85	MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
	86	MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
	87	MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
	88	MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
	89	MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
	90	MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
	91	MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
	92	MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
	93	MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
	94	MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
	95	MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
	96
	97	#define M_SOFTDEP_FLAGS (M_WAITOK \| M_USE_RESERVE)
	98
	99	#define D_PAGEDEP 0
	100	#define D_INODEDEP 1
	101	#define D_NEWBLK 2
	102	#define D_BMSAFEMAP 3
	103	#define D_ALLOCDIRECT 4
	104	#define D_INDIRDEP 5
	105	#define D_ALLOCINDIR 6
	106	#define D_FREEFRAG 7
	107	#define D_FREEBLKS 8
	108	#define D_FREEFILE 9
	109	#define D_DIRADD 10
	110	#define D_MKDIR 11
	111	#define D_DIRREM 12
	112	#define D_LAST D_DIRREM
	113
	114	/*
	115	* translate from workitem type to memory type
	116	* MUST match the defines above, such that memtype[D_XXX] == M_XXX
	117	*/
	118	static struct malloc_type *memtype[] = {
	119	M_PAGEDEP,
	120	M_INODEDEP,
	121	M_NEWBLK,
	122	M_BMSAFEMAP,
	123	M_ALLOCDIRECT,
	124	M_INDIRDEP,
	125	M_ALLOCINDIR,
	126	M_FREEFRAG,
	127	M_FREEBLKS,
	128	M_FREEFILE,
	129	M_DIRADD,
	130	M_MKDIR,
	131	M_DIRREM
	132	};
	133
	134	#define DtoM(type) (memtype[type])
	135
	136	/*
	137	* Names of malloc types.
	138	*/
	139	#define TYPENAME(type) \
	140	((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???")
	141	/*
	142	* End system adaptation definitions.
	143	*/
	144
	145	/*
	146	* Internal function prototypes.
	147	*/
	148	static void softdep_error(char *, int);
	149	static void drain_output(struct vnode *, int);
	150	static int getdirtybuf(struct buf **, int);
	151	static void clear_remove(struct thread *);
	152	static void clear_inodedeps(struct thread *);
	153	static int flush_pagedep_deps(struct vnode , struct mount ,
	154	struct diraddhd *);
	155	static int flush_inodedep_deps(struct fs *, ino_t);
	156	static int handle_written_filepage(struct pagedep , struct buf );
	157	static void diradd_inode_written(struct diradd , struct inodedep );
	158	static int handle_written_inodeblock(struct inodedep , struct buf );
	159	static void handle_allocdirect_partdone(struct allocdirect *);
	160	static void handle_allocindir_partdone(struct allocindir *);
	161	static void initiate_write_filepage(struct pagedep , struct buf );
	162	static void handle_written_mkdir(struct mkdir *, int);
	163	static void initiate_write_inodeblock(struct inodedep , struct buf );
	164	static void handle_workitem_freefile(struct freefile *);
	165	static void handle_workitem_remove(struct dirrem *);
	166	static struct dirrem newdirrem(struct buf , struct inode *,
	167	struct inode , int, struct dirrem *);
	168	static void free_diradd(struct diradd *);
	169	static void free_allocindir(struct allocindir , struct inodedep );
	170	static int indir_trunc (struct inode , off_t, int, ufs_lbn_t, long );
	171	static void deallocate_dependencies(struct buf , struct inodedep );
	172	static void free_allocdirect(struct allocdirectlst *,
	173	struct allocdirect *, int);
	174	static int check_inode_unwritten(struct inodedep *);
	175	static int free_inodedep(struct inodedep *);
	176	static void handle_workitem_freeblocks(struct freeblks *);
	177	static void merge_inode_lists(struct inodedep *);
	178	static void setup_allocindir_phase2(struct buf , struct inode ,
	179	struct allocindir *);
	180	static struct allocindir newallocindir(struct inode , int, ufs_daddr_t,
	181	ufs_daddr_t);
	182	static void handle_workitem_freefrag(struct freefrag *);
	183	static struct freefrag newfreefrag(struct inode , ufs_daddr_t, long);
	184	static void allocdirect_merge(struct allocdirectlst *,
	185	struct allocdirect , struct allocdirect );
	186	static struct bmsafemap bmsafemap_lookup(struct buf );
	187	static int newblk_lookup(struct fs *, ufs_daddr_t, int,
	188	struct newblk **);
	189	static int inodedep_lookup(struct fs , ino_t, int, struct inodedep *);
	190	static int pagedep_lookup(struct inode *, ufs_lbn_t, int,
	191	struct pagedep **);
	192	static int request_cleanup(int, int);
	193	static int process_worklist_item(struct mount *, int);
	194	static void add_to_worklist(struct worklist *);
	195
	196	/*
	197	* Exported softdep operations.
	198	*/
	199	static void softdep_disk_io_initiation(struct buf *);
	200	static void softdep_disk_write_complete(struct buf *);
	201	static void softdep_deallocate_dependencies(struct buf *);
	202	static int softdep_fsync(struct vnode *);
	203	static int softdep_process_worklist(struct mount *);
	204	static void softdep_move_dependencies(struct buf , struct buf );
	205	static int softdep_count_dependencies(struct buf *bp, int);
	206	static int softdep_checkread(struct buf *bp);
	207	static int softdep_checkwrite(struct buf *bp);
	208
	209	static struct bio_ops softdep_bioops = {
	210	.io_start = softdep_disk_io_initiation,
	211	.io_complete = softdep_disk_write_complete,
	212	.io_deallocate = softdep_deallocate_dependencies,
	213	.io_fsync = softdep_fsync,
	214	.io_sync = softdep_process_worklist,
	215	.io_movedeps = softdep_move_dependencies,
	216	.io_countdeps = softdep_count_dependencies,
	217	.io_checkread = softdep_checkread,
	218	.io_checkwrite = softdep_checkwrite
	219	};
	220
	221	/*
	222	* Locking primitives.
	223	*/
	224	static void acquire_lock(struct lock *);
	225	static void free_lock(struct lock *);
	226	#ifdef INVARIANTS
	227	static int lock_held(struct lock *);
	228	#endif
	229
	230	static struct lock lk;
	231
	232	#define ACQUIRE_LOCK(lkp) acquire_lock(lkp)
	233	#define FREE_LOCK(lkp) free_lock(lkp)
	234
	235	static void
	236	acquire_lock(struct lock *lkp)
	237	{
	238	lockmgr(lkp, LK_EXCLUSIVE);
	239	}
	240
	241	static void
	242	free_lock(struct lock *lkp)
	243	{
	244	lockmgr(lkp, LK_RELEASE);
	245	}
	246
	247	#ifdef INVARIANTS
	248	static int
	249	lock_held(struct lock *lkp)
	250	{
	251	return lockcountnb(lkp);
	252	}
	253	#endif
	254
	255	/*
	256	* Place holder for real semaphores.
	257	*/
	258	struct sema {
	259	int value;
	260	thread_t holder;
	261	char *name;
	262	int timo;
	263	struct spinlock spin;
	264	};
	265	static void sema_init(struct sema , char , int);
	266	static int sema_get(struct sema , struct lock );
	267	static void sema_release(struct sema , struct lock );
	268
	269	#define NOHOLDER ((struct thread *) -1)
	270
	271	static void
	272	sema_init(struct sema semap, char name, int timo)
	273	{
	274	semap->holder = NOHOLDER;
	275	semap->value = 0;
	276	semap->name = name;
	277	semap->timo = timo;
	278	spin_init(&semap->spin);
	279	}
	280
	281	/*
	282	* Obtain exclusive access, semaphore is protected by the interlock.
	283	* If interlock is NULL we must protect the semaphore ourselves.
	284	*/
	285	static int
	286	sema_get(struct sema semap, struct lock interlock)
	287	{
	288	int rv;
	289
	290	if (interlock) {
	291	if (semap->value > 0) {
	292	++semap->value; /* serves as wakeup flag */
	293	lksleep(semap, interlock, 0,
	294	semap->name, semap->timo);
	295	rv = 0;
	296	} else {
	297	semap->value = 1; /* serves as owned flag */
	298	semap->holder = curthread;
	299	rv = 1;
	300	}
	301	} else {
	302	spin_lock(&semap->spin);
	303	if (semap->value > 0) {
	304	++semap->value; /* serves as wakeup flag */
	305	ssleep(semap, &semap->spin, 0,
	306	semap->name, semap->timo);
	307	spin_unlock(&semap->spin);
	308	rv = 0;
	309	} else {
	310	semap->value = 1; /* serves as owned flag */
	311	semap->holder = curthread;
	312	spin_unlock(&semap->spin);
	313	rv = 1;
	314	}
	315	}
	316	return (rv);
	317	}
	318
	319	static void
	320	sema_release(struct sema semap, struct lock lk)
	321	{
	322	if (semap->value <= 0 \|\| semap->holder != curthread)
	323	panic("sema_release: not held");
	324	if (lk) {
	325	semap->holder = NOHOLDER;
	326	if (--semap->value > 0) {
	327	semap->value = 0;
	328	wakeup(semap);
	329	}
	330	} else {
	331	spin_lock(&semap->spin);
	332	semap->holder = NOHOLDER;
	333	if (--semap->value > 0) {
	334	semap->value = 0;
	335	spin_unlock(&semap->spin);
	336	wakeup(semap);
	337	} else {
	338	spin_unlock(&semap->spin);
	339	}
	340	}
	341	}
	342
	343	/*
	344	* Worklist queue management.
	345	* These routines require that the lock be held.
	346	*/
	347	static void worklist_insert(struct workhead , struct worklist );
	348	static void worklist_remove(struct worklist *);
	349	static void workitem_free(struct worklist *, int);
	350
	351	#define WORKLIST_INSERT_BP(bp, item) do { \
	352	(bp)->b_ops = &softdep_bioops; \
	353	worklist_insert(&(bp)->b_dep, item); \
	354	} while (0)
	355
	356	#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
	357	#define WORKLIST_REMOVE(item) worklist_remove(item)
	358	#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
	359
	360	static void
	361	worklist_insert(struct workhead head, struct worklist item)
	362	{
	363	KKASSERT(lock_held(&lk) > 0);
	364
	365	if (item->wk_state & ONWORKLIST) {
	366	panic("worklist_insert: already on list");
	367	}
	368	item->wk_state \|= ONWORKLIST;
	369	LIST_INSERT_HEAD(head, item, wk_list);
	370	}
	371
	372	static void
	373	worklist_remove(struct worklist *item)
	374	{
	375
	376	KKASSERT(lock_held(&lk));
	377	if ((item->wk_state & ONWORKLIST) == 0)
	378	panic("worklist_remove: not on list");
	379
	380	item->wk_state &= ~ONWORKLIST;
	381	LIST_REMOVE(item, wk_list);
	382	}
	383
	384	static void
	385	workitem_free(struct worklist *item, int type)
	386	{
	387
	388	if (item->wk_state & ONWORKLIST)
	389	panic("workitem_free: still on list");
	390	if (item->wk_type != type)
	391	panic("workitem_free: type mismatch");
	392
	393	kfree(item, DtoM(type));
	394	}
	395
	396	/*
	397	* Workitem queue management
	398	*/
	399	static struct workhead softdep_workitem_pending;
	400	static int num_on_worklist; /* number of worklist items to be processed */
	401	static int softdep_worklist_busy; /* 1 => trying to do unmount */
	402	static int softdep_worklist_req; /* serialized waiters */
	403	static int max_softdeps; /* maximum number of structs before slowdown */
	404	static int tickdelay = 2; /* number of ticks to pause during slowdown */
	405	static int stat_countp; / statistic to count in proc_waiting timeout */
	406	static int proc_waiting; /* tracks whether we have a timeout posted */
	407	static struct thread filesys_syncer; / proc of filesystem syncer process */
	408	static int req_clear_inodedeps; /* syncer process flush some inodedeps */
	409	#define FLUSH_INODES 1
	410	static int req_clear_remove; /* syncer process flush some freeblks */
	411	#define FLUSH_REMOVE 2
	412	/*
	413	* runtime statistics
	414	*/
	415	static int stat_worklist_push; /* number of worklist cleanups */
	416	static int stat_blk_limit_push; /* number of times block limit neared */
	417	static int stat_ino_limit_push; /* number of times inode limit neared */
	418	static int stat_blk_limit_hit; /* number of times block slowdown imposed */
	419	static int stat_ino_limit_hit; /* number of times inode slowdown imposed */
	420	static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
	421	static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
	422	static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
	423	static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
	424	static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
	425	#ifdef DEBUG
	426	#include <vm/vm.h>
	427	#include <sys/sysctl.h>
	428	SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0,
	429	"Maximum soft dependencies before slowdown occurs");
	430	SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0,
	431	"Ticks to delay before allocating during slowdown");
	432	SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,
	433	"Number of worklist cleanups");
	434	SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,
	435	"Number of times block limit neared");
	436	SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,
	437	"Number of times inode limit neared");
	438	SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0,
	439	"Number of times block slowdown imposed");
	440	SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0,
	441	"Number of times inode slowdown imposed ");
	442	SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0,
	443	"Number of synchronous slowdowns imposed");
	444	SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0,
	445	"Bufs redirtied as indir ptrs not written");
	446	SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0,
	447	"Bufs redirtied as inode bitmap not written");
	448	SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0,
	449	"Bufs redirtied as direct ptrs not written");
	450	SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0,
	451	"Bufs redirtied as dir entry cannot write");
	452	#endif /* DEBUG */
	453
	454	/*
	455	* Add an item to the end of the work queue.
	456	* This routine requires that the lock be held.
	457	* This is the only routine that adds items to the list.
	458	* The following routine is the only one that removes items
	459	* and does so in order from first to last.
	460	*/
	461	static void
	462	add_to_worklist(struct worklist *wk)
	463	{
	464	static struct worklist *worklist_tail;
	465
	466	if (wk->wk_state & ONWORKLIST) {
	467	panic("add_to_worklist: already on list");
	468	}
	469	wk->wk_state \|= ONWORKLIST;
	470	if (LIST_FIRST(&softdep_workitem_pending) == NULL)
	471	LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
	472	else
	473	LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
	474	worklist_tail = wk;
	475	num_on_worklist += 1;
	476	}
	477
	478	/*
	479	* Process that runs once per second to handle items in the background queue.
	480	*
	481	* Note that we ensure that everything is done in the order in which they
	482	* appear in the queue. The code below depends on this property to ensure
	483	* that blocks of a file are freed before the inode itself is freed. This
	484	* ordering ensures that no new <vfsid, inum, lbn> triples will be generated
	485	* until all the old ones have been purged from the dependency lists.
	486	*
	487	* bioops callback - hold io_token
	488	*/
	489	static int
	490	softdep_process_worklist(struct mount *matchmnt)
	491	{
	492	thread_t td = curthread;
	493	int matchcnt, loopcount;
	494	long starttime;
	495
	496	ACQUIRE_LOCK(&lk);
	497
	498	/*
	499	* Record the process identifier of our caller so that we can give
	500	* this process preferential treatment in request_cleanup below.
	501	*/
	502	filesys_syncer = td;
	503	matchcnt = 0;
	504
	505	/*
	506	* There is no danger of having multiple processes run this
	507	* code, but we have to single-thread it when softdep_flushfiles()
	508	* is in operation to get an accurate count of the number of items
	509	* related to its mount point that are in the list.
	510	*/
	511	if (matchmnt == NULL) {
	512	if (softdep_worklist_busy < 0) {
	513	matchcnt = -1;
	514	goto done;
	515	}
	516	softdep_worklist_busy += 1;
	517	}
	518
	519	/*
	520	* If requested, try removing inode or removal dependencies.
	521	*/
	522	if (req_clear_inodedeps) {
	523	clear_inodedeps(td);
	524	req_clear_inodedeps -= 1;
	525	wakeup_one(&proc_waiting);
	526	}
	527	if (req_clear_remove) {
	528	clear_remove(td);
	529	req_clear_remove -= 1;
	530	wakeup_one(&proc_waiting);
	531	}
	532	loopcount = 1;
	533	starttime = time_second;
	534	while (num_on_worklist > 0) {
	535	matchcnt += process_worklist_item(matchmnt, 0);
	536
	537	/*
	538	* If a umount operation wants to run the worklist
	539	* accurately, abort.
	540	*/
	541	if (softdep_worklist_req && matchmnt == NULL) {
	542	matchcnt = -1;
	543	break;
	544	}
	545
	546	/*
	547	* If requested, try removing inode or removal dependencies.
	548	*/
	549	if (req_clear_inodedeps) {
	550	clear_inodedeps(td);
	551	req_clear_inodedeps -= 1;
	552	wakeup_one(&proc_waiting);
	553	}
	554	if (req_clear_remove) {
	555	clear_remove(td);
	556	req_clear_remove -= 1;
	557	wakeup_one(&proc_waiting);
	558	}
	559	/*
	560	* We do not generally want to stop for buffer space, but if
	561	* we are really being a buffer hog, we will stop and wait.
	562	*/
	563	if (loopcount++ % 128 == 0) {
	564	FREE_LOCK(&lk);
	565	bwillinode(1);
	566	ACQUIRE_LOCK(&lk);
	567	}
	568
	569	/*
	570	* Never allow processing to run for more than one
	571	* second. Otherwise the other syncer tasks may get
	572	* excessively backlogged.
	573	*/
	574	if (starttime != time_second && matchmnt == NULL) {
	575	matchcnt = -1;
	576	break;
	577	}
	578	}
	579	if (matchmnt == NULL) {
	580	--softdep_worklist_busy;
	581	if (softdep_worklist_req && softdep_worklist_busy == 0)
	582	wakeup(&softdep_worklist_req);
	583	}
	584	done:
	585	FREE_LOCK(&lk);
	586	return (matchcnt);
	587	}
	588
	589	/*
	590	* Process one item on the worklist.
	591	*/
	592	static int
	593	process_worklist_item(struct mount *matchmnt, int flags)
	594	{
	595	struct worklist *wk;
	596	struct dirrem *dirrem;
	597	struct fs *matchfs;
	598	struct vnode *vp;
	599	int matchcnt = 0;
	600
	601	matchfs = NULL;
	602	if (matchmnt != NULL)
	603	matchfs = VFSTOUFS(matchmnt)->um_fs;
	604
	605	/*
	606	* Normally we just process each item on the worklist in order.
	607	* However, if we are in a situation where we cannot lock any
	608	* inodes, we have to skip over any dirrem requests whose
	609	* vnodes are resident and locked.
	610	*/
	611	LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
	612	if ((flags & LK_NOWAIT) == 0 \|\| wk->wk_type != D_DIRREM)
	613	break;
	614	dirrem = WK_DIRREM(wk);
	615	vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev,
	616	dirrem->dm_oldinum);
	617	if (vp == NULL \|\| !vn_islocked(vp))
	618	break;
	619	}
	620	if (wk == NULL) {
	621	return (0);
	622	}
	623	WORKLIST_REMOVE(wk);
	624	num_on_worklist -= 1;
	625	FREE_LOCK(&lk);
	626	switch (wk->wk_type) {
	627	case D_DIRREM:
	628	/* removal of a directory entry */
	629	if (WK_DIRREM(wk)->dm_mnt == matchmnt)
	630	matchcnt += 1;
	631	handle_workitem_remove(WK_DIRREM(wk));
	632	break;
	633
	634	case D_FREEBLKS:
	635	/* releasing blocks and/or fragments from a file */
	636	if (WK_FREEBLKS(wk)->fb_fs == matchfs)
	637	matchcnt += 1;
	638	handle_workitem_freeblocks(WK_FREEBLKS(wk));
	639	break;
	640
	641	case D_FREEFRAG:
	642	/* releasing a fragment when replaced as a file grows */
	643	if (WK_FREEFRAG(wk)->ff_fs == matchfs)
	644	matchcnt += 1;
	645	handle_workitem_freefrag(WK_FREEFRAG(wk));
	646	break;
	647
	648	case D_FREEFILE:
	649	/* releasing an inode when its link count drops to 0 */
	650	if (WK_FREEFILE(wk)->fx_fs == matchfs)
	651	matchcnt += 1;
	652	handle_workitem_freefile(WK_FREEFILE(wk));
	653	break;
	654
	655	default:
	656	panic("%s_process_worklist: Unknown type %s",
	657	"softdep", TYPENAME(wk->wk_type));
	658	/* NOTREACHED */
	659	}
	660	ACQUIRE_LOCK(&lk);
	661	return (matchcnt);
	662	}
	663
	664	/*
	665	* Move dependencies from one buffer to another.
	666	*
	667	* bioops callback - hold io_token
	668	*/
	669	static void
	670	softdep_move_dependencies(struct buf oldbp, struct buf newbp)
	671	{
	672	struct worklist wk, wktail;
	673
	674	if (LIST_FIRST(&newbp->b_dep) != NULL)
	675	panic("softdep_move_dependencies: need merge code");
	676	wktail = NULL;
	677	ACQUIRE_LOCK(&lk);
	678	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
	679	LIST_REMOVE(wk, wk_list);
	680	if (wktail == NULL)
	681	LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
	682	else
	683	LIST_INSERT_AFTER(wktail, wk, wk_list);
	684	wktail = wk;
	685	newbp->b_ops = &softdep_bioops;
	686	}
	687	FREE_LOCK(&lk);
	688	}
	689
	690	/*
	691	* Purge the work list of all items associated with a particular mount point.
	692	*/
	693	int
	694	softdep_flushfiles(struct mount *oldmnt, int flags)
	695	{
	696	struct vnode *devvp;
	697	int error, loopcnt;
	698
	699	/*
	700	* Await our turn to clear out the queue, then serialize access.
	701	*/
	702	ACQUIRE_LOCK(&lk);
	703	while (softdep_worklist_busy != 0) {
	704	softdep_worklist_req += 1;
	705	lksleep(&softdep_worklist_req, &lk, 0, "softflush", 0);
	706	softdep_worklist_req -= 1;
	707	}
	708	softdep_worklist_busy = -1;
	709	FREE_LOCK(&lk);
	710
	711	if ((error = ffs_flushfiles(oldmnt, flags)) != 0) {
	712	softdep_worklist_busy = 0;
	713	if (softdep_worklist_req)
	714	wakeup(&softdep_worklist_req);
	715	return (error);
	716	}
	717	/*
	718	* Alternately flush the block device associated with the mount
	719	* point and process any dependencies that the flushing
	720	* creates. In theory, this loop can happen at most twice,
	721	* but we give it a few extra just to be sure.
	722	*/
	723	devvp = VFSTOUFS(oldmnt)->um_devvp;
	724	for (loopcnt = 10; loopcnt > 0; ) {
	725	if (softdep_process_worklist(oldmnt) == 0) {
	726	loopcnt--;
	727	/*
	728	* Do another flush in case any vnodes were brought in
	729	* as part of the cleanup operations.
	730	*/
	731	if ((error = ffs_flushfiles(oldmnt, flags)) != 0)
	732	break;
	733	/*
	734	* If we still found nothing to do, we are really done.
	735	*/
	736	if (softdep_process_worklist(oldmnt) == 0)
	737	break;
	738	}
	739	vn_lock(devvp, LK_EXCLUSIVE \| LK_RETRY);
	740	error = VOP_FSYNC(devvp, MNT_WAIT, 0);
	741	vn_unlock(devvp);
	742	if (error)
	743	break;
	744	}
	745	ACQUIRE_LOCK(&lk);
	746	softdep_worklist_busy = 0;
	747	if (softdep_worklist_req)
	748	wakeup(&softdep_worklist_req);
	749	FREE_LOCK(&lk);
	750
	751	/*
	752	* If we are unmounting then it is an error to fail. If we
	753	* are simply trying to downgrade to read-only, then filesystem
	754	* activity can keep us busy forever, so we just fail with EBUSY.
	755	*/
	756	if (loopcnt == 0) {
	757	if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
	758	panic("softdep_flushfiles: looping");
	759	error = EBUSY;
	760	}
	761	return (error);
	762	}
	763
	764	/*
	765	* Structure hashing.
	766	*
	767	* There are three types of structures that can be looked up:
	768	* 1) pagedep structures identified by mount point, inode number,
	769	* and logical block.
	770	* 2) inodedep structures identified by mount point and inode number.
	771	* 3) newblk structures identified by mount point and
	772	* physical block number.
	773	*
	774	* The "pagedep" and "inodedep" dependency structures are hashed
	775	* separately from the file blocks and inodes to which they correspond.
	776	* This separation helps when the in-memory copy of an inode or
	777	* file block must be replaced. It also obviates the need to access
	778	* an inode or file page when simply updating (or de-allocating)
	779	* dependency structures. Lookup of newblk structures is needed to
	780	* find newly allocated blocks when trying to associate them with
	781	* their allocdirect or allocindir structure.
	782	*
	783	* The lookup routines optionally create and hash a new instance when
	784	* an existing entry is not found.
	785	*/
	786	#define DEPALLOC 0x0001 /* allocate structure if lookup fails */
	787	#define NODELAY 0x0002 /* cannot do background work */
	788
	789	/*
	790	* Structures and routines associated with pagedep caching.
	791	*/
	792	LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
	793	u_long pagedep_hash; /* size of hash table - 1 */
	794	#define PAGEDEP_HASH(mp, inum, lbn) \
	795	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
	796	pagedep_hash])
	797	static struct sema pagedep_in_progress;
	798
	799	/*
	800	* Helper routine for pagedep_lookup()
	801	*/
	802	static __inline
	803	struct pagedep *
	804	pagedep_find(struct pagedep_hashhead *pagedephd, ino_t ino, ufs_lbn_t lbn,
	805	struct mount *mp)
	806	{
	807	struct pagedep *pagedep;
	808
	809	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
	810	if (ino == pagedep->pd_ino &&
	811	lbn == pagedep->pd_lbn &&
	812	mp == pagedep->pd_mnt) {
	813	return (pagedep);
	814	}
	815	}
	816	return(NULL);
	817	}
	818
	819	/*
	820	* Look up a pagedep. Return 1 if found, 0 if not found.
	821	* If not found, allocate if DEPALLOC flag is passed.
	822	* Found or allocated entry is returned in pagedeppp.
	823	* This routine must be called with splbio interrupts blocked.
	824	*/
	825	static int
	826	pagedep_lookup(struct inode *ip, ufs_lbn_t lbn, int flags,
	827	struct pagedep **pagedeppp)
	828	{
	829	struct pagedep *pagedep;
	830	struct pagedep_hashhead *pagedephd;
	831	struct mount *mp;
	832	int i;
	833
	834	KKASSERT(lock_held(&lk) > 0);
	835
	836	mp = ITOV(ip)->v_mount;
	837	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
	838	top:
	839	*pagedeppp = pagedep_find(pagedephd, ip->i_number, lbn, mp);
	840	if (*pagedeppp)
	841	return(1);
	842	if ((flags & DEPALLOC) == 0)
	843	return (0);
	844	if (sema_get(&pagedep_in_progress, &lk) == 0)
	845	goto top;
	846
	847	FREE_LOCK(&lk);
	848	pagedep = kmalloc(sizeof(struct pagedep), M_PAGEDEP,
	849	M_SOFTDEP_FLAGS \| M_ZERO);
	850	ACQUIRE_LOCK(&lk);
	851	if (pagedep_find(pagedephd, ip->i_number, lbn, mp)) {
	852	kprintf("pagedep_lookup: blocking race avoided\n");
	853	sema_release(&pagedep_in_progress, &lk);
	854	kfree(pagedep, M_PAGEDEP);
	855	goto top;
	856	}
	857
	858	pagedep->pd_list.wk_type = D_PAGEDEP;
	859	pagedep->pd_mnt = mp;
	860	pagedep->pd_ino = ip->i_number;
	861	pagedep->pd_lbn = lbn;
	862	LIST_INIT(&pagedep->pd_dirremhd);
	863	LIST_INIT(&pagedep->pd_pendinghd);
	864	for (i = 0; i < DAHASHSZ; i++)
	865	LIST_INIT(&pagedep->pd_diraddhd[i]);
	866	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
	867	sema_release(&pagedep_in_progress, &lk);
	868	*pagedeppp = pagedep;
	869	return (0);
	870	}
	871
	872	/*
	873	* Structures and routines associated with inodedep caching.
	874	*/
	875	LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
	876	static u_long inodedep_hash; /* size of hash table - 1 */
	877	static long num_inodedep; /* number of inodedep allocated */
	878	#define INODEDEP_HASH(fs, inum) \
	879	(&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
	880	static struct sema inodedep_in_progress;
	881
	882	/*
	883	* Helper routine for inodedep_lookup()
	884	*/
	885	static __inline
	886	struct inodedep *
	887	inodedep_find(struct inodedep_hashhead inodedephd, struct fs fs, ino_t inum)
	888	{
	889	struct inodedep *inodedep;
	890
	891	LIST_FOREACH(inodedep, inodedephd, id_hash) {
	892	if (inum == inodedep->id_ino && fs == inodedep->id_fs)
	893	return(inodedep);
	894	}
	895	return (NULL);
	896	}
	897
	898	/*
	899	* Look up a inodedep. Return 1 if found, 0 if not found.
	900	* If not found, allocate if DEPALLOC flag is passed.
	901	* Found or allocated entry is returned in inodedeppp.
	902	* This routine must be called with splbio interrupts blocked.
	903	*/
	904	static int
	905	inodedep_lookup(struct fs *fs, ino_t inum, int flags,
	906	struct inodedep **inodedeppp)
	907	{
	908	struct inodedep *inodedep;
	909	struct inodedep_hashhead *inodedephd;
	910	int firsttry;
	911
	912	KKASSERT(lock_held(&lk) > 0);	/* caller must already hold lk */
	913
	914	firsttry = 1;
	915	inodedephd = INODEDEP_HASH(fs, inum);
	916	top:
	917	*inodedeppp = inodedep_find(inodedephd, fs, inum);
	918	if (*inodedeppp)
	919	return (1);
	920	if ((flags & DEPALLOC) == 0)
	921	return (0);	/* not found and no allocation requested; *inodedeppp is NULL */
	922	/*
	923	* If we are over our limit, try to improve the situation.
	924	*/
	925	if (num_inodedep > max_softdeps && firsttry &&
	926	speedup_syncer() == 0 && (flags & NODELAY) == 0 &&
	927	request_cleanup(FLUSH_INODES, 1)) {
	928	firsttry = 0;	/* the cleanup is attempted at most once per call */
	929	goto top;
	930	}
	931	if (sema_get(&inodedep_in_progress, &lk) == 0)
	932	goto top;	/* a 0 return means the hash chain must be searched again */
	933
	934	FREE_LOCK(&lk);	/* lk is not held across kmalloc() */
	935	inodedep = kmalloc(sizeof(struct inodedep), M_INODEDEP,
	936	M_SOFTDEP_FLAGS | M_ZERO);
	937	ACQUIRE_LOCK(&lk);
	938	if (inodedep_find(inodedephd, fs, inum)) {	/* an entry appeared while lk was dropped */
	939	kprintf("inodedep_lookup: blocking race avoided\n");
	940	sema_release(&inodedep_in_progress, &lk);
	941	kfree(inodedep, M_INODEDEP);
	942	goto top;
	943	}
	944	inodedep->id_list.wk_type = D_INODEDEP;
	945	inodedep->id_fs = fs;
	946	inodedep->id_ino = inum;
	947	inodedep->id_state = ALLCOMPLETE;
	948	inodedep->id_nlinkdelta = 0;
	949	inodedep->id_savedino = NULL;
	950	inodedep->id_savedsize = -1;
	951	inodedep->id_buf = NULL;
	952	LIST_INIT(&inodedep->id_pendinghd);
	953	LIST_INIT(&inodedep->id_inowait);
	954	LIST_INIT(&inodedep->id_bufwait);
	955	TAILQ_INIT(&inodedep->id_inoupdt);
	956	TAILQ_INIT(&inodedep->id_newinoupdt);
	957	num_inodedep += 1;
	958	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
	959	sema_release(&inodedep_in_progress, &lk);
	960	*inodedeppp = inodedep;
	961	return (0);	/* 0 == "was not found": the entry returned was created here */
	962	}
	963
	964	/*
	965	* Structures and routines associated with newblk caching.
	966	*/
	967	LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;	/* bucket array; set from hashinit() in softdep_initialize() */
	968	u_long newblk_hash; /* size of hash table - 1 */
	969	#define NEWBLK_HASH(fs, inum) \
	970	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
	971	static struct sema newblk_in_progress;	/* taken by newblk_lookup() around allocation of a new entry */
	972
	973	/*
	974	* Helper routine for newblk_lookup()
	975	*/
	976	static __inline
	977	struct newblk *
	978	newblk_find(struct newblk_hashhead *newblkhd, struct fs *fs,
	979	ufs_daddr_t newblkno)
	980	{
	981	struct newblk *newblk;
	982
	983	LIST_FOREACH(newblk, newblkhd, nb_hash) {	/* walk a single hash chain */
	984	if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
	985	return (newblk);
	986	}
	987	return(NULL);	/* no entry for (fs, newblkno) on this chain */
	988	}
	989
	990	/*
	991	* Look up a newblk. Return 1 if found, 0 if not found.
	992	* If not found, allocate if DEPALLOC flag is passed.
	993	* Found or allocated entry is returned in newblkpp.
	994	*/
	995	static int
	996	newblk_lookup(struct fs *fs, ufs_daddr_t newblkno, int flags,
	997	struct newblk **newblkpp)
	998	{
	999	struct newblk *newblk;
	1000	struct newblk_hashhead *newblkhd;
	1001
	1002	newblkhd = NEWBLK_HASH(fs, newblkno);
	1003	top:
	1004	*newblkpp = newblk_find(newblkhd, fs, newblkno);
	1005	if (*newblkpp)
	1006	return(1);
	1007	if ((flags & DEPALLOC) == 0)
	1008	return (0);	/* not found and no allocation requested; *newblkpp is NULL */
	1009	if (sema_get(&newblk_in_progress, NULL) == 0)
	1010	goto top;	/* a 0 return means the hash chain must be searched again */
	1011
	1012	newblk = kmalloc(sizeof(struct newblk), M_NEWBLK,
	1013	M_SOFTDEP_FLAGS | M_ZERO);
	1014
	1015	if (newblk_find(newblkhd, fs, newblkno)) {	/* an entry appeared during kmalloc() */
	1016	kprintf("newblk_lookup: blocking race avoided\n");
	1017	sema_release(&newblk_in_progress, NULL);	/* must pair with the sema_get() on newblk_in_progress above */
	1018	kfree(newblk, M_NEWBLK);
	1019	goto top;
	1020	}
	1021	newblk->nb_state = 0;
	1022	newblk->nb_fs = fs;
	1023	newblk->nb_newblkno = newblkno;
	1024	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
	1025	sema_release(&newblk_in_progress, NULL);
	1026	*newblkpp = newblk;
	1027	return (0);	/* 0 == "was not found": the entry returned was created here */
	1028	}
	1029
	1030	/*
	1031	* Executed during filesystem system initialization before
	1032	* mounting any filesystems.
	1033	*/
	1034	void
	1035	softdep_initialize(void)
	1036	{
	1037	LIST_INIT(&mkdirlisthd);
	1038	LIST_INIT(&softdep_workitem_pending);
	1039	max_softdeps = min(desiredvnodes * 8,
	1040	M_INODEDEP->ks_limit / (2 * sizeof(struct inodedep)));	/* smaller of 8 per desired vnode and half of ks_limit counted in inodedeps */
	1041	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
	1042	&pagedep_hash);
	1043	lockinit(&lk, "ffs_softdep", 0, LK_CANRECURSE);	/* lk is the lock asserted by the lookup routines in this file */
	1044	sema_init(&pagedep_in_progress, "pagedep", 0);
	1045	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
	1046	sema_init(&inodedep_in_progress, "inodedep", 0);
	1047	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);	/* fixed request of 64 buckets, unlike the vnode-scaled tables above */
	1048	sema_init(&newblk_in_progress, "newblk", 0);
	1049	add_bio_ops(&softdep_bioops);	/* NOTE(review): presumably registers the softdep buffer callbacks globally - confirm */
	1050	}
	1051
	1052	/*
	1053	* Called at mount time to notify the dependency code that a
	1054	* filesystem wishes to use it.
	1055	*/
	1056	int
	1057	softdep_mount(struct vnode *devvp, struct mount *mp, struct fs *fs)
	1058	{
	1059	struct csum cstotal;
	1060	struct cg *cgp;
	1061	struct buf *bp;
	1062	int error, cyl;
	1063
	1064	mp->mnt_flag &= ~MNT_ASYNC;	/* MNT_ASYNC is always cleared when soft updates are enabled */
	1065	mp->mnt_flag |= MNT_SOFTDEP;
	1066	mp->mnt_bioops = &softdep_bioops;
	1067	/*
	1068	* When doing soft updates, the counters in the
	1069	* superblock may have gotten out of sync, so we have
	1070	* to scan the cylinder groups and recalculate them.
	1071	*/
	1072	if (fs->fs_clean != 0)
	1073	return (0);	/* superblock is marked clean: skip the recount */
	1074	bzero(&cstotal, sizeof cstotal);
	1075	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
	1076	if ((error = bread(devvp, fsbtodoff(fs, cgtod(fs, cyl)),
	1077	fs->fs_cgsize, &bp)) != 0) {
	1078	brelse(bp);
	1079	return (error);	/* counters accumulated so far are discarded */
	1080	}
	1081	cgp = (struct cg *)bp->b_data;
	1082	cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
	1083	cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
	1084	cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
	1085	cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
	1086	fs->fs_cs(fs, cyl) = cgp->cg_cs;	/* refresh the per-group summary from the cylinder group block */
	1087	brelse(bp);
	1088	}
	1089	#ifdef DEBUG
	1090	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
	1091	kprintf("ffs_mountfs: superblock updated for soft updates\n");
	1092	#endif
	1093	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);	/* install the recomputed totals */
	1094	return (0);
	1095	}
	1096
	1097	/*
	1098	* Protecting the freemaps (or bitmaps).
	1099	*
	1100	* To eliminate the need to execute fsck before mounting a filesystem
	1101	* after a power failure, one must (conservatively) guarantee that the
	1102	* on-disk copy of the bitmaps never indicate that a live inode or block is
	1103	* free. So, when a block or inode is allocated, the bitmap should be
	1104	* updated (on disk) before any new pointers. When a block or inode is
	1105	* freed, the bitmap should not be updated until all pointers have been
	1106	* reset. The latter dependency is handled by the delayed de-allocation
	1107	* approach described below for block and inode de-allocation. The former
	1108	* dependency is handled by calling the following procedure when a block or
	1109	* inode is allocated. When an inode is allocated an "inodedep" is created
	1110	* with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
	1111	* Each "inodedep" is also inserted into the hash indexing structure so
	1112	* that any additional link additions can be made dependent on the inode
	1113	* allocation.
	1114	*
	1115	* The ufs filesystem maintains a number of free block counts (e.g., per
	1116	* cylinder group, per cylinder and per <cylinder, rotational position> pair)
	1117	* in addition to the bitmaps. These counts are used to improve efficiency
	1118	* during allocation and therefore must be consistent with the bitmaps.
	1119	* There is no convenient way to guarantee post-crash consistency of these
	1120	* counts with simple update ordering, for two main reasons: (1) The counts
	1121	* and bitmaps for a single cylinder group block are not in the same disk
	1122	* sector. If a disk write is interrupted (e.g., by power failure), one may
	1123	* be written and the other not. (2) Some of the counts are located in the
	1124	* superblock rather than the cylinder group block. So, we focus our soft
	1125	* updates implementation on protecting the bitmaps. When mounting a
	1126	* filesystem, we recompute the auxiliary counts from the bitmaps.
	1127	*/
	1128
	1129	/*
	1130	* Called just after updating the cylinder group block to allocate an inode.
	1131	*
	1132	* Parameters:
	1133	* bp: buffer for cylgroup block with inode map
	1134	* ip: inode related to allocation
	1135	* newinum: new inode number being allocated
	1136	*/
	1137	void
	1138	softdep_setup_inomapdep(struct buf *bp, struct inode *ip, ino_t newinum)
	1139	{
	1140	struct inodedep *inodedep;
	1141	struct bmsafemap *bmsafemap;
	1142
	1143	/*
	1144	* Create a dependency for the newly allocated inode.
	1145	* Panic if it already exists as something is seriously wrong.
	1146	* Otherwise add it to the dependency list for the buffer holding
	1147	* the cylinder group map from which it was allocated.
	1148	*/
	1149	ACQUIRE_LOCK(&lk);
	1150	if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC|NODELAY, &inodedep))) {
	1151	panic("softdep_setup_inomapdep: found inode");
	1152	}
	1153	inodedep->id_buf = bp;
	1154	inodedep->id_state &= ~DEPCOMPLETE;	/* new entries start ALLCOMPLETE; clear the bitmap-written bit */
	1155	bmsafemap = bmsafemap_lookup(bp);	/* may drop and retake lk internally */
	1156	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
	1157	FREE_LOCK(&lk);
	1158	}
	1159
	1160	/*
	1161	* Called just after updating the cylinder group block to
	1162	* allocate block or fragment.
	1163	*
	1164	* Parameters:
	1165	* bp: buffer for cylgroup block with block map
	1166	* fs: filesystem doing allocation
	1167	* newblkno: number of newly allocated block
	1168	*/
	1169	void
	1170	softdep_setup_blkmapdep(struct buf *bp, struct fs *fs,
	1171	ufs_daddr_t newblkno)
	1172	{
	1173	struct newblk *newblk;
	1174	struct bmsafemap *bmsafemap;
	1175
	1176	/*
	1177	* Create a dependency for the newly allocated block.
	1178	* Add it to the dependency list for the buffer holding
	1179	* the cylinder group map from which it was allocated.
	1180	*/
	1181	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)	/* called before lk is taken */
	1182	panic("softdep_setup_blkmapdep: found block");
	1183	ACQUIRE_LOCK(&lk);
	1184	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);	/* may drop and retake lk internally */
	1185	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
	1186	FREE_LOCK(&lk);
	1187	}
	1188
	1189	/*
	1190	* Find the bmsafemap associated with a cylinder group buffer.
	1191	* If none exists, create one. The buffer must be locked when
	1192	* this routine is called and this routine must be called with
	1193	* splbio interrupts blocked.
	1194	*/
	1195	static struct bmsafemap *
	1196	bmsafemap_lookup(struct buf *bp)
	1197	{
	1198	struct bmsafemap *bmsafemap;
	1199	struct worklist *wk;
	1200
	1201	KKASSERT(lock_held(&lk) > 0);	/* caller must already hold lk */
	1202
	1203	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
	1204	if (wk->wk_type == D_BMSAFEMAP)
	1205	return (WK_BMSAFEMAP(wk));	/* reuse the bmsafemap already attached to bp */
	1206	}
	1207	FREE_LOCK(&lk);	/* lk is not held across kmalloc() */
	1208	bmsafemap = kmalloc(sizeof(struct bmsafemap), M_BMSAFEMAP,
	1209	M_SOFTDEP_FLAGS);	/* no M_ZERO here: the fields below are set explicitly */
	1210	bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
	1211	bmsafemap->sm_list.wk_state = 0;
	1212	bmsafemap->sm_buf = bp;
	1213	LIST_INIT(&bmsafemap->sm_allocdirecthd);
	1214	LIST_INIT(&bmsafemap->sm_allocindirhd);
	1215	LIST_INIT(&bmsafemap->sm_inodedephd);
	1216	LIST_INIT(&bmsafemap->sm_newblkhd);
	1217	ACQUIRE_LOCK(&lk);	/* NOTE(review): b_dep is not re-scanned after lk is retaken; presumably the caller's buffer lock prevents a concurrent insert - confirm */
	1218	WORKLIST_INSERT_BP(bp, &bmsafemap->sm_list);
	1219	return (bmsafemap);
	1220	}
	1221
	1222	/*
	1223	* Direct block allocation dependencies.
	1224	*
	1225	* When a new block is allocated, the corresponding disk locations must be
	1226	* initialized (with zeros or new data) before the on-disk inode points to
	1227	* them. Also, the freemap from which the block was allocated must be
	1228	* updated (on disk) before the inode's pointer. These two dependencies are
	1229	* independent of each other and are needed for all file blocks and indirect
	1230	* blocks that are pointed to directly by the inode. Just before the
	1231	* "in-core" version of the inode is updated with a newly allocated block
	1232	* number, a procedure (below) is called to setup allocation dependency
	1233	* structures. These structures are removed when the corresponding
	1234	* dependencies are satisfied or when the block allocation becomes obsolete
	1235	* (i.e., the file is deleted, the block is de-allocated, or the block is a
	1236	* fragment that gets upgraded). All of these cases are handled in
	1237	* procedures described later.
	1238	*
	1239	* When a file extension causes a fragment to be upgraded, either to a larger
	1240	* fragment or to a full block, the on-disk location may change (if the
	1241	* previous fragment could not simply be extended). In this case, the old
	1242	* fragment must be de-allocated, but not until after the inode's pointer has
	1243	* been updated. In most cases, this is handled by later procedures, which
	1244	* will construct a "freefrag" structure to be added to the workitem queue
	1245	* when the inode update is complete (or obsolete). The main exception to
	1246	* this is when an allocation occurs while a pending allocation dependency
	1247	* (for the same block pointer) remains. This case is handled in the main
	1248	* allocation dependency setup procedure by immediately freeing the
	1249	* unreferenced fragments.
	1250	*
	1251	* Parameters:
	1252	* ip: inode to which block is being added
	1253	* lbn: block pointer within inode
	1254	* newblkno: disk block number being added
	1255	* oldblkno: previous block number, 0 unless frag
	1256	* newsize: size of new block
	1257	* oldsize: size of old block
	1258	* bp: bp for allocated block
	1259	*/
	1260	void
	1261	softdep_setup_allocdirect(struct inode *ip, ufs_lbn_t lbn, ufs_daddr_t newblkno,
	1262	ufs_daddr_t oldblkno, long newsize, long oldsize,
	1263	struct buf *bp)
	1264	{
	1265	struct allocdirect *adp, *oldadp;
	1266	struct allocdirectlst *adphead;
	1267	struct bmsafemap *bmsafemap;
	1268	struct inodedep *inodedep;
	1269	struct pagedep *pagedep;
	1270	struct newblk *newblk;
	1271
	1272	adp = kmalloc(sizeof(struct allocdirect), M_ALLOCDIRECT,
	1273	M_SOFTDEP_FLAGS | M_ZERO);
	1274	adp->ad_list.wk_type = D_ALLOCDIRECT;
	1275	adp->ad_lbn = lbn;
	1276	adp->ad_newblkno = newblkno;
	1277	adp->ad_oldblkno = oldblkno;
	1278	adp->ad_newsize = newsize;
	1279	adp->ad_oldsize = oldsize;
	1280	adp->ad_state = ATTACHED;
	1281	if (newblkno == oldblkno)
	1282	adp->ad_freefrag = NULL;	/* block number unchanged: no old fragment to free */
	1283	else
	1284	adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);	/* NULL when oldblkno is 0 */
	1285
	1286	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)	/* flags 0: look up only, never allocate */
	1287	panic("softdep_setup_allocdirect: lost block");
	1288
	1289	ACQUIRE_LOCK(&lk);
	1290	inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
	1291	adp->ad_inodedep = inodedep;
	1292
	1293	if (newblk->nb_state == DEPCOMPLETE) {
	1294	adp->ad_state |= DEPCOMPLETE;
	1295	adp->ad_buf = NULL;
	1296	} else {
	1297	bmsafemap = newblk->nb_bmsafemap;
	1298	adp->ad_buf = bmsafemap->sm_buf;
	1299	LIST_REMOVE(newblk, nb_deps);	/* the allocdirect takes the newblk's place on the bmsafemap */
	1300	LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
	1301	}
	1302	LIST_REMOVE(newblk, nb_hash);	/* the newblk is consumed here and freed */
	1303	kfree(newblk, M_NEWBLK);
	1304
	1305	WORKLIST_INSERT_BP(bp, &adp->ad_list);
	1306	if (lbn >= NDADDR) {
	1307	/* allocating an indirect block */
	1308	if (oldblkno != 0) {
	1309	panic("softdep_setup_allocdirect: non-zero indir");
	1310	}
	1311	} else {
	1312	/*
	1313	* Allocating a direct block.
	1314	*
	1315	* If we are allocating a directory block, then we must
	1316	* allocate an associated pagedep to track additions and
	1317	* deletions.
	1318	*/
	1319	if ((ip->i_mode & IFMT) == IFDIR &&
	1320	pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) {
	1321	WORKLIST_INSERT_BP(bp, &pagedep->pd_list);
	1322	}
	1323	}
	1324	/*
	1325	* The list of allocdirects must be kept in sorted and ascending
	1326	* order so that the rollback routines can quickly determine the
	1327	* first uncommitted block (the size of the file stored on disk
	1328	* ends at the end of the lowest committed fragment, or if there
	1329	* are no fragments, at the end of the highest committed block).
	1330	* Since files generally grow, the typical case is that the new
	1331	* block is to be added at the end of the list. We speed this
	1332	* special case by checking against the last allocdirect in the
	1333	* list before laboriously traversing the list looking for the
	1334	* insertion point.
	1335	*/
	1336	adphead = &inodedep->id_newinoupdt;
	1337	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	1338	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
	1339	/* insert at end of list */
	1340	TAILQ_INSERT_TAIL(adphead, adp, ad_next);
	1341	if (oldadp != NULL && oldadp->ad_lbn == lbn)
	1342	allocdirect_merge(adphead, adp, oldadp);	/* same lbn: fold the old entry into the new one */
	1343	FREE_LOCK(&lk);
	1344	return;
	1345	}
	1346	TAILQ_FOREACH(oldadp, adphead, ad_next) {
	1347	if (oldadp->ad_lbn >= lbn)
	1348	break;
	1349	}
	1350	if (oldadp == NULL) {
	1351	panic("softdep_setup_allocdirect: lost entry");
	1352	}
	1353	/* insert in middle of list */
	1354	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
	1355	if (oldadp->ad_lbn == lbn)
	1356	allocdirect_merge(adphead, adp, oldadp);
	1357	FREE_LOCK(&lk);
	1358	}
	1359
	1360	/*
	1361	* Replace an old allocdirect dependency with a newer one.
	1362	* This routine must be called with splbio interrupts blocked.
	1363	*
	1364	* Parameters:
	1365	* adphead: head of list holding allocdirects
	1366	* newadp: allocdirect being added
	1367	* oldadp: existing allocdirect being checked
	1368	*/
	1369	static void
	1370	allocdirect_merge(struct allocdirectlst *adphead,
	1371	struct allocdirect *newadp,
	1372	struct allocdirect *oldadp)
	1373	{
	1374	struct freefrag *freefrag;
	1375
	1376	KKASSERT(lock_held(&lk) > 0);	/* caller must already hold lk */
	1377
	1378	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
	1379	newadp->ad_oldsize != oldadp->ad_newsize ||
	1380	newadp->ad_lbn >= NDADDR) {
	1381	panic("allocdirect_merge: old %d != new %d || lbn %ld >= %d",
	1382	newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
	1383	NDADDR);
	1384	}
	1385	newadp->ad_oldblkno = oldadp->ad_oldblkno;	/* the new entry inherits the older "previous" block and size */
	1386	newadp->ad_oldsize = oldadp->ad_oldsize;
	1387	/*
	1388	* If the old dependency had a fragment to free or had never
	1389	* previously had a block allocated, then the new dependency
	1390	* can immediately post its freefrag and adopt the old freefrag.
	1391	* This action is done by swapping the freefrag dependencies.
	1392	* The new dependency gains the old one's freefrag, and the
	1393	* old one gets the new one and then immediately puts it on
	1394	* the worklist when it is freed by free_allocdirect. It is
	1395	* not possible to do this swap when the old dependency had a
	1396	* non-zero size but no previous fragment to free. This condition
	1397	* arises when the new block is an extension of the old block.
	1398	* Here, the first part of the fragment allocated to the new
	1399	* dependency is part of the block currently claimed on disk by
	1400	* the old dependency, so cannot legitimately be freed until the
	1401	* conditions for the new dependency are fulfilled.
	1402	*/
	1403	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
	1404	freefrag = newadp->ad_freefrag;
	1405	newadp->ad_freefrag = oldadp->ad_freefrag;
	1406	oldadp->ad_freefrag = freefrag;
	1407	}
	1408	free_allocdirect(adphead, oldadp, 0);
	1409	}
	1410
	1411	/*
	1412	* Allocate a new freefrag structure if needed.
	1413	*/
	1414	static struct freefrag *
	1415	newfreefrag(struct inode *ip, ufs_daddr_t blkno, long size)
	1416	{
	1417	struct freefrag *freefrag;
	1418	struct fs *fs;
	1419
	1420	if (blkno == 0)
	1421	return (NULL);	/* no previous block, so nothing will need freeing */
	1422	fs = ip->i_fs;
	1423	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)	/* fragment run must fit inside one block */
	1424	panic("newfreefrag: frag size");
	1425	freefrag = kmalloc(sizeof(struct freefrag), M_FREEFRAG,
	1426	M_SOFTDEP_FLAGS);	/* no M_ZERO here: the fields below are set explicitly */
	1427	freefrag->ff_list.wk_type = D_FREEFRAG;
	1428	freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */
	1429	freefrag->ff_inum = ip->i_number;
	1430	freefrag->ff_fs = fs;
	1431	freefrag->ff_devvp = ip->i_devvp;
	1432	freefrag->ff_blkno = blkno;
	1433	freefrag->ff_fragsize = size;
	1434	return (freefrag);
	1435	}
	1436
	1437	/*
	1438	* This workitem de-allocates fragments that were replaced during
	1439	* file block allocation.
	1440	*/
	1441	static void
	1442	handle_workitem_freefrag(struct freefrag *freefrag)
	1443	{
	1444	struct inode tip;	/* temporary on-stack inode; only the fields assigned below are initialised */
	1445
	1446	tip.i_fs = freefrag->ff_fs;
	1447	tip.i_devvp = freefrag->ff_devvp;
	1448	tip.i_dev = freefrag->ff_devvp->v_rdev;
	1449	tip.i_number = freefrag->ff_inum;
	1450	tip.i_uid = freefrag->ff_state & ~ONWORKLIST; /* XXX - set above */
	1451	ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);	/* NOTE(review): presumably reads only the fields set above - confirm */
	1452	kfree(freefrag, M_FREEFRAG);	/* the work item is consumed here */
	1453	}
	1454
	1455	/*
	1456	* Indirect block allocation dependencies.
	1457	*
	1458	* The same dependencies that exist for a direct block also exist when
	1459	* a new block is allocated and pointed to by an entry in a block of
	1460	* indirect pointers. The undo/redo states described above are also
	1461	* used here. Because an indirect block contains many pointers that
	1462	* may have dependencies, a second copy of the entire in-memory indirect
	1463	* block is kept. The buffer cache copy is always completely up-to-date.
	1464	* The second copy, which is used only as a source for disk writes,
	1465	* contains only the safe pointers (i.e., those that have no remaining
	1466	* update dependencies). The second copy is freed when all pointers
	1467	* are safe. The cache is not allowed to replace indirect blocks with
	1468	* pending update dependencies. If a buffer containing an indirect
	1469	* block with dependencies is written, these routines will mark it
	1470	* dirty again. It can only be successfully written once all the
	1471	* dependencies are removed. The ffs_fsync routine in conjunction with
	1472	* softdep_sync_metadata work together to get all the dependencies
	1473	* removed so that a file can be successfully written to disk. Three
	1474	* procedures are used when setting up indirect block pointer
	1475	* dependencies. The division is necessary because of the organization
	1476	* of the "balloc" routine and because of the distinction between file
	1477	* pages and file metadata blocks.
	1478	*/
	1479
	1480	/*
	1481	* Allocate a new allocindir structure.
	1482	*
	1483	* Parameters:
	1484	* ip: inode for file being extended
	1485	* ptrno: offset of pointer in indirect block
	1486	* newblkno: disk block number being added
	1487	* oldblkno: previous block number, 0 if none
	1488	*/
	1489	static struct allocindir *
	1490	newallocindir(struct inode *ip, int ptrno, ufs_daddr_t newblkno,
	1491	ufs_daddr_t oldblkno)
	1492	{
	1493	struct allocindir *aip;
	1494
	1495	aip = kmalloc(sizeof(struct allocindir), M_ALLOCINDIR,
	1496	M_SOFTDEP_FLAGS | M_ZERO);
	1497	aip->ai_list.wk_type = D_ALLOCINDIR;
	1498	aip->ai_state = ATTACHED;
	1499	aip->ai_offset = ptrno;
	1500	aip->ai_newblkno = newblkno;
	1501	aip->ai_oldblkno = oldblkno;
	1502	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);	/* NULL when oldblkno is 0; size is always a full block */
	1503	return (aip);
	1504	}
	1505
	1506	/*
	1507	* Called just before setting an indirect block pointer
	1508	* to a newly allocated file page.
	1509	*
	1510	* Parameters:
	1511	* ip: inode for file being extended
	1512	* lbn: allocated block number within file
	1513	* bp: buffer with indirect blk referencing page
	1514	* ptrno: offset of pointer in indirect block
	1515	* newblkno: disk block number being added
	1516	* oldblkno: previous block number, 0 if none
	1517	* nbp: buffer holding allocated page
	1518	*/
	1519	void
	1520	softdep_setup_allocindir_page(struct inode *ip, ufs_lbn_t lbn,
	1521	struct buf *bp, int ptrno,
	1522	ufs_daddr_t newblkno, ufs_daddr_t oldblkno,
	1523	struct buf *nbp)
	1524	{
	1525	struct allocindir *aip;
	1526	struct pagedep *pagedep;
	1527
	1528	aip = newallocindir(ip, ptrno, newblkno, oldblkno);	/* allocated before lk is taken */
	1529	ACQUIRE_LOCK(&lk);
	1530	/*
	1531	* If we are allocating a directory page, then we must
	1532	* allocate an associated pagedep to track additions and
	1533	* deletions.
	1534	*/
	1535	if ((ip->i_mode & IFMT) == IFDIR &&
	1536	pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
	1537	WORKLIST_INSERT_BP(nbp, &pagedep->pd_list);	/* only when pagedep_lookup() returned 0 */
	1538	WORKLIST_INSERT_BP(nbp, &aip->ai_list);
	1539	FREE_LOCK(&lk);	/* released before phase 2, which takes lk itself */
	1540	setup_allocindir_phase2(bp, ip, aip);
	1541	}
	1542
	1543	/*
	1544	* Called just before setting an indirect block pointer to a
	1545	* newly allocated indirect block.
	1546	* Parameters:
	1547	* nbp: newly allocated indirect block
	1548	* ip: inode for file being extended
	1549	* bp: indirect block referencing allocated block
	1550	* ptrno: offset of pointer in indirect block
	1551	* newblkno: disk block number being added
	1552	*/
	1553	void
	1554	softdep_setup_allocindir_meta(struct buf *nbp, struct inode *ip,
	1555	struct buf *bp, int ptrno,
	1556	ufs_daddr_t newblkno)
	1557	{
	1558	struct allocindir *aip;
	1559
	1560	aip = newallocindir(ip, ptrno, newblkno, 0);	/* oldblkno is 0, so no freefrag is created */
	1561	ACQUIRE_LOCK(&lk);
	1562	WORKLIST_INSERT_BP(nbp, &aip->ai_list);
	1563	FREE_LOCK(&lk);	/* released before phase 2, which takes lk itself */
	1564	setup_allocindir_phase2(bp, ip, aip);
	1565	}
	1566
	1567	/*
	1568	* Called to finish the allocation of the "aip" allocated
	1569	* by one of the two routines above.
	1570	*
	1571	* Parameters:
	1572	* bp: in-memory copy of the indirect block
	1573	* ip: inode for file being extended
	1574	* aip: allocindir allocated by the above routines
	1575	*/
	1576	static void
	1577	setup_allocindir_phase2(struct buf *bp, struct inode *ip,
	1578	struct allocindir *aip)
	1579	{
	1580	struct worklist *wk;
	1581	struct indirdep *indirdep, *newindirdep;
	1582	struct bmsafemap *bmsafemap;
	1583	struct allocindir *oldaip;
	1584	struct freefrag *freefrag;
	1585	struct newblk *newblk;
	1586
	1587	if (bp->b_loffset >= 0)
	1588	panic("setup_allocindir_phase2: not indir blk");
	1589	for (indirdep = NULL, newindirdep = NULL; ; ) {	/* loops until an indirdep is attached to bp */
	1590	ACQUIRE_LOCK(&lk);
	1591	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
	1592	if (wk->wk_type != D_INDIRDEP)
	1593	continue;
	1594	indirdep = WK_INDIRDEP(wk);
	1595	break;
	1596	}
	1597	if (indirdep == NULL && newindirdep) {	/* none attached yet: install the one built on the previous pass */
	1598	indirdep = newindirdep;
	1599	WORKLIST_INSERT_BP(bp, &indirdep->ir_list);
	1600	newindirdep = NULL;
	1601	}
	1602	FREE_LOCK(&lk);
	1603	if (indirdep) {
	1604	if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
	1605	&newblk) == 0)
	1606	panic("setup_allocindir: lost block");
	1607	ACQUIRE_LOCK(&lk);
	1608	if (newblk->nb_state == DEPCOMPLETE) {
	1609	aip->ai_state |= DEPCOMPLETE;
	1610	aip->ai_buf = NULL;
	1611	} else {
	1612	bmsafemap = newblk->nb_bmsafemap;
	1613	aip->ai_buf = bmsafemap->sm_buf;
	1614	LIST_REMOVE(newblk, nb_deps);	/* the allocindir takes the newblk's place on the bmsafemap */
	1615	LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
	1616	aip, ai_deps);
	1617	}
	1618	LIST_REMOVE(newblk, nb_hash);	/* the newblk is consumed here and freed */
	1619	kfree(newblk, M_NEWBLK);
	1620	aip->ai_indirdep = indirdep;
	1621	/*
	1622	* Check to see if there is an existing dependency
	1623	* for this block. If there is, merge the old
	1624	* dependency into the new one.
	1625	*/
	1626	if (aip->ai_oldblkno == 0)
	1627	oldaip = NULL;
	1628	else
	1629
	1630	LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
	1631	if (oldaip->ai_offset == aip->ai_offset)
	1632	break;
	1633	if (oldaip != NULL) {
	1634	if (oldaip->ai_newblkno != aip->ai_oldblkno) {
	1635	panic("setup_allocindir_phase2: blkno");
	1636	}
	1637	aip->ai_oldblkno = oldaip->ai_oldblkno;
	1638	freefrag = oldaip->ai_freefrag;	/* swap freefrags between the old and new entries */
	1639	oldaip->ai_freefrag = aip->ai_freefrag;
	1640	aip->ai_freefrag = freefrag;
	1641	free_allocindir(oldaip, NULL);
	1642	}
	1643	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
	1644	((ufs_daddr_t *)indirdep->ir_savebp->b_data)
	1645	[aip->ai_offset] = aip->ai_oldblkno;	/* the saved copy keeps the old pointer value for this slot */
	1646	FREE_LOCK(&lk);
	1647	}
	1648	if (newindirdep) {	/* a spare was built but another indirdep was found attached */
	1649	/*
	1650	* Avoid any possibility of data corruption by
	1651	* ensuring that our old version is thrown away.
	1652	*/
	1653	newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
	1654	brelse(newindirdep->ir_savebp);
	1655	WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
	1656	}
	1657	if (indirdep)
	1658	break;
	1659	newindirdep = kmalloc(sizeof(struct indirdep), M_INDIRDEP,
	1660	M_SOFTDEP_FLAGS);	/* allocated with lk released; attached on the next pass */
	1661	newindirdep->ir_list.wk_type = D_INDIRDEP;
	1662	newindirdep->ir_state = ATTACHED;
	1663	LIST_INIT(&newindirdep->ir_deplisthd);
	1664	LIST_INIT(&newindirdep->ir_donehd);
	1665	if (bp->b_bio2.bio_offset == NOOFFSET) {
	1666	VOP_BMAP(bp->b_vp, bp->b_bio1.bio_offset,
	1667	&bp->b_bio2.bio_offset, NULL, NULL,
	1668	BUF_CMD_WRITE);
	1669	}
	1670	KKASSERT(bp->b_bio2.bio_offset != NOOFFSET);
	1671	newindirdep->ir_savebp = getblk(ip->i_devvp,
	1672	bp->b_bio2.bio_offset,
	1673	bp->b_bcount, 0, 0);
	1674	BUF_KERNPROC(newindirdep->ir_savebp);
	1675	bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);	/* the save buffer starts as a full copy of bp */
	1676	}
	1677	}
	1678
	1679	/*
	1680	* Block de-allocation dependencies.
	1681	*
	1682	* When blocks are de-allocated, the on-disk pointers must be nullified before
	1683	* the blocks are made available for use by other files. (The true
	1684	* requirement is that old pointers must be nullified before new on-disk
	1685	* pointers are set. We chose this slightly more stringent requirement to
	1686	* reduce complexity.) Our implementation handles this dependency by updating
	1687	* the inode (or indirect block) appropriately but delaying the actual block
	1688	* de-allocation (i.e., freemap and free space count manipulation) until
	1689	* after the updated versions reach stable storage. After the disk is
	1690	* updated, the blocks can be safely de-allocated whenever it is convenient.
	1691	* This implementation handles only the common case of reducing a file's
	1692	* length to zero. Other cases are handled by the conventional synchronous
	1693	* write approach.
	1694	*
	1695	* The ffs implementation with which we worked double-checks
	1696	* the state of the block pointers and file size as it reduces
	1697	* a file's length. Some of this code is replicated here in our
	1698	* soft updates implementation. The freeblks->fb_chkcnt field is
	1699	* used to transfer a part of this information to the procedure
	1700	* that eventually de-allocates the blocks.
	1701	*
	1702	* This routine should be called from the routine that shortens
	1703	* a file's length, before the inode's size or block pointers
	1704	* are modified. It will save the block pointer information for
	1705	* later release and zero the inode so that the calling routine
	1706	* can release it.
	1707	*/
	1708	struct softdep_setup_freeblocks_info {	/* argument bundle handed to the RB_SCAN callback below */
	1709	struct fs *fs;
	1710	struct inode *ip;
	1711	};
	1712
	1713	static int softdep_setup_freeblocks_bp(struct buf *bp, void *data);
	1714
	1715	/*
	1716	* Parameters:
	1717	* ip: The inode whose length is to be reduced
	1718	* length: The new length for the file
	1719	*/
	1720	void
	1721	softdep_setup_freeblocks(struct inode *ip, off_t length)
	1722	{
	1723	struct softdep_setup_freeblocks_info info;
	1724	struct freeblks *freeblks;
	1725	struct inodedep *inodedep;
	1726	struct allocdirect *adp;
	1727	struct vnode *vp;
	1728	struct buf *bp;
	1729	struct fs *fs;
	1730	int i, error, delay;
	1731	int count;
	1732
	1733	fs = ip->i_fs;
	1734	if (length != 0)
	1735	panic("softdep_setup_freeblocks: non-zero length");
	1736	freeblks = kmalloc(sizeof(struct freeblks), M_FREEBLKS,
	1737	M_SOFTDEP_FLAGS | M_ZERO);
	1738	freeblks->fb_list.wk_type = D_FREEBLKS;
	1739	freeblks->fb_state = ATTACHED;
	1740	freeblks->fb_uid = ip->i_uid;
	1741	freeblks->fb_previousinum = ip->i_number;
	1742	freeblks->fb_devvp = ip->i_devvp;
	1743	freeblks->fb_fs = fs;
	1744	freeblks->fb_oldsize = ip->i_size;
	1745	freeblks->fb_newsize = length;
	1746	freeblks->fb_chkcnt = ip->i_blocks;
	1747	for (i = 0; i < NDADDR; i++) {	/* move the direct pointers into freeblks and zero them in the inode */
	1748	freeblks->fb_dblks[i] = ip->i_db[i];
	1749	ip->i_db[i] = 0;
	1750	}
	1751	for (i = 0; i < NIADDR; i++) {	/* likewise for the indirect pointers */
	1752	freeblks->fb_iblks[i] = ip->i_ib[i];
	1753	ip->i_ib[i] = 0;
	1754	}
	1755	ip->i_blocks = 0;
	1756	ip->i_size = 0;
	1757	/*
	1758	* Push the zero'ed inode to its disk buffer so that we are free
	1759	* to delete its dependencies below. Once the dependencies are gone
	1760	* the buffer can be safely released.
	1761	*/
	1762	if ((error = bread(ip->i_devvp,
	1763	fsbtodoff(fs, ino_to_fsba(fs, ip->i_number)),
	1764	(int)fs->fs_bsize, &bp)) != 0)
	1765	softdep_error("softdep_setup_freeblocks", error);	/* NOTE(review): bp is still used below after a failed bread() - confirm this is intended */
	1766	*((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
	1767	ip->i_din;
	1768	/*
	1769	* Find and eliminate any inode dependencies.
	1770	*/
	1771	ACQUIRE_LOCK(&lk);
	1772	(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
	1773	if ((inodedep->id_state & IOSTARTED) != 0) {
	1774	panic("softdep_setup_freeblocks: inode busy");
	1775	}
	1776	/*
	1777	* Add the freeblks structure to the list of operations that
	1778	* must await the zero'ed inode being written to disk. If we
	1779	* still have a bitmap dependency (delay == 0), then the inode
	1780	* has never been written to disk, so we can process the
	1781	* freeblks below once we have deleted the dependencies.
	1782	*/
	1783	delay = (inodedep->id_state & DEPCOMPLETE);
	1784	if (delay)
	1785	WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
	1786	/*
	1787	* Because the file length has been truncated to zero, any
	1788	* pending block allocation dependency structures associated
	1789	* with this inode are obsolete and can simply be de-allocated.
	1790	* We must first merge the two dependency lists to get rid of
	1791	* any duplicate freefrag structures, then purge the merged list.
	1792	*/
	1793	merge_inode_lists(inodedep);
	1794	while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
	1795	free_allocdirect(&inodedep->id_inoupdt, adp, 1);
	1796	FREE_LOCK(&lk);
	1797	bdwrite(bp);
	1798	/*
	1799	* We must wait for any I/O in progress to finish so that
	1800	* all potential buffers on the dirty list will be visible.
	1801	* Once they are all there, walk the list and get rid of
	1802	* any dependencies.
	1803	*/
	1804	vp = ITOV(ip);
	1805	ACQUIRE_LOCK(&lk);
	1806	drain_output(vp, 1);
	1807
	1808	info.fs = fs;
	1809	info.ip = ip;
	1810	lwkt_gettoken(&vp->v_token);
	1811	do {	/* rescan the dirty tree until a pass returns a count of 0 */
	1812	count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
	1813	softdep_setup_freeblocks_bp, &info);
	1814	} while (count != 0);
	1815	lwkt_reltoken(&vp->v_token);
	1816
	1817	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)	/* flags 0: look up only, never allocate */
	1818	(void)free_inodedep(inodedep);
	1819
	1820	if (delay) {
	1821	freeblks->fb_state |= DEPCOMPLETE;
	1822	/*
	1823	* If the inode with zeroed block pointers is now on disk
	1824	* we can start freeing blocks. Add freeblks to the worklist
	1825	* instead of calling handle_workitem_freeblocks directly as
	1826	* it is more likely that additional IO is needed to complete
	1827	* the request here than in the !delay case.
	1828	*/
	1829	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
	1830	add_to_worklist(&freeblks->fb_list);
	1831	}
	1832
	1833	FREE_LOCK(&lk);
	1834	/*
	1835	* If the inode has never been written to disk (delay == 0),
	1836	* then we can process the freeblks now that we have deleted
	1837	* the dependencies.
	1838	*/
	1839	if (!delay)
	1840	handle_workitem_freeblocks(freeblks);
	1841	}
	1842
	1843	static int
	1844	softdep_setup_freeblocks_bp(struct buf bp, void data)
	1845	{
	1846	struct softdep_setup_freeblocks_info *info = data;
	1847	struct inodedep *inodedep;
	1848
	1849	if (getdirtybuf(&bp, MNT_WAIT) == 0) {
	1850	kprintf("softdep_setup_freeblocks_bp(1): caught bp %p going away\n", bp);
	1851	return(-1);
	1852	}
	1853	if (bp->b_vp != ITOV(info->ip) \|\| (bp->b_flags & B_DELWRI) == 0) {
	1854	kprintf("softdep_setup_freeblocks_bp(2): caught bp %p going away\n", bp);
	1855	BUF_UNLOCK(bp);
	1856	return(-1);
	1857	}
	1858	(void) inodedep_lookup(info->fs, info->ip->i_number, 0, &inodedep);
	1859	deallocate_dependencies(bp, inodedep);
	1860	bp->b_flags \|= B_INVAL \| B_NOCACHE;
	1861	FREE_LOCK(&lk);
	1862	brelse(bp);
	1863	ACQUIRE_LOCK(&lk);
	1864	return(1);
	1865	}
	1866
	1867	/*
	1868	* Reclaim any dependency structures from a buffer that is about to
	1869	* be reallocated to a new vnode. The buffer must be locked, thus,
	1870	* no I/O completion operations can occur while we are manipulating
	1871	* its associated dependencies. The mutex is held so that other I/O's
	1872	* associated with related dependencies do not occur.
	1873	*/
	1874	static void
	1875	deallocate_dependencies(struct buf bp, struct inodedep inodedep)
	1876	{
	1877	struct worklist *wk;
	1878	struct indirdep *indirdep;
	1879	struct allocindir *aip;
	1880	struct pagedep *pagedep;
	1881	struct dirrem *dirrem;
	1882	struct diradd *dap;
	1883	int i;
	1884
	1885	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
	1886	switch (wk->wk_type) {
	1887
	1888	case D_INDIRDEP:
	1889	indirdep = WK_INDIRDEP(wk);
	1890	/*
	1891	* None of the indirect pointers will ever be visible,
	1892	* so they can simply be tossed. GOINGAWAY ensures
	1893	* that allocated pointers will be saved in the buffer
	1894	* cache until they are freed. Note that they will
	1895	* only be able to be found by their physical address
	1896	* since the inode mapping the logical address will
	1897	* be gone. The save buffer used for the safe copy
	1898	* was allocated in setup_allocindir_phase2 using
	1899	* the physical address so it could be used for this
	1900	* purpose. Hence we swap the safe copy with the real
	1901	* copy, allowing the safe copy to be freed and holding
	1902	* on to the real copy for later use in indir_trunc.
	1903	*
	1904	* NOTE: ir_savebp is relative to the block device
	1905	* so b_bio1 contains the device block number.
	1906	*/
	1907	if (indirdep->ir_state & GOINGAWAY) {
	1908	panic("deallocate_dependencies: already gone");
	1909	}
	1910	indirdep->ir_state \|= GOINGAWAY;
	1911	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != NULL)
	1912	free_allocindir(aip, inodedep);
	1913	if (bp->b_bio1.bio_offset >= 0 \|\|
	1914	bp->b_bio2.bio_offset != indirdep->ir_savebp->b_bio1.bio_offset) {
	1915	panic("deallocate_dependencies: not indir");
	1916	}
	1917	bcopy(bp->b_data, indirdep->ir_savebp->b_data,
	1918	bp->b_bcount);
	1919	WORKLIST_REMOVE(wk);
	1920	WORKLIST_INSERT_BP(indirdep->ir_savebp, wk);
	1921	continue;
	1922
	1923	case D_PAGEDEP:
	1924	pagedep = WK_PAGEDEP(wk);
	1925	/*
	1926	* None of the directory additions will ever be
	1927	* visible, so they can simply be tossed.
	1928	*/
	1929	for (i = 0; i < DAHASHSZ; i++)
	1930	while ((dap =
	1931	LIST_FIRST(&pagedep->pd_diraddhd[i])))
	1932	free_diradd(dap);
	1933	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
	1934	free_diradd(dap);
	1935	/*
	1936	* Copy any directory remove dependencies to the list
	1937	* to be processed after the zero'ed inode is written.
	1938	* If the inode has already been written, then they
	1939	* can be dumped directly onto the work list.
	1940	*/
	1941	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
	1942	LIST_REMOVE(dirrem, dm_next);
	1943	dirrem->dm_dirinum = pagedep->pd_ino;
	1944	if (inodedep == NULL \|\|
	1945	(inodedep->id_state & ALLCOMPLETE) ==
	1946	ALLCOMPLETE)
	1947	add_to_worklist(&dirrem->dm_list);
	1948	else
	1949	WORKLIST_INSERT(&inodedep->id_bufwait,
	1950	&dirrem->dm_list);
	1951	}
	1952	WORKLIST_REMOVE(&pagedep->pd_list);
	1953	LIST_REMOVE(pagedep, pd_hash);
	1954	WORKITEM_FREE(pagedep, D_PAGEDEP);
	1955	continue;
	1956
	1957	case D_ALLOCINDIR:
	1958	free_allocindir(WK_ALLOCINDIR(wk), inodedep);
	1959	continue;
	1960
	1961	case D_ALLOCDIRECT:
	1962	case D_INODEDEP:
	1963	panic("deallocate_dependencies: Unexpected type %s",
	1964	TYPENAME(wk->wk_type));
	1965	/* NOTREACHED */
	1966
	1967	default:
	1968	panic("deallocate_dependencies: Unknown type %s",
	1969	TYPENAME(wk->wk_type));
	1970	/* NOTREACHED */
	1971	}
	1972	}
	1973	}
	1974
	1975	/*
	1976	* Free an allocdirect. Generate a new freefrag work request if appropriate.
	1977	* This routine must be called with splbio interrupts blocked.
	1978	*/
	1979	static void
	1980	free_allocdirect(struct allocdirectlst *adphead,
	1981	struct allocdirect *adp, int delay)
	1982	{
	1983	KKASSERT(lock_held(&lk) > 0);
	1984
	1985	if ((adp->ad_state & DEPCOMPLETE) == 0)
	1986	LIST_REMOVE(adp, ad_deps);
	1987	TAILQ_REMOVE(adphead, adp, ad_next);
	1988	if ((adp->ad_state & COMPLETE) == 0)
	1989	WORKLIST_REMOVE(&adp->ad_list);
	1990	if (adp->ad_freefrag != NULL) {
	1991	if (delay)
	1992	WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
	1993	&adp->ad_freefrag->ff_list);
	1994	else
	1995	add_to_worklist(&adp->ad_freefrag->ff_list);
	1996	}
	1997	WORKITEM_FREE(adp, D_ALLOCDIRECT);
	1998	}
	1999
	2000	/*
	2001	* Prepare an inode to be freed. The actual free operation is not
	2002	* done until the zero'ed inode has been written to disk.
	2003	*/
	2004	void
	2005	softdep_freefile(struct vnode *pvp, ino_t ino, int mode)
	2006	{
	2007	struct inode *ip = VTOI(pvp);
	2008	struct inodedep *inodedep;
	2009	struct freefile *freefile;
	2010
	2011	/*
	2012	* This sets up the inode de-allocation dependency.
	2013	*/
	2014	freefile = kmalloc(sizeof(struct freefile), M_FREEFILE,
	2015	M_SOFTDEP_FLAGS);
	2016	freefile->fx_list.wk_type = D_FREEFILE;
	2017	freefile->fx_list.wk_state = 0;
	2018	freefile->fx_mode = mode;
	2019	freefile->fx_oldinum = ino;
	2020	freefile->fx_devvp = ip->i_devvp;
	2021	freefile->fx_fs = ip->i_fs;
	2022
	2023	/*
	2024	* If the inodedep does not exist, then the zero'ed inode has
	2025	* been written to disk. If the allocated inode has never been
	2026	* written to disk, then the on-disk inode is zero'ed. In either
	2027	* case we can free the file immediately.
	2028	*/
	2029	ACQUIRE_LOCK(&lk);
	2030	if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 \|\|
	2031	check_inode_unwritten(inodedep)) {
	2032	FREE_LOCK(&lk);
	2033	handle_workitem_freefile(freefile);
	2034	return;
	2035	}
	2036	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
	2037	FREE_LOCK(&lk);
	2038	}
	2039
	2040	/*
	2041	* Check to see if an inode has never been written to disk. If
	2042	* so free the inodedep and return success, otherwise return failure.
	2043	* This routine must be called with splbio interrupts blocked.
	2044	*
	2045	* If we still have a bitmap dependency, then the inode has never
	2046	* been written to disk. Drop the dependency as it is no longer
	2047	* necessary since the inode is being deallocated. We set the
	2048	* ALLCOMPLETE flags since the bitmap now properly shows that the
	2049	* inode is not allocated. Even if the inode is actively being
	2050	* written, it has been rolled back to its zero'ed state, so we
	2051	* are ensured that a zero inode is what is on the disk. For short
	2052	* lived files, this change will usually result in removing all the
	2053	* dependencies from the inode so that it can be freed immediately.
	2054	*/
	2055	static int
	2056	check_inode_unwritten(struct inodedep *inodedep)
	2057	{
	2058
	2059	if ((inodedep->id_state & DEPCOMPLETE) != 0 \|\|
	2060	LIST_FIRST(&inodedep->id_pendinghd) != NULL \|\|
	2061	LIST_FIRST(&inodedep->id_bufwait) != NULL \|\|
	2062	LIST_FIRST(&inodedep->id_inowait) != NULL \|\|
	2063	TAILQ_FIRST(&inodedep->id_inoupdt) != NULL \|\|
	2064	TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL \|\|
	2065	inodedep->id_nlinkdelta != 0)
	2066	return (0);
	2067
	2068	/*
	2069	* Another process might be in initiate_write_inodeblock
	2070	* trying to allocate memory without holding "Softdep Lock".
	2071	*/
	2072	if ((inodedep->id_state & IOSTARTED) != 0 &&
	2073	inodedep->id_savedino == NULL)
	2074	return(0);
	2075
	2076	inodedep->id_state \|= ALLCOMPLETE;
	2077	LIST_REMOVE(inodedep, id_deps);
	2078	inodedep->id_buf = NULL;
	2079	if (inodedep->id_state & ONWORKLIST)
	2080	WORKLIST_REMOVE(&inodedep->id_list);
	2081	if (inodedep->id_savedino != NULL) {
	2082	kfree(inodedep->id_savedino, M_INODEDEP);
	2083	inodedep->id_savedino = NULL;
	2084	}
	2085	if (free_inodedep(inodedep) == 0) {
	2086	panic("check_inode_unwritten: busy inode");
	2087	}
	2088	return (1);
	2089	}
	2090
	2091	/*
	2092	* Try to free an inodedep structure. Return 1 if it could be freed.
	2093	*/
	2094	static int
	2095	free_inodedep(struct inodedep *inodedep)
	2096	{
	2097
	2098	if ((inodedep->id_state & ONWORKLIST) != 0 \|\|
	2099	(inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE \|\|
	2100	LIST_FIRST(&inodedep->id_pendinghd) != NULL \|\|
	2101	LIST_FIRST(&inodedep->id_bufwait) != NULL \|\|
	2102	LIST_FIRST(&inodedep->id_inowait) != NULL \|\|
	2103	TAILQ_FIRST(&inodedep->id_inoupdt) != NULL \|\|
	2104	TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL \|\|
	2105	inodedep->id_nlinkdelta != 0 \|\| inodedep->id_savedino != NULL)
	2106	return (0);
	2107	LIST_REMOVE(inodedep, id_hash);
	2108	WORKITEM_FREE(inodedep, D_INODEDEP);
	2109	num_inodedep -= 1;
	2110	return (1);
	2111	}
	2112
	2113	/*
	2114	* This workitem routine performs the block de-allocation.
	2115	* The workitem is added to the pending list after the updated
	2116	* inode block has been written to disk. As mentioned above,
	2117	* checks regarding the number of blocks de-allocated (compared
	2118	* to the number of blocks allocated for the file) are also
	2119	* performed in this function.
	2120	*/
	2121	static void
	2122	handle_workitem_freeblocks(struct freeblks *freeblks)
	2123	{
	2124	struct inode tip;
	2125	ufs_daddr_t bn;
	2126	struct fs *fs;
	2127	int i, level, bsize;
	2128	long nblocks, blocksreleased = 0;
	2129	int error, allerror = 0;
	2130	ufs_lbn_t baselbns[NIADDR], tmpval;
	2131
	2132	tip.i_number = freeblks->fb_previousinum;
	2133	tip.i_devvp = freeblks->fb_devvp;
	2134	tip.i_dev = freeblks->fb_devvp->v_rdev;
	2135	tip.i_fs = freeblks->fb_fs;
	2136	tip.i_size = freeblks->fb_oldsize;
	2137	tip.i_uid = freeblks->fb_uid;
	2138	fs = freeblks->fb_fs;
	2139	tmpval = 1;
	2140	baselbns[0] = NDADDR;
	2141	for (i = 1; i < NIADDR; i++) {
	2142	tmpval *= NINDIR(fs);
	2143	baselbns[i] = baselbns[i - 1] + tmpval;
	2144	}
	2145	nblocks = btodb(fs->fs_bsize);
	2146	blocksreleased = 0;
	2147	/*
	2148	* Indirect blocks first.
	2149	*/
	2150	for (level = (NIADDR - 1); level >= 0; level--) {
	2151	if ((bn = freeblks->fb_iblks[level]) == 0)
	2152	continue;
	2153	if ((error = indir_trunc(&tip, fsbtodoff(fs, bn), level,
	2154	baselbns[level], &blocksreleased)) == 0)
	2155	allerror = error;
	2156	ffs_blkfree(&tip, bn, fs->fs_bsize);
	2157	blocksreleased += nblocks;
	2158	}
	2159	/*
	2160	* All direct blocks or frags.
	2161	*/
	2162	for (i = (NDADDR - 1); i >= 0; i--) {
	2163	if ((bn = freeblks->fb_dblks[i]) == 0)
	2164	continue;
	2165	bsize = blksize(fs, &tip, i);
	2166	ffs_blkfree(&tip, bn, bsize);
	2167	blocksreleased += btodb(bsize);
	2168	}
	2169
	2170	#ifdef DIAGNOSTIC
	2171	if (freeblks->fb_chkcnt != blocksreleased)
	2172	kprintf("handle_workitem_freeblocks: block count\n");
	2173	if (allerror)
	2174	softdep_error("handle_workitem_freeblks", allerror);
	2175	#endif /* DIAGNOSTIC */
	2176	WORKITEM_FREE(freeblks, D_FREEBLKS);
	2177	}
	2178
	2179	/*
	2180	* Release blocks associated with the inode ip and stored in the indirect
	2181	* block at doffset. If level is greater than SINGLE, the block is an
	2182	* indirect block and recursive calls to indirtrunc must be used to
	2183	* cleanse other indirect blocks.
	2184	*/
	2185	static int
	2186	indir_trunc(struct inode *ip, off_t doffset, int level, ufs_lbn_t lbn,
	2187	long *countp)
	2188	{
	2189	struct buf *bp;
	2190	ufs_daddr_t *bap;
	2191	ufs_daddr_t nb;
	2192	struct fs *fs;
	2193	struct worklist *wk;
	2194	struct indirdep *indirdep;
	2195	int i, lbnadd, nblocks;
	2196	int error, allerror = 0;
	2197
	2198	fs = ip->i_fs;
	2199	lbnadd = 1;
	2200	for (i = level; i > 0; i--)
	2201	lbnadd *= NINDIR(fs);
	2202	/*
	2203	* Get buffer of block pointers to be freed. This routine is not
	2204	* called until the zero'ed inode has been written, so it is safe
	2205	* to free blocks as they are encountered. Because the inode has
	2206	* been zero'ed, calls to bmap on these blocks will fail. So, we
	2207	* have to use the on-disk address and the block device for the
	2208	* filesystem to look them up. If the file was deleted before its
	2209	* indirect blocks were all written to disk, the routine that set
	2210	* us up (deallocate_dependencies) will have arranged to leave
	2211	* a complete copy of the indirect block in memory for our use.
	2212	* Otherwise we have to read the blocks in from the disk.
	2213	*/
	2214	ACQUIRE_LOCK(&lk);
	2215	if ((bp = findblk(ip->i_devvp, doffset, FINDBLK_TEST)) != NULL &&
	2216	(wk = LIST_FIRST(&bp->b_dep)) != NULL) {
	2217	/*
	2218	* bp must be ir_savebp, which is held locked for our use.
	2219	*/
	2220	if (wk->wk_type != D_INDIRDEP \|\|
	2221	(indirdep = WK_INDIRDEP(wk))->ir_savebp != bp \|\|
	2222	(indirdep->ir_state & GOINGAWAY) == 0) {
	2223	panic("indir_trunc: lost indirdep");
	2224	}
	2225	WORKLIST_REMOVE(wk);
	2226	WORKITEM_FREE(indirdep, D_INDIRDEP);
	2227	if (LIST_FIRST(&bp->b_dep) != NULL) {
	2228	panic("indir_trunc: dangling dep");
	2229	}
	2230	FREE_LOCK(&lk);
	2231	} else {
	2232	FREE_LOCK(&lk);
	2233	error = bread(ip->i_devvp, doffset, (int)fs->fs_bsize, &bp);
	2234	if (error)
	2235	return (error);
	2236	}
	2237	/*
	2238	* Recursively free indirect blocks.
	2239	*/
	2240	bap = (ufs_daddr_t *)bp->b_data;
	2241	nblocks = btodb(fs->fs_bsize);
	2242	for (i = NINDIR(fs) - 1; i >= 0; i--) {
	2243	if ((nb = bap[i]) == 0)
	2244	continue;
	2245	if (level != 0) {
	2246	if ((error = indir_trunc(ip, fsbtodoff(fs, nb),
	2247	level - 1, lbn + (i * lbnadd), countp)) != 0)
	2248	allerror = error;
	2249	}
	2250	ffs_blkfree(ip, nb, fs->fs_bsize);
	2251	*countp += nblocks;
	2252	}
	2253	bp->b_flags \|= B_INVAL \| B_NOCACHE;
	2254	brelse(bp);
	2255	return (allerror);
	2256	}
	2257
	2258	/*
	2259	* Free an allocindir.
	2260	* This routine must be called with splbio interrupts blocked.
	2261	*/
	2262	static void
	2263	free_allocindir(struct allocindir aip, struct inodedep inodedep)
	2264	{
	2265	struct freefrag *freefrag;
	2266
	2267	KKASSERT(lock_held(&lk) > 0);
	2268
	2269	if ((aip->ai_state & DEPCOMPLETE) == 0)
	2270	LIST_REMOVE(aip, ai_deps);
	2271	if (aip->ai_state & ONWORKLIST)
	2272	WORKLIST_REMOVE(&aip->ai_list);
	2273	LIST_REMOVE(aip, ai_next);
	2274	if ((freefrag = aip->ai_freefrag) != NULL) {
	2275	if (inodedep == NULL)
	2276	add_to_worklist(&freefrag->ff_list);
	2277	else
	2278	WORKLIST_INSERT(&inodedep->id_bufwait,
	2279	&freefrag->ff_list);
	2280	}
	2281	WORKITEM_FREE(aip, D_ALLOCINDIR);
	2282	}
	2283
	2284	/*
	2285	* Directory entry addition dependencies.
	2286	*
	2287	* When adding a new directory entry, the inode (with its incremented link
	2288	* count) must be written to disk before the directory entry's pointer to it.
	2289	* Also, if the inode is newly allocated, the corresponding freemap must be
	2290	* updated (on disk) before the directory entry's pointer. These requirements
	2291	* are met via undo/redo on the directory entry's pointer, which consists
	2292	* simply of the inode number.
	2293	*
	2294	* As directory entries are added and deleted, the free space within a
	2295	* directory block can become fragmented. The ufs filesystem will compact
	2296	* a fragmented directory block to make space for a new entry. When this
	2297	* occurs, the offsets of previously added entries change. Any "diradd"
	2298	* dependency structures corresponding to these entries must be updated with
	2299	* the new offsets.
	2300	*/
	2301
	2302	/*
	2303	* This routine is called after the in-memory inode's link
	2304	* count has been incremented, but before the directory entry's
	2305	* pointer to the inode has been set.
	2306	*
	2307	* Parameters:
	2308	* bp: buffer containing directory block
	2309	* dp: inode for directory
	2310	* diroffset: offset of new entry in directory
	2311	* newinum: inode referenced by new directory entry
	2312	* newdirbp: non-NULL => contents of new mkdir
	2313	*/
	2314	void
	2315	softdep_setup_directory_add(struct buf bp, struct inode dp, off_t diroffset,
	2316	ino_t newinum, struct buf *newdirbp)
	2317	{
	2318	int offset; /* offset of new entry within directory block */
	2319	ufs_lbn_t lbn; /* block in directory containing new entry */
	2320	struct fs *fs;
	2321	struct diradd *dap;
	2322	struct pagedep *pagedep;
	2323	struct inodedep *inodedep;
	2324	struct mkdir mkdir1, mkdir2;
	2325
	2326	/*
	2327	* Whiteouts have no dependencies.
	2328	*/
	2329	if (newinum == WINO) {
	2330	if (newdirbp != NULL)
	2331	bdwrite(newdirbp);
	2332	return;
	2333	}
	2334
	2335	fs = dp->i_fs;
	2336	lbn = lblkno(fs, diroffset);
	2337	offset = blkoff(fs, diroffset);
	2338	dap = kmalloc(sizeof(struct diradd), M_DIRADD,
	2339	M_SOFTDEP_FLAGS \| M_ZERO);
	2340	dap->da_list.wk_type = D_DIRADD;
	2341	dap->da_offset = offset;
	2342	dap->da_newinum = newinum;
	2343	dap->da_state = ATTACHED;
	2344	if (newdirbp == NULL) {
	2345	dap->da_state \|= DEPCOMPLETE;
	2346	ACQUIRE_LOCK(&lk);
	2347	} else {
	2348	dap->da_state \|= MKDIR_BODY \| MKDIR_PARENT;
	2349	mkdir1 = kmalloc(sizeof(struct mkdir), M_MKDIR,
	2350	M_SOFTDEP_FLAGS);
	2351	mkdir1->md_list.wk_type = D_MKDIR;
	2352	mkdir1->md_state = MKDIR_BODY;
	2353	mkdir1->md_diradd = dap;
	2354	mkdir2 = kmalloc(sizeof(struct mkdir), M_MKDIR,
	2355	M_SOFTDEP_FLAGS);
	2356	mkdir2->md_list.wk_type = D_MKDIR;
	2357	mkdir2->md_state = MKDIR_PARENT;
	2358	mkdir2->md_diradd = dap;
	2359	/*
	2360	* Dependency on "." and ".." being written to disk.
	2361	*/
	2362	mkdir1->md_buf = newdirbp;
	2363	ACQUIRE_LOCK(&lk);
	2364	LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
	2365	WORKLIST_INSERT_BP(newdirbp, &mkdir1->md_list);
	2366	FREE_LOCK(&lk);
	2367	bdwrite(newdirbp);
	2368	/*
	2369	* Dependency on link count increase for parent directory
	2370	*/
	2371	ACQUIRE_LOCK(&lk);
	2372	if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
	2373	\|\| (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
	2374	dap->da_state &= ~MKDIR_PARENT;
	2375	WORKITEM_FREE(mkdir2, D_MKDIR);
	2376	} else {
	2377	LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
	2378	WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
	2379	}
	2380	}
	2381	/*
	2382	* Link into parent directory pagedep to await its being written.
	2383	*/
	2384	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
	2385	WORKLIST_INSERT_BP(bp, &pagedep->pd_list);
	2386	dap->da_pagedep = pagedep;
	2387	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
	2388	da_pdlist);
	2389	/*
	2390	* Link into its inodedep. Put it on the id_bufwait list if the inode
	2391	* is not yet written. If it is written, do the post-inode write
	2392	* processing to put it on the id_pendinghd list.
	2393	*/
	2394	(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
	2395	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
	2396	diradd_inode_written(dap, inodedep);
	2397	else
	2398	WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	2399	FREE_LOCK(&lk);
	2400	}
	2401
	2402	/*
	2403	* This procedure is called to change the offset of a directory
	2404	* entry when compacting a directory block which must be owned
	2405	* exclusively by the caller. Note that the actual entry movement
	2406	* must be done in this procedure to ensure that no I/O completions
	2407	* occur while the move is in progress.
	2408	*
	2409	* Parameters:
	2410	* dp: inode for directory
	2411	* base: address of dp->i_offset
	2412	* oldloc: address of old directory location
	2413	* newloc: address of new directory location
	2414	* entrysize: size of directory entry
	2415	*/
	2416	void
	2417	softdep_change_directoryentry_offset(struct inode *dp, caddr_t base,
	2418	caddr_t oldloc, caddr_t newloc,
	2419	int entrysize)
	2420	{
	2421	int offset, oldoffset, newoffset;
	2422	struct pagedep *pagedep;
	2423	struct diradd *dap;
	2424	ufs_lbn_t lbn;
	2425
	2426	ACQUIRE_LOCK(&lk);
	2427	lbn = lblkno(dp->i_fs, dp->i_offset);
	2428	offset = blkoff(dp->i_fs, dp->i_offset);
	2429	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
	2430	goto done;
	2431	oldoffset = offset + (oldloc - base);
	2432	newoffset = offset + (newloc - base);
	2433
	2434	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
	2435	if (dap->da_offset != oldoffset)
	2436	continue;
	2437	dap->da_offset = newoffset;
	2438	if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
	2439	break;
	2440	LIST_REMOVE(dap, da_pdlist);
	2441	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
	2442	dap, da_pdlist);
	2443	break;
	2444	}
	2445	if (dap == NULL) {
	2446
	2447	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
	2448	if (dap->da_offset == oldoffset) {
	2449	dap->da_offset = newoffset;
	2450	break;
	2451	}
	2452	}
	2453	}
	2454	done:
	2455	bcopy(oldloc, newloc, entrysize);
	2456	FREE_LOCK(&lk);
	2457	}
	2458
	2459	/*
	2460	* Free a diradd dependency structure. This routine must be called
	2461	* with splbio interrupts blocked.
	2462	*/
	2463	static void
	2464	free_diradd(struct diradd *dap)
	2465	{
	2466	struct dirrem *dirrem;
	2467	struct pagedep *pagedep;
	2468	struct inodedep *inodedep;
	2469	struct mkdir mkdir, nextmd;
	2470
	2471	KKASSERT(lock_held(&lk) > 0);
	2472
	2473	WORKLIST_REMOVE(&dap->da_list);
	2474	LIST_REMOVE(dap, da_pdlist);
	2475	if ((dap->da_state & DIRCHG) == 0) {
	2476	pagedep = dap->da_pagedep;
	2477	} else {
	2478	dirrem = dap->da_previous;
	2479	pagedep = dirrem->dm_pagedep;
	2480	dirrem->dm_dirinum = pagedep->pd_ino;
	2481	add_to_worklist(&dirrem->dm_list);
	2482	}
	2483	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
	2484	0, &inodedep) != 0)
	2485	(void) free_inodedep(inodedep);
	2486	if ((dap->da_state & (MKDIR_PARENT \| MKDIR_BODY)) != 0) {
	2487	for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
	2488	nextmd = LIST_NEXT(mkdir, md_mkdirs);
	2489	if (mkdir->md_diradd != dap)
	2490	continue;
	2491	dap->da_state &= ~mkdir->md_state;
	2492	WORKLIST_REMOVE(&mkdir->md_list);
	2493	LIST_REMOVE(mkdir, md_mkdirs);
	2494	WORKITEM_FREE(mkdir, D_MKDIR);
	2495	}
	2496	if ((dap->da_state & (MKDIR_PARENT \| MKDIR_BODY)) != 0) {
	2497	panic("free_diradd: unfound ref");
	2498	}
	2499	}
	2500	WORKITEM_FREE(dap, D_DIRADD);
	2501	}
	2502
	2503	/*
	2504	* Directory entry removal dependencies.
	2505	*
	2506	* When removing a directory entry, the entry's inode pointer must be
	2507	* zero'ed on disk before the corresponding inode's link count is decremented
	2508	* (possibly freeing the inode for re-use). This dependency is handled by
	2509	* updating the directory entry but delaying the inode count reduction until
	2510	* after the directory block has been written to disk. After this point, the
	2511	* inode count can be decremented whenever it is convenient.
	2512	*/
	2513
	2514	/*
	2515	* This routine should be called immediately after removing
	2516	* a directory entry. The inode's link count should not be
	2517	* decremented by the calling procedure -- the soft updates
	2518	* code will do this task when it is safe.
	2519	*
	2520	* Parameters:
	2521	* bp: buffer containing directory block
	2522	* dp: inode for the directory being modified
	2523	* ip: inode for directory entry being removed
	2524	* isrmdir: indicates if doing RMDIR
	2525	*/
	2526	void
	2527	softdep_setup_remove(struct buf bp, struct inode dp, struct inode *ip,
	2528	int isrmdir)
	2529	{
	2530	struct dirrem dirrem, prevdirrem;
	2531
	2532	/*
	2533	* Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
	2534	*/
	2535	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
	2536
	2537	/*
	2538	* If the COMPLETE flag is clear, then there were no active
	2539	* entries and we want to roll back to a zeroed entry until
	2540	* the new inode is committed to disk. If the COMPLETE flag is
	2541	* set then we have deleted an entry that never made it to
	2542	* disk. If the entry we deleted resulted from a name change,
	2543	* then the old name still resides on disk. We cannot delete
	2544	* its inode (returned to us in prevdirrem) until the zeroed
	2545	* directory entry gets to disk. The new inode has never been
	2546	* referenced on the disk, so can be deleted immediately.
	2547	*/
	2548	if ((dirrem->dm_state & COMPLETE) == 0) {
	2549	LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
	2550	dm_next);
	2551	FREE_LOCK(&lk);
	2552	} else {
	2553	if (prevdirrem != NULL)
	2554	LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
	2555	prevdirrem, dm_next);
	2556	dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
	2557	FREE_LOCK(&lk);
	2558	handle_workitem_remove(dirrem);
	2559	}
	2560	}
	2561
	2562	/*
	2563	* Allocate a new dirrem if appropriate and return it along with
	2564	* its associated pagedep. Called without a lock, returns with lock.
	2565	*/
	2566	static long num_dirrem; /* number of dirrem allocated */
	2567
	2568	/*
	2569	* Parameters:
	2570	* bp: buffer containing directory block
	2571	* dp: inode for the directory being modified
	2572	* ip: inode for directory entry being removed
	2573	* isrmdir: indicates if doing RMDIR
	2574	* prevdirremp: previously referenced inode, if any
	2575	*/
	2576	static struct dirrem *
	2577	newdirrem(struct buf bp, struct inode dp, struct inode *ip,
	2578	int isrmdir, struct dirrem **prevdirremp)
	2579	{
	2580	int offset;
	2581	ufs_lbn_t lbn;
	2582	struct diradd *dap;
	2583	struct dirrem *dirrem;
	2584	struct pagedep *pagedep;
	2585
	2586	/*
	2587	* Whiteouts have no deletion dependencies.
	2588	*/
	2589	if (ip == NULL)
	2590	panic("newdirrem: whiteout");
	2591	/*
	2592	* If we are over our limit, try to improve the situation.
	2593	* Limiting the number of dirrem structures will also limit
	2594	* the number of freefile and freeblks structures.
	2595	*/
	2596	if (num_dirrem > max_softdeps / 2 && speedup_syncer() == 0)
	2597	(void) request_cleanup(FLUSH_REMOVE, 0);
	2598	num_dirrem += 1;
	2599	dirrem = kmalloc(sizeof(struct dirrem), M_DIRREM,
	2600	M_SOFTDEP_FLAGS \| M_ZERO);
	2601	dirrem->dm_list.wk_type = D_DIRREM;
	2602	dirrem->dm_state = isrmdir ? RMDIR : 0;
	2603	dirrem->dm_mnt = ITOV(ip)->v_mount;
	2604	dirrem->dm_oldinum = ip->i_number;
	2605	*prevdirremp = NULL;
	2606
	2607	ACQUIRE_LOCK(&lk);
	2608	lbn = lblkno(dp->i_fs, dp->i_offset);
	2609	offset = blkoff(dp->i_fs, dp->i_offset);
	2610	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
	2611	WORKLIST_INSERT_BP(bp, &pagedep->pd_list);
	2612	dirrem->dm_pagedep = pagedep;
	2613	/*
	2614	* Check for a diradd dependency for the same directory entry.
	2615	* If present, then both dependencies become obsolete and can
	2616	* be de-allocated. Check for an entry on both the pd_dirraddhd
	2617	* list and the pd_pendinghd list.
	2618	*/
	2619
	2620	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
	2621	if (dap->da_offset == offset)
	2622	break;
	2623	if (dap == NULL) {
	2624
	2625	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
	2626	if (dap->da_offset == offset)
	2627	break;
	2628	if (dap == NULL)
	2629	return (dirrem);
	2630	}
	2631	/*
	2632	* Must be ATTACHED at this point.
	2633	*/
	2634	if ((dap->da_state & ATTACHED) == 0) {
	2635	panic("newdirrem: not ATTACHED");
	2636	}
	2637	if (dap->da_newinum != ip->i_number) {
	2638	panic("newdirrem: inum %"PRId64" should be %"PRId64,
	2639	ip->i_number, dap->da_newinum);
	2640	}
	2641	/*
	2642	* If we are deleting a changed name that never made it to disk,
	2643	* then return the dirrem describing the previous inode (which
	2644	* represents the inode currently referenced from this entry on disk).
	2645	*/
	2646	if ((dap->da_state & DIRCHG) != 0) {
	2647	*prevdirremp = dap->da_previous;
	2648	dap->da_state &= ~DIRCHG;
	2649	dap->da_pagedep = pagedep;
	2650	}
	2651	/*
	2652	* We are deleting an entry that never made it to disk.
	2653	* Mark it COMPLETE so we can delete its inode immediately.
	2654	*/
	2655	dirrem->dm_state \|= COMPLETE;
	2656	free_diradd(dap);
	2657	return (dirrem);
	2658	}
	2659
	2660	/*
	2661	* Directory entry change dependencies.
	2662	*
	2663	* Changing an existing directory entry requires that an add operation
	2664	* be completed first followed by a deletion. The semantics for the addition
	2665	* are identical to the description of adding a new entry above except
	2666	* that the rollback is to the old inode number rather than zero. Once
	2667	* the addition dependency is completed, the removal is done as described
	2668	* in the removal routine above.
	2669	*/
	2670
	2671	/*
	2672	* This routine should be called immediately after changing
	2673	* a directory entry. The inode's link count should not be
	2674	* decremented by the calling procedure -- the soft updates
	2675	* code will perform this task when it is safe.
	2676	*
	2677	* Parameters:
	2678	* bp: buffer containing directory block
	2679	* dp: inode for the directory being modified
	2680	* ip: inode for directory entry being removed
	2681	* newinum: new inode number for changed entry
	2682	* isrmdir: indicates if doing RMDIR
	2683	*/
	2684	void
	2685	softdep_setup_directory_change(struct buf bp, struct inode dp,
	2686	struct inode *ip, ino_t newinum,
	2687	int isrmdir)
	2688	{
	2689	int offset;
	2690	struct diradd *dap = NULL;
	2691	struct dirrem dirrem, prevdirrem;
	2692	struct pagedep *pagedep;
	2693	struct inodedep *inodedep;
	2694
	2695	offset = blkoff(dp->i_fs, dp->i_offset);
	2696
	2697	/*
	2698	* Whiteouts do not need diradd dependencies.
	2699	*/
	2700	if (newinum != WINO) {
	2701	dap = kmalloc(sizeof(struct diradd), M_DIRADD,
	2702	M_SOFTDEP_FLAGS \| M_ZERO);
	2703	dap->da_list.wk_type = D_DIRADD;
	2704	dap->da_state = DIRCHG \| ATTACHED \| DEPCOMPLETE;
	2705	dap->da_offset = offset;
	2706	dap->da_newinum = newinum;
	2707	}
	2708
	2709	/*
	2710	* Allocate a new dirrem and ACQUIRE_LOCK.
	2711	*/
	2712	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
	2713	pagedep = dirrem->dm_pagedep;
	2714	/*
	2715	* The possible values for isrmdir:
	2716	* 0 - non-directory file rename
	2717	* 1 - directory rename within same directory
	2718	* inum - directory rename to new directory of given inode number
	2719	* When renaming to a new directory, we are both deleting and
	2720	* creating a new directory entry, so the link count on the new
	2721	* directory should not change. Thus we do not need the followup
	2722	* dirrem which is usually done in handle_workitem_remove. We set
	2723	* the DIRCHG flag to tell handle_workitem_remove to skip the
	2724	* followup dirrem.
	2725	*/
	2726	if (isrmdir > 1)
	2727	dirrem->dm_state \|= DIRCHG;
	2728
	2729	/*
	2730	* Whiteouts have no additional dependencies,
	2731	* so just put the dirrem on the correct list.
	2732	*/
	2733	if (newinum == WINO) {
	2734	if ((dirrem->dm_state & COMPLETE) == 0) {
	2735	LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
	2736	dm_next);
	2737	} else {
	2738	dirrem->dm_dirinum = pagedep->pd_ino;
	2739	add_to_worklist(&dirrem->dm_list);
	2740	}
	2741	FREE_LOCK(&lk);
	2742	return;
	2743	}
	2744
	2745	/*
	2746	* If the COMPLETE flag is clear, then there were no active
	2747	* entries and we want to roll back to the previous inode until
	2748	* the new inode is committed to disk. If the COMPLETE flag is
	2749	* set, then we have deleted an entry that never made it to disk.
	2750	* If the entry we deleted resulted from a name change, then the old
	2751	* inode reference still resides on disk. Any rollback that we do
	2752	* needs to be to that old inode (returned to us in prevdirrem). If
	2753	* the entry we deleted resulted from a create, then there is
	2754	* no entry on the disk, so we want to roll back to zero rather
	2755	* than the uncommitted inode. In either of the COMPLETE cases we
	2756	* want to immediately free the unwritten and unreferenced inode.
	2757	*/
	2758	if ((dirrem->dm_state & COMPLETE) == 0) {
	2759	dap->da_previous = dirrem;
	2760	} else {
	2761	if (prevdirrem != NULL) {
	2762	dap->da_previous = prevdirrem;
	2763	} else {
	2764	dap->da_state &= ~DIRCHG;
	2765	dap->da_pagedep = pagedep;
	2766	}
	2767	dirrem->dm_dirinum = pagedep->pd_ino;
	2768	add_to_worklist(&dirrem->dm_list);
	2769	}
	2770	/*
	2771	* Link into its inodedep. Put it on the id_bufwait list if the inode
	2772	* is not yet written. If it is written, do the post-inode write
	2773	* processing to put it on the id_pendinghd list.
	2774	*/
	2775	if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 \|\|
	2776	(inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
	2777	dap->da_state \|= COMPLETE;
	2778	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	2779	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
	2780	} else {
	2781	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
	2782	dap, da_pdlist);
	2783	WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	2784	}
	2785	FREE_LOCK(&lk);
	2786	}
	2787
	2788	/*
	2789	* Called whenever the link count on an inode is changed.
	2790	* It creates an inode dependency so that the new reference(s)
	2791	* to the inode cannot be committed to disk until the updated
	2792	* inode has been written.
	2793	*
	2794	* Parameters:
	2795	* ip: the inode with the increased link count
	2796	*/
	2797	void
	2798	softdep_change_linkcnt(struct inode *ip)
	2799	{
	2800	struct inodedep *inodedep;
	2801
	2802	ACQUIRE_LOCK(&lk);
	2803	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
	2804	if (ip->i_nlink < ip->i_effnlink) {
	2805	panic("softdep_change_linkcnt: bad delta");
	2806	}
	2807	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	2808	FREE_LOCK(&lk);
	2809	}
	2810
	2811	/*
	2812	* This workitem decrements the inode's link count.
	2813	* If the link count reaches zero, the file is removed.
	2814	*/
	2815	static void
	2816	handle_workitem_remove(struct dirrem *dirrem)
	2817	{
	2818	struct inodedep *inodedep;
	2819	struct vnode *vp;
	2820	struct inode *ip;
	2821	ino_t oldinum;
	2822	int error;
	2823
	2824	error = VFS_VGET(dirrem->dm_mnt, NULL, dirrem->dm_oldinum, &vp);
	2825	if (error) {
	2826	softdep_error("handle_workitem_remove: vget", error);
	2827	return;
	2828	}
	2829	ip = VTOI(vp);
	2830	ACQUIRE_LOCK(&lk);
	2831	if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0){
	2832	panic("handle_workitem_remove: lost inodedep");
	2833	}
	2834	/*
	2835	* Normal file deletion.
	2836	*/
	2837	if ((dirrem->dm_state & RMDIR) == 0) {
	2838	ip->i_nlink--;
	2839	ip->i_flag \|= IN_CHANGE;
	2840	if (ip->i_nlink < ip->i_effnlink) {
	2841	panic("handle_workitem_remove: bad file delta");
	2842	}
	2843	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	2844	FREE_LOCK(&lk);
	2845	vput(vp);
	2846	num_dirrem -= 1;
	2847	WORKITEM_FREE(dirrem, D_DIRREM);
	2848	return;
	2849	}
	2850	/*
	2851	* Directory deletion. Decrement reference count for both the
	2852	* just deleted parent directory entry and the reference for ".".
	2853	* Next truncate the directory to length zero. When the
	2854	* truncation completes, arrange to have the reference count on
	2855	* the parent decremented to account for the loss of "..".
	2856	*/
	2857	ip->i_nlink -= 2;
	2858	ip->i_flag \|= IN_CHANGE;
	2859	if (ip->i_nlink < ip->i_effnlink) {
	2860	panic("handle_workitem_remove: bad dir delta");
	2861	}
	2862	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	2863	FREE_LOCK(&lk);
	2864	if ((error = ffs_truncate(vp, (off_t)0, 0, proc0.p_ucred)) != 0)
	2865	softdep_error("handle_workitem_remove: truncate", error);
	2866	/*
	2867	* Rename a directory to a new parent. Since, we are both deleting
	2868	* and creating a new directory entry, the link count on the new
	2869	* directory should not change. Thus we skip the followup dirrem.
	2870	*/
	2871	if (dirrem->dm_state & DIRCHG) {
	2872	vput(vp);
	2873	num_dirrem -= 1;
	2874	WORKITEM_FREE(dirrem, D_DIRREM);
	2875	return;
	2876	}
	2877	/*
	2878	* If the inodedep does not exist, then the zero'ed inode has
	2879	* been written to disk. If the allocated inode has never been
	2880	* written to disk, then the on-disk inode is zero'ed. In either
	2881	* case we can remove the file immediately.
	2882	*/
	2883	ACQUIRE_LOCK(&lk);
	2884	dirrem->dm_state = 0;
	2885	oldinum = dirrem->dm_oldinum;
	2886	dirrem->dm_oldinum = dirrem->dm_dirinum;
	2887	if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 \|\|
	2888	check_inode_unwritten(inodedep)) {
	2889	FREE_LOCK(&lk);
	2890	vput(vp);
	2891	handle_workitem_remove(dirrem);
	2892	return;
	2893	}
	2894	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
	2895	FREE_LOCK(&lk);
	2896	ip->i_flag \|= IN_CHANGE;
	2897	ffs_update(vp, 0);
	2898	vput(vp);
	2899	}
	2900
	2901	/*
	2902	* Inode de-allocation dependencies.
	2903	*
	2904	* When an inode's link count is reduced to zero, it can be de-allocated. We
	2905	* found it convenient to postpone de-allocation until after the inode is
	2906	* written to disk with its new link count (zero). At this point, all of the
	2907	* on-disk inode's block pointers are nullified and, with careful dependency
	2908	* list ordering, all dependencies related to the inode will be satisfied and
	2909	* the corresponding dependency structures de-allocated. So, if/when the
	2910	* inode is reused, there will be no mixing of old dependencies with new
	2911	* ones. This artificial dependency is set up by the block de-allocation
	2912	* procedure above (softdep_setup_freeblocks) and completed by the
	2913	* following procedure.
	2914	*/
	2915	static void
	2916	handle_workitem_freefile(struct freefile *freefile)
	2917	{
	2918	struct vnode vp;
	2919	struct inode tip;
	2920	struct inodedep *idp;
	2921	int error;
	2922
	2923	#ifdef DEBUG
	2924	ACQUIRE_LOCK(&lk);
	2925	error = inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp);
	2926	FREE_LOCK(&lk);
	2927	if (error)
	2928	panic("handle_workitem_freefile: inodedep survived");
	2929	#endif
	2930	tip.i_devvp = freefile->fx_devvp;
	2931	tip.i_dev = freefile->fx_devvp->v_rdev;
	2932	tip.i_fs = freefile->fx_fs;
	2933	vp.v_data = &tip;
	2934	if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
	2935	softdep_error("handle_workitem_freefile", error);
	2936	WORKITEM_FREE(freefile, D_FREEFILE);
	2937	}
	2938
	2939	/*
	2940	* Helper function which unlinks marker element from work list and returns
	2941	* the next element on the list.
	2942	*/
	2943	static __inline struct worklist *
	2944	markernext(struct worklist *marker)
	2945	{
	2946	struct worklist *next;
	2947
	2948	next = LIST_NEXT(marker, wk_list);
	2949	LIST_REMOVE(marker, wk_list);
	2950	return next;
	2951	}
	2952
	/*
	 * checkread, checkwrite
	 *
	 * bioops callback - hold io_token
	 *
	 * Read check: there is nothing to verify, so this always returns 0.
	 */
	static int
	softdep_checkread(struct buf *bp)
	{
		/* nothing to do, mp lock not needed */
		(void)bp;
		return 0;
	}
	2964
	/*
	 * bioops callback - hold io_token
	 *
	 * Write check: there is nothing to verify, so this always returns 0.
	 */
	static int
	softdep_checkwrite(struct buf *bp)
	{
		/* nothing to do, mp lock not needed */
		(void)bp;
		return 0;
	}
	2974
	2975	/*
	2976	* Disk writes.
	2977	*
	2978	* The dependency structures constructed above are most actively used when file
	2979	* system blocks are written to disk. No constraints are placed on when a
	2980	* block can be written, but unsatisfied update dependencies are made safe by
	2981	* modifying (or replacing) the source memory for the duration of the disk
	2982	* write. When the disk write completes, the memory block is again brought
	2983	* up-to-date.
	2984	*
	2985	* In-core inode structure reclamation.
	2986	*
	2987	* Because there are a finite number of "in-core" inode structures, they are
	2988	* reused regularly. By transferring all inode-related dependencies to the
	2989	* in-memory inode block and indexing them separately (via "inodedep"s), we
	2990	* can allow "in-core" inode structures to be reused at any time and avoid
	2991	* any increase in contention.
	2992	*
	2993	* Called just before entering the device driver to initiate a new disk I/O.
	2994	* The buffer must be locked, thus, no I/O completion operations can occur
	2995	* while we are manipulating its associated dependencies.
	2996	*
	2997	* bioops callback - hold io_token
	2998	*
	2999	* Parameters:
	3000	* bp: structure describing disk write to occur
	3001	*/
	3002	static void
	3003	softdep_disk_io_initiation(struct buf *bp)
	3004	{
	3005	struct worklist *wk;
	3006	struct worklist marker;
	3007	struct indirdep *indirdep;
	3008
	3009	/*
	3010	* We only care about write operations. There should never
	3011	* be dependencies for reads.
	3012	*/
	3013	if (bp->b_cmd == BUF_CMD_READ)
	3014	panic("softdep_disk_io_initiation: read");
	3015
	3016	ACQUIRE_LOCK(&lk);
	3017	marker.wk_type = D_LAST + 1; /* Not a normal workitem */
	3018
	3019	/*
	3020	* Do any necessary pre-I/O processing.
	3021	*/
	3022	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = markernext(&marker)) {
	3023	LIST_INSERT_AFTER(wk, &marker, wk_list);
	3024
	3025	switch (wk->wk_type) {
	3026	case D_PAGEDEP:
	3027	initiate_write_filepage(WK_PAGEDEP(wk), bp);
	3028	continue;
	3029
	3030	case D_INODEDEP:
	3031	initiate_write_inodeblock(WK_INODEDEP(wk), bp);
	3032	continue;
	3033
	3034	case D_INDIRDEP:
	3035	indirdep = WK_INDIRDEP(wk);
	3036	if (indirdep->ir_state & GOINGAWAY)
	3037	panic("disk_io_initiation: indirdep gone");
	3038	/*
	3039	* If there are no remaining dependencies, this
	3040	* will be writing the real pointers, so the
	3041	* dependency can be freed.
	3042	*/
	3043	if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
	3044	indirdep->ir_savebp->b_flags \|= B_INVAL \| B_NOCACHE;
	3045	brelse(indirdep->ir_savebp);
	3046	/* inline expand WORKLIST_REMOVE(wk); */
	3047	wk->wk_state &= ~ONWORKLIST;
	3048	LIST_REMOVE(wk, wk_list);
	3049	WORKITEM_FREE(indirdep, D_INDIRDEP);
	3050	continue;
	3051	}
	3052	/*
	3053	* Replace up-to-date version with safe version.
	3054	*/
	3055	indirdep->ir_saveddata = kmalloc(bp->b_bcount,
	3056	M_INDIRDEP,
	3057	M_SOFTDEP_FLAGS);
	3058	ACQUIRE_LOCK(&lk);
	3059	indirdep->ir_state &= ~ATTACHED;
	3060	indirdep->ir_state \|= UNDONE;
	3061	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
	3062	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
	3063	bp->b_bcount);
	3064	FREE_LOCK(&lk);
	3065	continue;
	3066
	3067	case D_MKDIR:
	3068	case D_BMSAFEMAP:
	3069	case D_ALLOCDIRECT:
	3070	case D_ALLOCINDIR:
	3071	continue;
	3072
	3073	default:
	3074	panic("handle_disk_io_initiation: Unexpected type %s",
	3075	TYPENAME(wk->wk_type));
	3076	/* NOTREACHED */
	3077	}
	3078	}
	3079	FREE_LOCK(&lk);
	3080	}
	3081
	3082	/*
	3083	* Called from within the procedure above to deal with unsatisfied
	3084	* allocation dependencies in a directory. The buffer must be locked,
	3085	* thus, no I/O completion operations can occur while we are
	3086	* manipulating its associated dependencies.
	3087	*/
	3088	static void
	3089	initiate_write_filepage(struct pagedep pagedep, struct buf bp)
	3090	{
	3091	struct diradd *dap;
	3092	struct direct *ep;
	3093	int i;
	3094
	3095	if (pagedep->pd_state & IOSTARTED) {
	3096	/*
	3097	* This can only happen if there is a driver that does not
	3098	* understand chaining. Here biodone will reissue the call
	3099	* to strategy for the incomplete buffers.
	3100	*/
	3101	kprintf("initiate_write_filepage: already started\n");
	3102	return;
	3103	}
	3104	pagedep->pd_state \|= IOSTARTED;
	3105	ACQUIRE_LOCK(&lk);
	3106	for (i = 0; i < DAHASHSZ; i++) {
	3107	LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
	3108	ep = (struct direct *)
	3109	((char *)bp->b_data + dap->da_offset);
	3110	if (ep->d_ino != dap->da_newinum) {
	3111	panic("%s: dir inum %d != new %"PRId64,
	3112	"initiate_write_filepage",
	3113	ep->d_ino, dap->da_newinum);
	3114	}
	3115	if (dap->da_state & DIRCHG)
	3116	ep->d_ino = dap->da_previous->dm_oldinum;
	3117	else
	3118	ep->d_ino = 0;
	3119	dap->da_state &= ~ATTACHED;
	3120	dap->da_state \|= UNDONE;
	3121	}
	3122	}
	3123	FREE_LOCK(&lk);
	3124	}
	3125
	3126	/*
	3127	* Called from within the procedure above to deal with unsatisfied
	3128	* allocation dependencies in an inodeblock. The buffer must be
	3129	* locked, thus, no I/O completion operations can occur while we
	3130	* are manipulating its associated dependencies.
	3131	*
	3132	* Parameters:
	3133	* bp: The inode block
	3134	*/
	3135	static void
	3136	initiate_write_inodeblock(struct inodedep inodedep, struct buf bp)
	3137	{
	3138	struct allocdirect adp, lastadp;
	3139	struct ufs1_dinode *dp;
	3140	struct ufs1_dinode *sip;
	3141	struct fs *fs;
	3142	ufs_lbn_t prevlbn = 0;
	3143	int i, deplist;
	3144
	3145	if (inodedep->id_state & IOSTARTED)
	3146	panic("initiate_write_inodeblock: already started");
	3147	inodedep->id_state \|= IOSTARTED;
	3148	fs = inodedep->id_fs;
	3149	dp = (struct ufs1_dinode *)bp->b_data +
	3150	ino_to_fsbo(fs, inodedep->id_ino);
	3151	/*
	3152	* If the bitmap is not yet written, then the allocated
	3153	* inode cannot be written to disk.
	3154	*/
	3155	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
	3156	if (inodedep->id_savedino != NULL)
	3157	panic("initiate_write_inodeblock: already doing I/O");
	3158	sip = kmalloc(sizeof(struct ufs1_dinode), M_INODEDEP,
	3159	M_SOFTDEP_FLAGS);
	3160	inodedep->id_savedino = sip;
	3161	inodedep->id_savedino = dp;
	3162	bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
	3163	dp->di_gen = inodedep->id_savedino->di_gen;
	3164	return;
	3165	}
	3166	/*
	3167	* If no dependencies, then there is nothing to roll back.
	3168	*/
	3169	inodedep->id_savedsize = dp->di_size;
	3170	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
	3171	return;
	3172	/*
	3173	* Set the dependencies to busy.
	3174	*/
	3175	ACQUIRE_LOCK(&lk);
	3176	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	3177	adp = TAILQ_NEXT(adp, ad_next)) {
	3178	#ifdef DIAGNOSTIC
	3179	if (deplist != 0 && prevlbn >= adp->ad_lbn) {
	3180	panic("softdep_write_inodeblock: lbn order");
	3181	}
	3182	prevlbn = adp->ad_lbn;
	3183	if (adp->ad_lbn < NDADDR &&
	3184	dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
	3185	panic("%s: direct pointer #%ld mismatch %d != %d",
	3186	"softdep_write_inodeblock", adp->ad_lbn,
	3187	dp->di_db[adp->ad_lbn], adp->ad_newblkno);
	3188	}
	3189	if (adp->ad_lbn >= NDADDR &&
	3190	dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
	3191	panic("%s: indirect pointer #%ld mismatch %d != %d",
	3192	"softdep_write_inodeblock", adp->ad_lbn - NDADDR,
	3193	dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
	3194	}
	3195	deplist \|= 1 << adp->ad_lbn;
	3196	if ((adp->ad_state & ATTACHED) == 0) {
	3197	panic("softdep_write_inodeblock: Unknown state 0x%x",
	3198	adp->ad_state);
	3199	}
	3200	#endif /* DIAGNOSTIC */
	3201	adp->ad_state &= ~ATTACHED;
	3202	adp->ad_state \|= UNDONE;
	3203	}
	3204	/*
	3205	* The on-disk inode cannot claim to be any larger than the last
	3206	* fragment that has been written. Otherwise, the on-disk inode
	3207	* might have fragments that were not the last block in the file
	3208	* which would corrupt the filesystem.
	3209	*/
	3210	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	3211	lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
	3212	if (adp->ad_lbn >= NDADDR)
	3213	break;
	3214	dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
	3215	/* keep going until hitting a rollback to a frag */
	3216	if (adp->ad_oldsize == 0 \|\| adp->ad_oldsize == fs->fs_bsize)
	3217	continue;
	3218	dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
	3219	for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
	3220	#ifdef DIAGNOSTIC
	3221	if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
	3222	panic("softdep_write_inodeblock: lost dep1");
	3223	}
	3224	#endif /* DIAGNOSTIC */
	3225	dp->di_db[i] = 0;
	3226	}
	3227	for (i = 0; i < NIADDR; i++) {
	3228	#ifdef DIAGNOSTIC
	3229	if (dp->di_ib[i] != 0 &&
	3230	(deplist & ((1 << NDADDR) << i)) == 0) {
	3231	panic("softdep_write_inodeblock: lost dep2");
	3232	}
	3233	#endif /* DIAGNOSTIC */
	3234	dp->di_ib[i] = 0;
	3235	}
	3236	FREE_LOCK(&lk);
	3237	return;
	3238	}
	3239	/*
	3240	* If we have zero'ed out the last allocated block of the file,
	3241	* roll back the size to the last currently allocated block.
	3242	* We know that this last allocated block is a full-sized as
	3243	* we already checked for fragments in the loop above.
	3244	*/
	3245	if (lastadp != NULL &&
	3246	dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
	3247	for (i = lastadp->ad_lbn; i >= 0; i--)
	3248	if (dp->di_db[i] != 0)
	3249	break;
	3250	dp->di_size = (i + 1) * fs->fs_bsize;
	3251	}
	3252	/*
	3253	* The only dependencies are for indirect blocks.
	3254	*
	3255	* The file size for indirect block additions is not guaranteed.
	3256	* Such a guarantee would be non-trivial to achieve. The conventional
	3257	* synchronous write implementation also does not make this guarantee.
	3258	* Fsck should catch and fix discrepancies. Arguably, the file size
	3259	* can be over-estimated without destroying integrity when the file
	3260	* moves into the indirect blocks (i.e., is large). If we want to
	3261	* postpone fsck, we are stuck with this argument.
	3262	*/
	3263	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
	3264	dp->di_ib[adp->ad_lbn - NDADDR] = 0;
	3265	FREE_LOCK(&lk);
	3266	}
	3267
	3268	/*
	3269	* This routine is called during the completion interrupt
	3270	* service routine for a disk write (from the procedure called
	3271	* by the device driver to inform the filesystem caches of
	3272	* a request completion). It should be called early in this
	3273	* procedure, before the block is made available to other
	3274	* processes or other routines are called.
	3275	*
	3276	* bioops callback - hold io_token
	3277	*
	3278	* Parameters:
	3279	* bp: describes the completed disk write
	3280	*/
	3281	static void
	3282	softdep_disk_write_complete(struct buf *bp)
	3283	{
	3284	struct worklist *wk;
	3285	struct workhead reattach;
	3286	struct newblk *newblk;
	3287	struct allocindir *aip;
	3288	struct allocdirect *adp;
	3289	struct indirdep *indirdep;
	3290	struct inodedep *inodedep;
	3291	struct bmsafemap *bmsafemap;
	3292
	3293	ACQUIRE_LOCK(&lk);
	3294
	3295	LIST_INIT(&reattach);
	3296	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
	3297	WORKLIST_REMOVE(wk);
	3298	switch (wk->wk_type) {
	3299
	3300	case D_PAGEDEP:
	3301	if (handle_written_filepage(WK_PAGEDEP(wk), bp))
	3302	WORKLIST_INSERT(&reattach, wk);
	3303	continue;
	3304
	3305	case D_INODEDEP:
	3306	if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
	3307	WORKLIST_INSERT(&reattach, wk);
	3308	continue;
	3309
	3310	case D_BMSAFEMAP:
	3311	bmsafemap = WK_BMSAFEMAP(wk);
	3312	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
	3313	newblk->nb_state \|= DEPCOMPLETE;
	3314	newblk->nb_bmsafemap = NULL;
	3315	LIST_REMOVE(newblk, nb_deps);
	3316	}
	3317	while ((adp =
	3318	LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
	3319	adp->ad_state \|= DEPCOMPLETE;
	3320	adp->ad_buf = NULL;
	3321	LIST_REMOVE(adp, ad_deps);
	3322	handle_allocdirect_partdone(adp);
	3323	}
	3324	while ((aip =
	3325	LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
	3326	aip->ai_state \|= DEPCOMPLETE;
	3327	aip->ai_buf = NULL;
	3328	LIST_REMOVE(aip, ai_deps);
	3329	handle_allocindir_partdone(aip);
	3330	}
	3331	while ((inodedep =
	3332	LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
	3333	inodedep->id_state \|= DEPCOMPLETE;
	3334	LIST_REMOVE(inodedep, id_deps);
	3335	inodedep->id_buf = NULL;
	3336	}
	3337	WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
	3338	continue;
	3339
	3340	case D_MKDIR:
	3341	handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
	3342	continue;
	3343
	3344	case D_ALLOCDIRECT:
	3345	adp = WK_ALLOCDIRECT(wk);
	3346	adp->ad_state \|= COMPLETE;
	3347	handle_allocdirect_partdone(adp);
	3348	continue;
	3349
	3350	case D_ALLOCINDIR:
	3351	aip = WK_ALLOCINDIR(wk);
	3352	aip->ai_state \|= COMPLETE;
	3353	handle_allocindir_partdone(aip);
	3354	continue;
	3355
	3356	case D_INDIRDEP:
	3357	indirdep = WK_INDIRDEP(wk);
	3358	if (indirdep->ir_state & GOINGAWAY) {
	3359	panic("disk_write_complete: indirdep gone");
	3360	}
	3361	bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
	3362	kfree(indirdep->ir_saveddata, M_INDIRDEP);
	3363	indirdep->ir_saveddata = 0;
	3364	indirdep->ir_state &= ~UNDONE;
	3365	indirdep->ir_state \|= ATTACHED;
	3366	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) {
	3367	handle_allocindir_partdone(aip);
	3368	if (aip == LIST_FIRST(&indirdep->ir_donehd)) {
	3369	panic("disk_write_complete: not gone");
	3370	}
	3371	}
	3372	WORKLIST_INSERT(&reattach, wk);
	3373	if ((bp->b_flags & B_DELWRI) == 0)
	3374	stat_indir_blk_ptrs++;
	3375	bdirty(bp);
	3376	continue;
	3377
	3378	default:
	3379	panic("handle_disk_write_complete: Unknown type %s",
	3380	TYPENAME(wk->wk_type));
	3381	/* NOTREACHED */
	3382	}
	3383	}
	3384	/*
	3385	* Reattach any requests that must be redone.
	3386	*/
	3387	while ((wk = LIST_FIRST(&reattach)) != NULL) {
	3388	WORKLIST_REMOVE(wk);
	3389	WORKLIST_INSERT_BP(bp, wk);
	3390	}
	3391
	3392	FREE_LOCK(&lk);
	3393	}
	3394
	3395	/*
	3396	* Called from within softdep_disk_write_complete above. Note that
	3397	* this routine is always called from interrupt level with further
	3398	* splbio interrupts blocked.
	3399	*
	3400	* Parameters:
	3401	* adp: the completed allocdirect
	3402	*/
	3403	static void
	3404	handle_allocdirect_partdone(struct allocdirect *adp)
	3405	{
	3406	struct allocdirect *listadp;
	3407	struct inodedep *inodedep;
	3408	long bsize;
	3409
	3410	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
	3411	return;
	3412	if (adp->ad_buf != NULL)
	3413	panic("handle_allocdirect_partdone: dangling dep");
	3414
	3415	/*
	3416	* The on-disk inode cannot claim to be any larger than the last
	3417	* fragment that has been written. Otherwise, the on-disk inode
	3418	* might have fragments that were not the last block in the file
	3419	* which would corrupt the filesystem. Thus, we cannot free any
	3420	* allocdirects after one whose ad_oldblkno claims a fragment as
	3421	* these blocks must be rolled back to zero before writing the inode.
	3422	* We check the currently active set of allocdirects in id_inoupdt.
	3423	*/
	3424	inodedep = adp->ad_inodedep;
	3425	bsize = inodedep->id_fs->fs_bsize;
	3426	TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) {
	3427	/* found our block */
	3428	if (listadp == adp)
	3429	break;
	3430	/* continue if ad_oldlbn is not a fragment */
	3431	if (listadp->ad_oldsize == 0 \|\|
	3432	listadp->ad_oldsize == bsize)
	3433	continue;
	3434	/* hit a fragment */
	3435	return;
	3436	}
	3437	/*
	3438	* If we have reached the end of the current list without
	3439	* finding the just finished dependency, then it must be
	3440	* on the future dependency list. Future dependencies cannot
	3441	* be freed until they are moved to the current list.
	3442	*/
	3443	if (listadp == NULL) {
	3444	#ifdef DEBUG
	3445	TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next)
	3446	/* found our block */
	3447	if (listadp == adp)
	3448	break;
	3449	if (listadp == NULL)
	3450	panic("handle_allocdirect_partdone: lost dep");
	3451	#endif /* DEBUG */
	3452	return;
	3453	}
	3454	/*
	3455	* If we have found the just finished dependency, then free
	3456	* it along with anything that follows it that is complete.
	3457	*/
	3458	for (; adp; adp = listadp) {
	3459	listadp = TAILQ_NEXT(adp, ad_next);
	3460	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
	3461	return;
	3462	free_allocdirect(&inodedep->id_inoupdt, adp, 1);
	3463	}
	3464	}
	3465
	3466	/*
	3467	* Called from within softdep_disk_write_complete above. Note that
	3468	* this routine is always called from interrupt level with further
	3469	* splbio interrupts blocked.
	3470	*
	3471	* Parameters:
	3472	* aip: the completed allocindir
	3473	*/
	3474	static void
	3475	handle_allocindir_partdone(struct allocindir *aip)
	3476	{
	3477	struct indirdep *indirdep;
	3478
	3479	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
	3480	return;
	3481	if (aip->ai_buf != NULL)
	3482	panic("handle_allocindir_partdone: dangling dependency");
	3483
	3484	indirdep = aip->ai_indirdep;
	3485	if (indirdep->ir_state & UNDONE) {
	3486	LIST_REMOVE(aip, ai_next);
	3487	LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
	3488	return;
	3489	}
	3490	((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
	3491	aip->ai_newblkno;
	3492	LIST_REMOVE(aip, ai_next);
	3493	if (aip->ai_freefrag != NULL)
	3494	add_to_worklist(&aip->ai_freefrag->ff_list);
	3495	WORKITEM_FREE(aip, D_ALLOCINDIR);
	3496	}
	3497
	3498	/*
	3499	* Called from within softdep_disk_write_complete above to restore
	3500	* in-memory inode block contents to their most up-to-date state. Note
	3501	* that this routine is always called from interrupt level with further
	3502	* splbio interrupts blocked.
	3503	*
	3504	* Parameters:
	3505	* bp: buffer containing the inode block
	3506	*/
	3507	static int
	3508	handle_written_inodeblock(struct inodedep inodedep, struct buf bp)
	3509	{
	3510	struct worklist wk, filefree;
	3511	struct allocdirect adp, nextadp;
	3512	struct ufs1_dinode *dp;
	3513	int hadchanges;
	3514
	3515	if ((inodedep->id_state & IOSTARTED) == 0)
	3516	panic("handle_written_inodeblock: not started");
	3517
	3518	inodedep->id_state &= ~IOSTARTED;
	3519	dp = (struct ufs1_dinode *)bp->b_data +
	3520	ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
	3521	/*
	3522	* If we had to rollback the inode allocation because of
	3523	* bitmaps being incomplete, then simply restore it.
	3524	* Keep the block dirty so that it will not be reclaimed until
	3525	* all associated dependencies have been cleared and the
	3526	* corresponding updates written to disk.
	3527	*/
	3528	if (inodedep->id_savedino != NULL) {
	3529	dp = inodedep->id_savedino;
	3530	kfree(inodedep->id_savedino, M_INODEDEP);
	3531	inodedep->id_savedino = NULL;
	3532	if ((bp->b_flags & B_DELWRI) == 0)
	3533	stat_inode_bitmap++;
	3534	bdirty(bp);
	3535	return (1);
	3536	}
	3537	inodedep->id_state \|= COMPLETE;
	3538	/*
	3539	* Roll forward anything that had to be rolled back before
	3540	* the inode could be updated.
	3541	*/
	3542	hadchanges = 0;
	3543	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
	3544	nextadp = TAILQ_NEXT(adp, ad_next);
	3545	if (adp->ad_state & ATTACHED)
	3546	panic("handle_written_inodeblock: new entry");
	3547
	3548	if (adp->ad_lbn < NDADDR) {
	3549	if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno) {
	3550	panic("%s: %s #%ld mismatch %d != %d",
	3551	"handle_written_inodeblock",
	3552	"direct pointer", adp->ad_lbn,
	3553	dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
	3554	}
	3555	dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
	3556	} else {
	3557	if (dp->di_ib[adp->ad_lbn - NDADDR] != 0) {
	3558	panic("%s: %s #%ld allocated as %d",
	3559	"handle_written_inodeblock",
	3560	"indirect pointer", adp->ad_lbn - NDADDR,
	3561	dp->di_ib[adp->ad_lbn - NDADDR]);
	3562	}
	3563	dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
	3564	}
	3565	adp->ad_state &= ~UNDONE;
	3566	adp->ad_state \|= ATTACHED;
	3567	hadchanges = 1;
	3568	}
	3569	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
	3570	stat_direct_blk_ptrs++;
	3571	/*
	3572	* Reset the file size to its most up-to-date value.
	3573	*/
	3574	if (inodedep->id_savedsize == -1) {
	3575	panic("handle_written_inodeblock: bad size");
	3576	}
	3577	if (dp->di_size != inodedep->id_savedsize) {
	3578	dp->di_size = inodedep->id_savedsize;
	3579	hadchanges = 1;
	3580	}
	3581	inodedep->id_savedsize = -1;
	3582	/*
	3583	* If there were any rollbacks in the inode block, then it must be
	3584	* marked dirty so that its will eventually get written back in
	3585	* its correct form.
	3586	*/
	3587	if (hadchanges)
	3588	bdirty(bp);
	3589	/*
	3590	* Process any allocdirects that completed during the update.
	3591	*/
	3592	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
	3593	handle_allocdirect_partdone(adp);
	3594	/*
	3595	* Process deallocations that were held pending until the
	3596	* inode had been written to disk. Freeing of the inode
	3597	* is delayed until after all blocks have been freed to
	3598	* avoid creation of new <vfsid, inum, lbn> triples
	3599	* before the old ones have been deleted.
	3600	*/
	3601	filefree = NULL;
	3602	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
	3603	WORKLIST_REMOVE(wk);
	3604	switch (wk->wk_type) {
	3605
	3606	case D_FREEFILE:
	3607	/*
	3608	* We defer adding filefree to the worklist until
	3609	* all other additions have been made to ensure
	3610	* that it will be done after all the old blocks
	3611	* have been freed.
	3612	*/
	3613	if (filefree != NULL) {
	3614	panic("handle_written_inodeblock: filefree");
	3615	}
	3616	filefree = wk;
	3617	continue;
	3618
	3619	case D_MKDIR:
	3620	handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
	3621	continue;
	3622
	3623	case D_DIRADD:
	3624	diradd_inode_written(WK_DIRADD(wk), inodedep);
	3625	continue;
	3626
	3627	case D_FREEBLKS:
	3628	wk->wk_state \|= COMPLETE;
	3629	if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE)
	3630	continue;
	3631	/* -- fall through -- */
	3632	case D_FREEFRAG:
	3633	case D_DIRREM:
	3634	add_to_worklist(wk);
	3635	continue;
	3636
	3637	default:
	3638	panic("handle_written_inodeblock: Unknown type %s",
	3639	TYPENAME(wk->wk_type));
	3640	/* NOTREACHED */
	3641	}
	3642	}
	3643	if (filefree != NULL) {
	3644	if (free_inodedep(inodedep) == 0) {
	3645	panic("handle_written_inodeblock: live inodedep");
	3646	}
	3647	add_to_worklist(filefree);
	3648	return (0);
	3649	}
	3650
	3651	/*
	3652	* If no outstanding dependencies, free it.
	3653	*/
	3654	if (free_inodedep(inodedep) \|\| TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
	3655	return (0);
	3656	return (hadchanges);
	3657	}
	3658
	3659	/*
	3660	* Process a diradd entry after its dependent inode has been written.
	3661	* This routine must be called with splbio interrupts blocked.
		*
		* Parameters:
		* dap: the directory-add dependency to mark COMPLETE
		* inodedep: dependency record whose id_pendinghd list receives dap
	3662	*/
	3663	static void
	3664	diradd_inode_written(struct diradd *dap, struct inodedep *inodedep)
	3665	{
	3666	struct pagedep *pagedep;
	3667
	3668	dap->da_state |= COMPLETE;
	3669	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
		/*
		* Every bit of ALLCOMPLETE is now set, so the entry moves to
		* the pending list of its pagedep. When DIRCHG is set the
		* pagedep is reached through da_previous instead.
		*/
	3670	if (dap->da_state & DIRCHG)
	3671	pagedep = dap->da_previous->dm_pagedep;
	3672	else
	3673	pagedep = dap->da_pagedep;
	3674	LIST_REMOVE(dap, da_pdlist);
	3675	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	3676	}
		/* In every case the diradd is queued on the inodedep's pending list. */
	3677	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
	3678	}
	3679
	3680	/*
	3681	* Handle the completion of a mkdir dependency.
		*
		* Parameters:
		* mkdir: the mkdir dependency that completed; it is unlinked from
		*        its md_mkdirs list and freed before returning
		* type: the state mkdir->md_state must equal (panics otherwise);
		*       this bit is cleared from the associated diradd's state
	3682	*/
	3683	static void
	3684	handle_written_mkdir(struct mkdir *mkdir, int type)
	3685	{
	3686	struct diradd *dap;
	3687	struct pagedep *pagedep;
	3688
	3689	if (mkdir->md_state != type) {
	3690	panic("handle_written_mkdir: bad type");
	3691	}
	3692	dap = mkdir->md_diradd;
	3693	dap->da_state &= ~type;
		/* Once neither mkdir bit remains, the diradd's dependencies are complete. */
	3694	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
	3695	dap->da_state |= DEPCOMPLETE;
	3696	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
		/* Fully complete: move the diradd to its pagedep's pending list. */
	3697	if (dap->da_state & DIRCHG)
	3698	pagedep = dap->da_previous->dm_pagedep;
	3699	else
	3700	pagedep = dap->da_pagedep;
	3701	LIST_REMOVE(dap, da_pdlist);
	3702	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	3703	}
	3704	LIST_REMOVE(mkdir, md_mkdirs);
	3705	WORKITEM_FREE(mkdir, D_MKDIR);
	3706	}
	3707
	3708	/*
	3709	* Called from within softdep_disk_write_complete above.
	3710	* A write operation was just completed. Removed inodes can
	3711	* now be freed and associated block pointers may be committed.
	3712	* Note that this routine is always called from interrupt level
	3713	* with further splbio interrupts blocked.
	3714	*
	3715	* Parameters:
		* pagedep: the page dependency attached to the written buffer
	3716	* bp: buffer containing the written page
		*
		* Returns 0 when the pagedep had no remaining dependencies and was
		* freed, 1 when it was kept.
	3717	*/
	3718	static int
	3719	handle_written_filepage(struct pagedep *pagedep, struct buf *bp)
	3720	{
	3721	struct dirrem *dirrem;
	3722	struct diradd *dap, *nextdap;
	3723	struct direct *ep;
	3724	int i, chgs;
	3725
	3726	if ((pagedep->pd_state & IOSTARTED) == 0) {
	3727	panic("handle_written_filepage: not started");
	3728	}
	3729	pagedep->pd_state &= ~IOSTARTED;
	3730	/*
	3731	* Process any directory removals that have been committed.
	3732	*/
	3733	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
	3734	LIST_REMOVE(dirrem, dm_next);
	3735	dirrem->dm_dirinum = pagedep->pd_ino;
	3736	add_to_worklist(&dirrem->dm_list);
	3737	}
	3738	/*
	3739	* Free any directory additions that have been committed.
	3740	*/
	3741	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
	3742	free_diradd(dap);
	3743	/*
	3744	* Uncommitted directory entries must be restored.
	3745	*/
	3746	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
		/*
		* nextdap is fetched before the body runs because the body
		* may unlink dap from this hash chain.
		*/
	3747	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
	3748	dap = nextdap) {
	3749	nextdap = LIST_NEXT(dap, da_pdlist);
	3750	if (dap->da_state & ATTACHED) {
	3751	panic("handle_written_filepage: attached");
	3752	}
		/* Put the new inode number back into the in-memory directory entry. */
	3753	ep = (struct direct *)
	3754	((char *)bp->b_data + dap->da_offset);
	3755	ep->d_ino = dap->da_newinum;
	3756	dap->da_state &= ~UNDONE;
	3757	dap->da_state |= ATTACHED;
	3758	chgs = 1;
	3759	/*
	3760	* If the inode referenced by the directory has
	3761	* been written out, then the dependency can be
	3762	* moved to the pending list.
	3763	*/
	3764	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
	3765	LIST_REMOVE(dap, da_pdlist);
	3766	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
	3767	da_pdlist);
	3768	}
	3769	}
	3770	}
	3771	/*
	3772	* If there were any rollbacks in the directory, then it must be
	3773	* marked dirty so that it will eventually get written back in
	3774	* its correct form.
	3775	*/
	3776	if (chgs) {
	3777	if ((bp->b_flags & B_DELWRI) == 0)
	3778	stat_dir_entry++;
	3779	bdirty(bp);
	3780	}
	3781	/*
	3782	* If no dependencies remain, the pagedep will be freed.
	3783	* Otherwise it will remain to update the page before it
	3784	* is written back to disk.
	3785	*/
	3786	if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
		/* i reaches DAHASHSZ only if every diradd hash chain is empty. */
	3787	for (i = 0; i < DAHASHSZ; i++)
	3788	if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
	3789	break;
	3790	if (i == DAHASHSZ) {
	3791	LIST_REMOVE(pagedep, pd_hash);
	3792	WORKITEM_FREE(pagedep, D_PAGEDEP);
	3793	return (0);
	3794	}
	3795	}
	3796	return (1);
	3797	}
	3798
	3799	/*
	3800	* Writing back in-core inode structures.
	3801	*
	3802	* The filesystem only accesses an inode's contents when it occupies an
	3803	* "in-core" inode structure. These "in-core" structures are separate from
	3804	* the page frames used to cache inode blocks. Only the latter are
	3805	* transferred to/from the disk. So, when the updated contents of the
	3806	* "in-core" inode structure are copied to the corresponding in-memory inode
	3807	* block, the dependencies are also transferred. The following procedure is
	3808	* called when copying a dirty "in-core" inode to a cached inode block.
	3809	*/
	3810
	3811	/*
	3812	* Called when an inode is loaded from disk. If the effective link count
	3813	* differed from the actual link count when it was last flushed, then we
	3814	* need to ensure that the correct effective link count is put back.
	3815	*
	3816	* Parameters:
	3817	* ip: the "in_core" copy of the inode
	3818	*/
	3819	void
	3820	softdep_load_inodeblock(struct inode *ip)
	3821	{
	3822	struct inodedep *dep;
	3823
	3824	/*
	3825	* Seed the effective link count from the actual link count,
	3826	* then subtract the recorded delta when a dependency record
	3827	* is found for this inode. The lookup and the adjustment are
	3828	* both done while holding lk.
	3829	*/
	3830	ip->i_effnlink = ip->i_nlink;
	3831	ACQUIRE_LOCK(&lk);
	3832	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &dep) != 0) {
	3833	ip->i_effnlink -= dep->id_nlinkdelta;
	3834	}
	3835	FREE_LOCK(&lk);
	3836	}
	3836
	3837	/*
	3838	* This routine is called just before the "in-core" inode
	3839	* information is to be copied to the in-memory inode block.
	3840	* Recall that an inode block contains several inodes. If
	3841	* the force flag is set, then the dependencies will be
	3842	* cleared so that the update can always be made. Note that
	3843	* the buffer is locked when this routine is called, so we
	3844	* will never be in the middle of writing the inode block
	3845	* to disk.
	3846	*
	3847	* Parameters:
	3848	* ip: the "in_core" copy of the inode
	3849	* bp: the buffer containing the inode block
	3850	* waitfor: nonzero => update must be allowed
	3851	*/
	3852	void
	3853	softdep_update_inodeblock(struct inode *ip, struct buf *bp,
	3854	int waitfor)
	3855	{
	3856	struct inodedep *inodedep;
	3857	struct worklist *wk;
	3858	struct buf *ibp;
	3859	int error, gotit;
	3860
	3861	/*
	3862	* If the effective link count is not equal to the actual link
	3863	* count, then we must track the difference in an inodedep while
	3864	* the inode is (potentially) tossed out of the cache. Otherwise,
	3865	* if there is no existing inodedep, then there are no dependencies
	3866	* to track.
	3867	*/
	3868	ACQUIRE_LOCK(&lk);
	3869	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
	3870	FREE_LOCK(&lk);
	3871	if (ip->i_effnlink != ip->i_nlink)
	3872	panic("softdep_update_inodeblock: bad link count");
	3873	return;
	3874	}
		/* The recorded delta must match what the in-core inode shows. */
	3875	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) {
	3876	panic("softdep_update_inodeblock: bad delta");
	3877	}
	3878	/*
	3879	* Changes have been initiated. Anything depending on these
	3880	* changes cannot occur until this inode has been written.
	3881	*/
	3882	inodedep->id_state &= ~COMPLETE;
	3883	if ((inodedep->id_state & ONWORKLIST) == 0)
	3884	WORKLIST_INSERT_BP(bp, &inodedep->id_list);
	3885	/*
	3886	* Any new dependencies associated with the incore inode must
	3887	* now be moved to the list associated with the buffer holding
	3888	* the in-memory copy of the inode. Once merged process any
	3889	* allocdirects that are completed by the merger.
	3890	*/
	3891	merge_inode_lists(inodedep);
	3892	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
	3893	handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
	3894	/*
	3895	* Now that the inode has been pushed into the buffer, the
	3896	* operations dependent on the inode being written to disk
	3897	* can be moved to the id_bufwait so that they will be
	3898	* processed when the buffer I/O completes.
	3899	*/
	3900	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
	3901	WORKLIST_REMOVE(wk);
	3902	WORKLIST_INSERT(&inodedep->id_bufwait, wk);
	3903	}
	3904	/*
	3905	* Newly allocated inodes cannot be written until the bitmap
	3906	* that allocates them have been written (indicated by
	3907	* DEPCOMPLETE being set in id_state). If we are doing a
	3908	* forced sync (e.g., an fsync on a file), we force the bitmap
	3909	* to be written so that the update can be done.
	3910	*/
	3911	if (waitfor == 0) {
	3912	FREE_LOCK(&lk);
	3913	return;
	3914	}
	3915	retry:
	3916	if ((inodedep->id_state & DEPCOMPLETE) != 0) {
	3917	FREE_LOCK(&lk);
	3918	return;
	3919	}
	3920	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
	3921	if (gotit == 0) {
		/*
		* The buffer could not be obtained. Look the inodedep up
		* again: if it still exists, re-check its state from the
		* top; if it is gone there is nothing left to wait for.
		*/
	3922	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) != 0)
	3923	goto retry;
	3924	FREE_LOCK(&lk);
	3925	return;
	3926	}
		/* Capture the buffer pointer while lk is held, then write with lk dropped. */
	3927	ibp = inodedep->id_buf;
	3928	FREE_LOCK(&lk);
	3929	if ((error = bwrite(ibp)) != 0)
	3930	softdep_error("softdep_update_inodeblock: bwrite", error);
	3931	}
	3932
	3933	/*
	3934	* Merge the new inode dependency list (id_newinoupdt) into the old
	3935	* inode dependency list (id_inoupdt). This routine must be called
	3936	* with splbio interrupts blocked.
		*
		* Entries are placed by comparing ad_lbn. When a new entry has the
		* same ad_lbn as an existing one, the pair is handed to
		* allocdirect_merge(). On return id_newinoupdt is empty.
		*
		* Parameters:
		* inodedep: the dependency record whose two lists are merged
	3937	*/
	3938	static void
	3939	merge_inode_lists(struct inodedep *inodedep)
	3940	{
	3941	struct allocdirect *listadp, *newadp;
	3942
	3943	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
	3944	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
		/* Advance through the old list until its entry is not below the new one. */
	3945	if (listadp->ad_lbn < newadp->ad_lbn) {
	3946	listadp = TAILQ_NEXT(listadp, ad_next);
	3947	continue;
	3948	}
	3949	TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
	3950	TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
	3951	if (listadp->ad_lbn == newadp->ad_lbn) {
		/* Same block: merge the pair and continue from the new entry. */
	3952	allocdirect_merge(&inodedep->id_inoupdt, newadp,
	3953	listadp);
	3954	listadp = newadp;
	3955	}
	3956	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
	3957	}
		/* Whatever is left on the new list is appended to the old list. */
	3958	while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
	3959	TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
	3960	TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
	3961	}
	3962	}
	3963
	3964	/*
	3965	* If we are doing an fsync, then we must ensure that any directory
	3966	* entries for the inode have been written after the inode gets to disk.
	3967	*
	3968	* bioops callback - hold io_token
	3969	*
	3970	* Parameters:
	3971	* vp: the "in_core" copy of the inode
		*
		* Returns 0 on success, or the error reported by VFS_VGET, ffs_update,
		* bread or bwrite while flushing a parent directory.
	3972	*/
	3973	static int
	3974	softdep_fsync(struct vnode *vp)
	3975	{
	3976	struct inodedep *inodedep;
	3977	struct pagedep *pagedep;
	3978	struct worklist *wk;
	3979	struct diradd *dap;
	3980	struct mount *mnt;
	3981	struct vnode *pvp;
	3982	struct inode *ip;
	3983	struct buf *bp;
	3984	struct fs *fs;
	3985	int error, flushparent;
	3986	ino_t parentino;
	3987	ufs_lbn_t lbn;
	3988
	3989	/*
	3990	* Move check from original kernel code, possibly not needed any
	3991	* more with the per-mount bioops.
	3992	*/
	3993	if ((vp->v_mount->mnt_flag & MNT_SOFTDEP) == 0)
	3994	return (0);
	3995
	3996	ip = VTOI(vp);
	3997	fs = ip->i_fs;
	3998	ACQUIRE_LOCK(&lk);
	3999	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
	4000	FREE_LOCK(&lk);
	4001	return (0);
	4002	}
		/* All four of these lists must already be empty when we get here. */
	4003	if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
	4004	LIST_FIRST(&inodedep->id_bufwait) != NULL ||
	4005	TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
	4006	TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
	4007	panic("softdep_fsync: pending ops");
	4008	}
	4009	for (error = 0, flushparent = 0; ; ) {
	4010	if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
	4011	break;
	4012	if (wk->wk_type != D_DIRADD) {
	4013	panic("softdep_fsync: Unexpected type %s",
	4014	TYPENAME(wk->wk_type));
	4015	}
	4016	dap = WK_DIRADD(wk);
	4017	/*
	4018	* Flush our parent if this directory entry
	4019	* has a MKDIR_PARENT dependency.
	4020	*/
	4021	if (dap->da_state & DIRCHG)
	4022	pagedep = dap->da_previous->dm_pagedep;
	4023	else
	4024	pagedep = dap->da_pagedep;
		/*
		* Copy what is needed out of the pagedep now; lk is dropped
		* below and the values are used after that point.
		*/
	4025	mnt = pagedep->pd_mnt;
	4026	parentino = pagedep->pd_ino;
	4027	lbn = pagedep->pd_lbn;
	4028	if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) {
	4029	panic("softdep_fsync: dirty");
	4030	}
	4031	flushparent = dap->da_state & MKDIR_PARENT;
	4032	/*
	4033	* If we are being fsync'ed as part of vgone'ing this vnode,
	4034	* then we will not be able to release and recover the
	4035	* vnode below, so we just have to give up on writing its
	4036	* directory entry out. It will eventually be written, just
	4037	* not now, but then the user was not asking to have it
	4038	* written, so we are not breaking any promises.
	4039	*/
	4040	if (vp->v_flag & VRECLAIMED)
	4041	break;
	4042	/*
	4043	* We prevent deadlock by always fetching inodes from the
	4044	* root, moving down the directory tree. Thus, when fetching
	4045	* our parent directory, we must unlock ourselves before
	4046	* requesting the lock on our parent. See the comment in
	4047	* ufs_lookup for details on possible races.
	4048	*/
	4049	FREE_LOCK(&lk);
	4050	vn_unlock(vp);
	4051	error = VFS_VGET(mnt, NULL, parentino, &pvp);
	4052	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	4053	if (error != 0) {
	4054	return (error);
	4055	}
	4056	if (flushparent) {
	4057	if ((error = ffs_update(pvp, 1)) != 0) {
	4058	vput(pvp);
	4059	return (error);
	4060	}
	4061	}
	4062	/*
	4063	* Flush directory page containing the inode's name.
	4064	*/
	4065	error = bread(pvp, lblktodoff(fs, lbn), blksize(fs, VTOI(pvp), lbn), &bp);
	4066	if (error == 0)
	4067	error = bwrite(bp);
	4068	vput(pvp);
	4069	if (error != 0) {
	4070	return (error);
	4071	}
		/* Retake lk and look the inodedep up again before the next pass. */
	4072	ACQUIRE_LOCK(&lk);
	4073	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
	4074	break;
	4075	}
	4076	FREE_LOCK(&lk);
	4077	return (0);
	4078	}
	4079
	4080	/*
	4081	* Flush all the dirty bitmaps associated with the block device
	4082	* before flushing the rest of the dirty blocks so as to reduce
	4083	* the number of dependencies that will have to be rolled back.
		*
		* Parameters:
		* vp: vnode of the block device; panics if it is not a disk
	4084	*/
	4085	static int softdep_fsync_mountdev_bp(struct buf *bp, void *data);
	4086
	4087	void
	4088	softdep_fsync_mountdev(struct vnode *vp)
	4089	{
	4090	if (!vn_isdisk(vp, NULL))
	4091	panic("softdep_fsync_mountdev: vnode not a disk");
	4092	ACQUIRE_LOCK(&lk);
		/* Scan the vnode's dirty-buffer tree while holding its token. */
	4093	lwkt_gettoken(&vp->v_token);
	4094	RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
	4095	softdep_fsync_mountdev_bp, vp);
	4096	lwkt_reltoken(&vp->v_token);
	4097	drain_output(vp, 1);
	4098	FREE_LOCK(&lk);
	4099	}
	4100
	4101	static int
	4102	softdep_fsync_mountdev_bp(struct buf bp, void data)
	4103	{
	4104	struct worklist *wk;
	4105	struct vnode *vp = data;
	4106
	4107	/*
	4108	* If it is already scheduled, skip to the next buffer.
	4109	*/
	4110	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT))
	4111	return(0);
	4112	if (bp->b_vp != vp \|\| (bp->b_flags & B_DELWRI) == 0) {
	4113	BUF_UNLOCK(bp);
	4114	kprintf("softdep_fsync_mountdev_bp: warning, buffer %p ripped out from under vnode %p\n", bp, vp);
	4115	return(0);
	4116	}
	4117	/*
	4118	* We are only interested in bitmaps with outstanding
	4119	* dependencies.
	4120	*/
	4121	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL \|\|
	4122	wk->wk_type != D_BMSAFEMAP) {
	4123	BUF_UNLOCK(bp);
	4124	return(0);
	4125	}
	4126	bremfree(bp);
	4127	FREE_LOCK(&lk);
	4128	(void) bawrite(bp);
	4129	ACQUIRE_LOCK(&lk);
	4130	return(0);
	4131	}
	4132
	4133	/*
	4134	* This routine is called when we are trying to synchronously flush a
	4135	* file. This routine must eliminate any filesystem metadata dependencies
	4136	* so that the syncing routine can succeed by pushing the dirty blocks
	4137	* associated with the file. If any I/O errors occur, they are returned.
		*
		* The info structure carries the vnode and the current pass's wait
		* mode to the RB_SCAN callback, softdep_sync_metadata_bp().
		*
		* Parameters:
		* vp: the vnode being flushed
		* td: not referenced in this routine
	4138	*/
	4139	struct softdep_sync_metadata_info {
	4140	struct vnode *vp;
	4141	int waitfor;
	4142	};
	4143
	4144	static int softdep_sync_metadata_bp(struct buf *bp, void *data);
	4145
	4146	int
	4147	softdep_sync_metadata(struct vnode *vp, struct thread *td)
	4148	{
	4149	struct softdep_sync_metadata_info info;
	4150	int error, waitfor;
	4151
	4152	/*
	4153	* Check whether this vnode is involved in a filesystem
	4154	* that is doing soft dependency processing.
	4155	*/
	4156	if (!vn_isdisk(vp, NULL)) {
	4157	if (!DOINGSOFTDEP(vp))
	4158	return (0);
	4159	} else
	4160	if (vp->v_rdev->si_mountpoint == NULL ||
	4161	(vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP) == 0)
	4162	return (0);
	4163	/*
	4164	* Ensure that any direct block dependencies have been cleared.
	4165	*/
	4166	ACQUIRE_LOCK(&lk);
	4167	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
	4168	FREE_LOCK(&lk);
	4169	return (error);
	4170	}
	4171	/*
	4172	* For most files, the only metadata dependencies are the
	4173	* cylinder group maps that allocate their inode or blocks.
	4174	* The block allocation dependencies can be found by traversing
	4175	* the dependency lists for any buffers that remain on their
	4176	* dirty buffer list. The inode allocation dependency will
	4177	* be resolved when the inode is updated with MNT_WAIT.
	4178	* This work is done in two passes. The first pass grabs most
	4179	* of the buffers and begins asynchronously writing them. The
	4180	* only way to wait for these asynchronous writes is to sleep
	4181	* on the filesystem vnode which may stay busy for a long time
	4182	* if the filesystem is active. So, instead, we make a second
	4183	* pass over the dependencies blocking on each write. In the
	4184	* usual case we will be blocking against a write that we
	4185	* initiated, so when it is done the dependency will have been
	4186	* resolved. Thus the second pass is expected to end quickly.
	4187	*/
	4188	waitfor = MNT_NOWAIT;
	4189	top:
	4190	/*
	4191	* We must wait for any I/O in progress to finish so that
	4192	* all potential buffers on the dirty list will be visible.
	4193	*/
	4194	drain_output(vp, 1);
	4195
	4196	info.vp = vp;
	4197	info.waitfor = waitfor;
	4198	lwkt_gettoken(&vp->v_token);
	4199	error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
	4200	softdep_sync_metadata_bp, &info);
	4201	lwkt_reltoken(&vp->v_token);
		/* The callback reports failures as negated error codes. */
	4202	if (error < 0) {
	4203	FREE_LOCK(&lk);
	4204	return(-error); /* error code */
	4205	}
	4206
	4207	/*
	4208	* The brief unlock is to allow any pent up dependency
	4209	* processing to be done. Then proceed with the second pass.
	4210	*/
	4211	if (waitfor & MNT_NOWAIT) {
	4212	waitfor = MNT_WAIT;
	4213	FREE_LOCK(&lk);
	4214	ACQUIRE_LOCK(&lk);
	4215	goto top;
	4216	}
	4217
	4218	/*
	4219	* If we have managed to get rid of all the dirty buffers,
	4220	* then we are done. For certain directories and block
	4221	* devices, we may need to do further work.
	4222	*
	4223	* We must wait for any I/O in progress to finish so that
	4224	* all potential buffers on the dirty list will be visible.
	4225	*/
	4226	drain_output(vp, 1);
	4227	if (RB_EMPTY(&vp->v_rbdirty_tree)) {
	4228	FREE_LOCK(&lk);
	4229	return (0);
	4230	}
	4231
	4232	FREE_LOCK(&lk);
	4233	/*
	4234	* If we are trying to sync a block device, some of its buffers may
	4235	* contain metadata that cannot be written until the contents of some
	4236	* partially written files have been written to disk. The only easy
	4237	* way to accomplish this is to sync the entire filesystem (luckily
	4238	* this happens rarely).
	4239	*/
	4240	if (vn_isdisk(vp, NULL) &&
	4241	vp->v_rdev &&
	4242	vp->v_rdev->si_mountpoint && !vn_islocked(vp) &&
	4243	(error = VFS_SYNC(vp->v_rdev->si_mountpoint, MNT_WAIT)) != 0)
	4244	return (error);
	4245	return (0);
	4246	}
	4247
	4248	static int
	4249	softdep_sync_metadata_bp(struct buf bp, void data)
	4250	{
	4251	struct softdep_sync_metadata_info *info = data;
	4252	struct pagedep *pagedep;
	4253	struct allocdirect *adp;
	4254	struct allocindir *aip;
	4255	struct worklist *wk;
	4256	struct buf *nbp;
	4257	int error;
	4258	int i;
	4259
	4260	if (getdirtybuf(&bp, MNT_WAIT) == 0) {
	4261	kprintf("softdep_sync_metadata_bp(1): caught buf %p going away\n", bp);
	4262	return (1);
	4263	}
	4264	if (bp->b_vp != info->vp \|\| (bp->b_flags & B_DELWRI) == 0) {
	4265	kprintf("softdep_sync_metadata_bp(2): caught buf %p going away vp %p\n", bp, info->vp);
	4266	BUF_UNLOCK(bp);
	4267	return(1);
	4268	}
	4269
	4270	/*
	4271	* As we hold the buffer locked, none of its dependencies
	4272	* will disappear.
	4273	*/
	4274	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
	4275	switch (wk->wk_type) {
	4276
	4277	case D_ALLOCDIRECT:
	4278	adp = WK_ALLOCDIRECT(wk);
	4279	if (adp->ad_state & DEPCOMPLETE)
	4280	break;
	4281	nbp = adp->ad_buf;
	4282	if (getdirtybuf(&nbp, info->waitfor) == 0)
	4283	break;
	4284	FREE_LOCK(&lk);
	4285	if (info->waitfor & MNT_NOWAIT) {
	4286	bawrite(nbp);
	4287	} else if ((error = bwrite(nbp)) != 0) {
	4288	bawrite(bp);
	4289	ACQUIRE_LOCK(&lk);
	4290	return (-error);
	4291	}
	4292	ACQUIRE_LOCK(&lk);
	4293	break;
	4294
	4295	case D_ALLOCINDIR:
	4296	aip = WK_ALLOCINDIR(wk);
	4297	if (aip->ai_state & DEPCOMPLETE)
	4298	break;
	4299	nbp = aip->ai_buf;
	4300	if (getdirtybuf(&nbp, info->waitfor) == 0)
	4301	break;
	4302	FREE_LOCK(&lk);
	4303	if (info->waitfor & MNT_NOWAIT) {
	4304	bawrite(nbp);
	4305	} else if ((error = bwrite(nbp)) != 0) {
	4306	bawrite(bp);
	4307	ACQUIRE_LOCK(&lk);
	4308	return (-error);
	4309	}
	4310	ACQUIRE_LOCK(&lk);
	4311	break;
	4312
	4313	case D_INDIRDEP:
	4314	restart:
	4315
	4316	LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
	4317	if (aip->ai_state & DEPCOMPLETE)
	4318	continue;
	4319	nbp = aip->ai_buf;
	4320	if (getdirtybuf(&nbp, MNT_WAIT) == 0)
	4321	goto restart;
	4322	FREE_LOCK(&lk);
	4323	if ((error = bwrite(nbp)) != 0) {
	4324	bawrite(bp);
	4325	ACQUIRE_LOCK(&lk);
	4326	return (-error);
	4327	}
	4328	ACQUIRE_LOCK(&lk);
	4329	goto restart;
	4330	}
	4331	break;
	4332
	4333	case D_INODEDEP:
	4334	if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
	4335	WK_INODEDEP(wk)->id_ino)) != 0) {
	4336	FREE_LOCK(&lk);
	4337	bawrite(bp);
	4338	ACQUIRE_LOCK(&lk);
	4339	return (-error);
	4340	}
	4341	break;
	4342
	4343	case D_PAGEDEP:
	4344	/*
	4345	* We are trying to sync a directory that may
	4346	* have dependencies on both its own metadata
	4347	* and/or dependencies on the inodes of any
	4348	* recently allocated files. We walk its diradd
	4349	* lists pushing out the associated inode.
	4350	*/
	4351	pagedep = WK_PAGEDEP(wk);
	4352	for (i = 0; i < DAHASHSZ; i++) {
	4353	if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
	4354	continue;
	4355	if ((error =
	4356	flush_pagedep_deps(info->vp,
	4357	pagedep->pd_mnt,
	4358	&pagedep->pd_diraddhd[i]))) {
	4359	FREE_LOCK(&lk);
	4360	bawrite(bp);
	4361	ACQUIRE_LOCK(&lk);
	4362	return (-error);
	4363	}
	4364	}
	4365	break;
	4366
	4367	case D_MKDIR:
	4368	/*
	4369	* This case should never happen if the vnode has
	4370	* been properly sync'ed. However, if this function
	4371	* is used at a place where the vnode has not yet
	4372	* been sync'ed, this dependency can show up. So,
	4373	* rather than panic, just flush it.
	4374	*/
	4375	nbp = WK_MKDIR(wk)->md_buf;
	4376	if (getdirtybuf(&nbp, info->waitfor) == 0)
	4377	break;
	4378	FREE_LOCK(&lk);
	4379	if (info->waitfor & MNT_NOWAIT) {
	4380	bawrite(nbp);
	4381	} else if ((error = bwrite(nbp)) != 0) {
	4382	bawrite(bp);
	4383	ACQUIRE_LOCK(&lk);
	4384	return (-error);
	4385	}
	4386	ACQUIRE_LOCK(&lk);
	4387	break;
	4388
	4389	case D_BMSAFEMAP:
	4390	/*
	4391	* This case should never happen if the vnode has
	4392	* been properly sync'ed. However, if this function
	4393	* is used at a place where the vnode has not yet
	4394	* been sync'ed, this dependency can show up. So,
	4395	* rather than panic, just flush it.
	4396	*
	4397	* nbp can wind up == bp if a device node for the
	4398	* same filesystem is being fsynced at the same time,
	4399	* leading to a panic if we don't catch the case.
	4400	*/
	4401	nbp = WK_BMSAFEMAP(wk)->sm_buf;
	4402	if (nbp == bp)
	4403	break;
	4404	if (getdirtybuf(&nbp, info->waitfor) == 0)
	4405	break;
	4406	FREE_LOCK(&lk);
	4407	if (info->waitfor & MNT_NOWAIT) {
	4408	bawrite(nbp);
	4409	} else if ((error = bwrite(nbp)) != 0) {
	4410	bawrite(bp);
	4411	ACQUIRE_LOCK(&lk);
	4412	return (-error);
	4413	}
	4414	ACQUIRE_LOCK(&lk);
	4415	break;
	4416
	4417	default:
	4418	panic("softdep_sync_metadata: Unknown type %s",
	4419	TYPENAME(wk->wk_type));
	4420	/* NOTREACHED */
	4421	}
	4422	}
	4423	FREE_LOCK(&lk);
	4424	bawrite(bp);
	4425	ACQUIRE_LOCK(&lk);
	4426	return(0);
	4427	}
	4428
	4429	/*
	4430	* Flush the dependencies associated with an inodedep.
	4431	* Called with splbio blocked.
		*
		* Parameters:
		* fs: filesystem used to look up the inodedep
		* ino: inode number used to look up the inodedep
		*
		* Returns 0 when the inodedep is gone or both passes have completed,
		* or the error from a failed synchronous bwrite. lk is held on every
		* return, although it is dropped and retaken along the way.
	4432	*/
	4433	static int
	4434	flush_inodedep_deps(struct fs *fs, ino_t ino)
	4435	{
	4436	struct inodedep *inodedep;
	4437	struct allocdirect *adp;
	4438	int error, waitfor;
	4439	struct buf *bp;
	4440
	4441	/*
	4442	* This work is done in two passes. The first pass grabs most
	4443	* of the buffers and begins asynchronously writing them. The
	4444	* only way to wait for these asynchronous writes is to sleep
	4445	* on the filesystem vnode which may stay busy for a long time
	4446	* if the filesystem is active. So, instead, we make a second
	4447	* pass over the dependencies blocking on each write. In the
	4448	* usual case we will be blocking against a write that we
	4449	* initiated, so when it is done the dependency will have been
	4450	* resolved. Thus the second pass is expected to end quickly.
	4451	* We give a brief window at the top of the loop to allow
	4452	* any pending I/O to complete.
	4453	*/
	4454	for (waitfor = MNT_NOWAIT; ; ) {
	4455	FREE_LOCK(&lk);
	4456	ACQUIRE_LOCK(&lk);
		/* Look the inodedep up afresh on every iteration, since lk was just dropped. */
	4457	if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
	4458	return (0);
	4459	TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) {
	4460	if (adp->ad_state & DEPCOMPLETE)
	4461	continue;
	4462	bp = adp->ad_buf;
	4463	if (getdirtybuf(&bp, waitfor) == 0) {
		/*
		* Buffer not obtained: on the MNT_NOWAIT pass move on
		* to the next entry, otherwise leave the walk so the
		* outer loop starts over.
		*/
	4464	if (waitfor & MNT_NOWAIT)
	4465	continue;
	4466	break;
	4467	}
	4468	FREE_LOCK(&lk);
	4469	if (waitfor & MNT_NOWAIT) {
	4470	bawrite(bp);
	4471	} else if ((error = bwrite(bp)) != 0) {
	4472	ACQUIRE_LOCK(&lk);
	4473	return (error);
	4474	}
	4475	ACQUIRE_LOCK(&lk);
	4476	break;
	4477	}
		/*
		* adp is non-NULL only when the walk above ended with a break,
		* i.e. lk was dropped or a buffer was missed; restart the outer
		* loop in that case. NULL means the whole list was scanned.
		*/
	4478	if (adp != NULL)
	4479	continue;
		/* Same walk over the list of new dependencies. */
	4480	TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) {
	4481	if (adp->ad_state & DEPCOMPLETE)
	4482	continue;
	4483	bp = adp->ad_buf;
	4484	if (getdirtybuf(&bp, waitfor) == 0) {
	4485	if (waitfor & MNT_NOWAIT)
	4486	continue;
	4487	break;
	4488	}
	4489	FREE_LOCK(&lk);
	4490	if (waitfor & MNT_NOWAIT) {
	4491	bawrite(bp);
	4492	} else if ((error = bwrite(bp)) != 0) {
	4493	ACQUIRE_LOCK(&lk);
	4494	return (error);
	4495	}
	4496	ACQUIRE_LOCK(&lk);
	4497	break;
	4498	}
	4499	if (adp != NULL)
	4500	continue;
	4501	/*
	4502	* If pass2, we are done, otherwise do pass 2.
	4503	*/
	4504	if (waitfor == MNT_WAIT)
	4505	break;
	4506	waitfor = MNT_WAIT;
	4507	}
	4508	/*
	4509	* Try freeing inodedep in case all dependencies have been removed.
	4510	*/
	4511	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
	4512	(void) free_inodedep(inodedep);
	4513	return (0);
	4514	}
	4515
	4516	/*
	4517	* Eliminate a pagedep dependency by flushing out all its diradd dependencies.
	4518	* Called with splbio blocked.
		*
		* Parameters:
		* pvp: vnode passed to ffs_update() when an entry has MKDIR_PARENT set
		* mp: mount used for VFS_VGET() and to find the ufsmount
		* diraddhdp: head of the diradd list to drain
		*
		* Returns 0 once the list is empty, or the first error encountered.
		* lk is held on return in both cases.
	4519	*/
	4520	static int
	4521	flush_pagedep_deps(struct vnode *pvp, struct mount *mp,
	4522	struct diraddhd *diraddhdp)
	4523	{
	4524	struct inodedep *inodedep;
	4525	struct ufsmount *ump;
	4526	struct diradd *dap;
	4527	struct vnode *vp;
	4528	int gotit, error = 0;
	4529	struct buf *bp;
	4530	ino_t inum;
	4531
	4532	ump = VFSTOUFS(mp);
	4533	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
	4534	/*
	4535	* Flush ourselves if this directory entry
	4536	* has a MKDIR_PARENT dependency.
	4537	*/
	4538	if (dap->da_state & MKDIR_PARENT) {
		/* Every error break in this loop leaves with lk released. */
	4539	FREE_LOCK(&lk);
	4540	if ((error = ffs_update(pvp, 1)) != 0)
	4541	break;
	4542	ACQUIRE_LOCK(&lk);
	4543	/*
	4544	* If that cleared dependencies, go on to next.
	4545	*/
	4546	if (dap != LIST_FIRST(diraddhdp))
	4547	continue;
	4548	if (dap->da_state & MKDIR_PARENT) {
	4549	panic("flush_pagedep_deps: MKDIR_PARENT");
	4550	}
	4551	}
	4552	/*
	4553	* A newly allocated directory must have its "." and
	4554	* ".." entries written out before its name can be
	4555	* committed in its parent. We do not want or need
	4556	* the full semantics of a synchronous VOP_FSYNC as
	4557	* that may end up here again, once for each directory
	4558	* level in the filesystem. Instead, we push the blocks
	4559	* and wait for them to clear. We have to fsync twice
	4560	* because the first call may choose to defer blocks
	4561	* that still have dependencies, but deferral will
	4562	* happen at most once.
	4563	*/
	4564	inum = dap->da_newinum;
	4565	if (dap->da_state & MKDIR_BODY) {
	4566	FREE_LOCK(&lk);
	4567	if ((error = VFS_VGET(mp, NULL, inum, &vp)) != 0)
	4568	break;
	4569	if ((error=VOP_FSYNC(vp, MNT_NOWAIT, 0)) ||
	4570	(error=VOP_FSYNC(vp, MNT_NOWAIT, 0))) {
	4571	vput(vp);
	4572	break;
	4573	}
	4574	drain_output(vp, 0);
	4575	vput(vp);
	4576	ACQUIRE_LOCK(&lk);
	4577	/*
	4578	* If that cleared dependencies, go on to next.
	4579	*/
	4580	if (dap != LIST_FIRST(diraddhdp))
	4581	continue;
	4582	if (dap->da_state & MKDIR_BODY) {
	4583	panic("flush_pagedep_deps: MKDIR_BODY");
	4584	}
	4585	}
	4586	/*
	4587	* Flush the inode on which the directory entry depends.
	4588	* Having accounted for MKDIR_PARENT and MKDIR_BODY above,
	4589	* the only remaining dependency is that the updated inode
	4590	* count must get pushed to disk. The inode has already
	4591	* been pushed into its inode buffer (via VOP_UPDATE) at
	4592	* the time of the reference count change. So we need only
	4593	* locate that buffer, ensure that there will be no rollback
	4594	* caused by a bitmap dependency, then write the inode buffer.
	4595	*/
	4596	if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
	4597	panic("flush_pagedep_deps: lost inode");
	4598	}
	4599	/*
	4600	* If the inode still has bitmap dependencies,
	4601	* push them to disk.
	4602	*/
	4603	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
	4604	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
	4605	FREE_LOCK(&lk);
	4606	if (gotit && (error = bwrite(inodedep->id_buf)) != 0)
	4607	break;
	4608	ACQUIRE_LOCK(&lk);
	4609	if (dap != LIST_FIRST(diraddhdp))
	4610	continue;
	4611	}
	4612	/*
	4613	* If the inode is still sitting in a buffer waiting
	4614	* to be written, push it to disk.
	4615	*/
	4616	FREE_LOCK(&lk);
	4617	if ((error = bread(ump->um_devvp,
	4618	fsbtodoff(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
	4619	(int)ump->um_fs->fs_bsize, &bp)) != 0)
	4620	break;
	4621	if ((error = bwrite(bp)) != 0)
	4622	break;
	4623	ACQUIRE_LOCK(&lk);
	4624	/*
	4625	* If we have failed to get rid of all the dependencies
	4626	* then something is seriously wrong.
	4627	*/
	4628	if (dap == LIST_FIRST(diraddhdp)) {
	4629	panic("flush_pagedep_deps: flush failed");
	4630	}
	4631	}
		/* An error exit left the loop with lk released; retake it for the caller. */
	4632	if (error)
	4633	ACQUIRE_LOCK(&lk);
	4634	return (error);
	4635	}
	4636
	4637	/*
	4638	* A large burst of file addition or deletion activity can drive the
	4639	* memory load excessively high. First attempt to slow things down
	4640	* using the techniques below. If that fails, this routine requests
	4641	* the offending operations to fall back to running synchronously
	4642	* until the memory load returns to a reasonable level.
	4643	*/
	4644	int
	4645	softdep_slowdown(struct vnode *vp)
	4646	{
	4647	int hard_limit = max_softdeps * 11 / 10;
	4648
	4649	/* At or past either threshold: count the hit and report it. */
	4650	if (num_dirrem >= hard_limit / 2 || num_inodedep >= hard_limit) {
	4651	stat_sync_limit_hit += 1;
	4652	return (1);
	4653	}
	4654	return (0);
	4655	}
	4656
	4657	/*
	4658	* If memory utilization has gotten too high, deliberately slow things
	4659	* down and speed up the I/O processing.
		*
		* Parameters:
		* resource: FLUSH_INODES or FLUSH_REMOVE; any other value panics
		* islocked: nonzero when the caller already holds lk
		*
		* Returns 0 when called from the filesystem syncer thread (nothing is
		* done), 1 otherwise.
	4660	*/
	4661	static int
	4662	request_cleanup(int resource, int islocked)
	4663	{
	4664	struct thread *td = curthread; /* XXX */
	4665
	4666	/*
	4667	* We never hold up the filesystem syncer process.
	4668	*/
	4669	if (td == filesys_syncer)
	4670	return (0);
	4671	/*
	4672	* First check to see if the work list has gotten backlogged.
	4673	* If it has, co-opt this process to help clean up two entries.
	4674	* Because this process may hold inodes locked, we cannot
	4675	* handle any remove requests that might block on a locked
	4676	* inode as that could lead to deadlock.
	4677	*/
	4678	if (num_on_worklist > max_softdeps / 10) {
	4679	process_worklist_item(NULL, LK_NOWAIT);
	4680	process_worklist_item(NULL, LK_NOWAIT);
	4681	stat_worklist_push += 2;
	4682	return(1);
	4683	}
	4684
	4685	/*
	4686	* If we are resource constrained on inode dependencies, try
	4687	* flushing some dirty inodes. Otherwise, we are constrained
	4688	* by file deletions, so try accelerating flushes of directories
	4689	* with removal dependencies. We would like to do the cleanup
	4690	* here, but we probably hold an inode locked at this point and
	4691	* that might deadlock against one that we try to clean. So,
	4692	* the best that we can do is request the syncer daemon to do
	4693	* the cleanup for us.
	4694	*/
	4695	switch (resource) {
	4696
	4697	case FLUSH_INODES:
	4698	stat_ino_limit_push += 1;
	4699	req_clear_inodedeps += 1;
	4700	stat_countp = &stat_ino_limit_hit;
	4701	break;
	4702
	4703	case FLUSH_REMOVE:
	4704	stat_blk_limit_push += 1;
	4705	req_clear_remove += 1;
	4706	stat_countp = &stat_blk_limit_hit;
	4707	break;
	4708
	4709	default:
	4710	panic("request_cleanup: unknown type");
	4711	}
	4712	/*
	4713	* Hopefully the syncer daemon will catch up and awaken us.
	4714	* We wait at most tickdelay before proceeding in any case.
	4715	*/
		/* lk is taken here only if the caller did not already hold it. */
	4716	if (islocked == 0)
	4717	ACQUIRE_LOCK(&lk);
		/* The sleep timeout is tickdelay, with a floor of 2. */
	4718	lksleep(&proc_waiting, &lk, 0, "softupdate",
	4719	tickdelay > 2 ? tickdelay : 2);
	4720	if (islocked == 0)
	4721	FREE_LOCK(&lk);
	4722	return (1);
	4723	}
	4724
	4725	/*
	4726	* Flush out a directory with at least one removal dependency in an effort to
	4727	* reduce the number of dirrem, freefile, and freeblks dependency structures.
	4728	*/
	4729	static void
	4730	clear_remove(struct thread *td)
	4731	{
	4732	struct pagedep_hashhead *pagedephd;
	4733	struct pagedep *pagedep;
	4734	static int next = 0;
	4735	struct mount *mp;
	4736	struct vnode *vp;
	4737	int error, cnt;
	4738	ino_t ino;
	4739
	4740	ACQUIRE_LOCK(&lk);
	4741	for (cnt = 0; cnt < pagedep_hash; cnt++) {
	4742	pagedephd = &pagedep_hashtbl[next++];
	4743	if (next >= pagedep_hash)
	4744	next = 0;
	4745	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
	4746	if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
	4747	continue;
	4748	mp = pagedep->pd_mnt;
	4749	ino = pagedep->pd_ino;
	4750	FREE_LOCK(&lk);
	4751	if ((error = VFS_VGET(mp, NULL, ino, &vp)) != 0) {
	4752	softdep_error("clear_remove: vget", error);
	4753	return;
	4754	}
	4755	if ((error = VOP_FSYNC(vp, MNT_NOWAIT, 0)))
	4756	softdep_error("clear_remove: fsync", error);
	4757	drain_output(vp, 0);
	4758	vput(vp);
	4759	return;
	4760	}
	4761	}
	4762	FREE_LOCK(&lk);
	4763	}
	4764
	4765	/*
	4766	* Clear out a block of dirty inodes in an effort to reduce
	4767	* the number of inodedep dependency structures.
	4768	*/
	4769	struct clear_inodedeps_info {
	4770	struct fs *fs;
	4771	struct mount *mp;
	4772	};
	4773
	4774	static int
	4775	clear_inodedeps_mountlist_callback(struct mount mp, void data)
	4776	{
	4777	struct clear_inodedeps_info *info = data;
	4778
	4779	if ((mp->mnt_flag & MNT_SOFTDEP) && info->fs == VFSTOUFS(mp)->um_fs) {
	4780	info->mp = mp;
	4781	return(-1);
	4782	}
	4783	return(0);
	4784	}
	4785
	4786	static void
	4787	clear_inodedeps(struct thread *td)
	4788	{
	4789	struct clear_inodedeps_info info;
	4790	struct inodedep_hashhead *inodedephd;
	4791	struct inodedep *inodedep;
	4792	static int next = 0;
	4793	struct vnode *vp;
	4794	struct fs *fs;
	4795	int error, cnt;
	4796	ino_t firstino, lastino, ino;
	4797
	4798	ACQUIRE_LOCK(&lk);
	4799	/*
	4800	* Pick a random inode dependency to be cleared.
	4801	* We will then gather up all the inodes in its block
	4802	* that have dependencies and flush them out.
	4803	*/
	4804	for (cnt = 0; cnt < inodedep_hash; cnt++) {
	4805	inodedephd = &inodedep_hashtbl[next++];
	4806	if (next >= inodedep_hash)
	4807	next = 0;
	4808	if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
	4809	break;
	4810	}
	4811	if (inodedep == NULL) {
	4812	FREE_LOCK(&lk);
	4813	return;
	4814	}
	4815	/*
	4816	* Ugly code to find mount point given pointer to superblock.
	4817	*/
	4818	fs = inodedep->id_fs;
	4819	info.mp = NULL;
	4820	info.fs = fs;
	4821	mountlist_scan(clear_inodedeps_mountlist_callback,
	4822	&info, MNTSCAN_FORWARD\|MNTSCAN_NOBUSY);
	4823	/*
	4824	* Find the last inode in the block with dependencies.
	4825	*/
	4826	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
	4827	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
	4828	if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
	4829	break;
	4830	/*
	4831	* Asynchronously push all but the last inode with dependencies.
	4832	* Synchronously push the last inode with dependencies to ensure
	4833	* that the inode block gets written to free up the inodedeps.
	4834	*/
	4835	for (ino = firstino; ino <= lastino; ino++) {
	4836	if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
	4837	continue;
	4838	FREE_LOCK(&lk);
	4839	if ((error = VFS_VGET(info.mp, NULL, ino, &vp)) != 0) {
	4840	softdep_error("clear_inodedeps: vget", error);
	4841	return;
	4842	}
	4843	if (ino == lastino) {
	4844	if ((error = VOP_FSYNC(vp, MNT_WAIT, 0)))
	4845	softdep_error("clear_inodedeps: fsync1", error);
	4846	} else {
	4847	if ((error = VOP_FSYNC(vp, MNT_NOWAIT, 0)))
	4848	softdep_error("clear_inodedeps: fsync2", error);
	4849	drain_output(vp, 0);
	4850	}
	4851	vput(vp);
	4852	ACQUIRE_LOCK(&lk);
	4853	}
	4854	FREE_LOCK(&lk);
	4855	}
	4856
	4857	/*
	4858	* Function to determine if the buffer has outstanding dependencies
	4859	* that will cause a roll-back if the buffer is written. If wantcount
	4860	* is set, return number of dependencies, otherwise just yes or no.
	4861	*
	4862	* bioops callback - hold io_token
	4863	*/
	4864	static int
	4865	softdep_count_dependencies(struct buf *bp, int wantcount)
	4866	{
	4867	struct worklist *wk;
	4868	struct inodedep *inodedep;
	4869	struct indirdep *indirdep;
	4870	struct allocindir *aip;
	4871	struct pagedep *pagedep;
	4872	struct diradd *dap;
	4873	int i, retval;
	4874
	4875	retval = 0;
	4876	ACQUIRE_LOCK(&lk);
	4877
	4878	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
	4879	switch (wk->wk_type) {
	4880
	4881	case D_INODEDEP:
	4882	inodedep = WK_INODEDEP(wk);
	4883	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
	4884	/* bitmap allocation dependency */
	4885	retval += 1;
	4886	if (!wantcount)
	4887	goto out;
	4888	}
	4889	if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
	4890	/* direct block pointer dependency */
	4891	retval += 1;
	4892	if (!wantcount)
	4893	goto out;
	4894	}
	4895	continue;
	4896
	4897	case D_INDIRDEP:
	4898	indirdep = WK_INDIRDEP(wk);
	4899
	4900	LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
	4901	/* indirect block pointer dependency */
	4902	retval += 1;
	4903	if (!wantcount)
	4904	goto out;
	4905	}
	4906	continue;
	4907
	4908	case D_PAGEDEP:
	4909	pagedep = WK_PAGEDEP(wk);
	4910	for (i = 0; i < DAHASHSZ; i++) {
	4911
	4912	LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
	4913	/* directory entry dependency */
	4914	retval += 1;
	4915	if (!wantcount)
	4916	goto out;
	4917	}
	4918	}
	4919	continue;
	4920
	4921	case D_BMSAFEMAP:
	4922	case D_ALLOCDIRECT:
	4923	case D_ALLOCINDIR:
	4924	case D_MKDIR:
	4925	/* never a dependency on these blocks */
	4926	continue;
	4927
	4928	default:
	4929	panic("softdep_check_for_rollback: Unexpected type %s",
	4930	TYPENAME(wk->wk_type));
	4931	/* NOTREACHED */
	4932	}
	4933	}
	4934	out:
	4935	FREE_LOCK(&lk);
	4936
	4937	return retval;
	4938	}
	4939
	4940	/*
	4941	* Acquire exclusive access to a buffer. Requires softdep lock
	4942	* to be held on entry. If waitfor is MNT_WAIT, may release/reacquire
	4943	* softdep lock.
	4944	*
	4945	* Returns 1 if the buffer was locked, 0 if it was not locked or
	4946	* if we had to block.
	4947	*
	4948	* NOTE! In order to return 1 we must acquire the buffer lock prior
	4949	* to any release of &lk. Once we release &lk it's all over.
	4950	* We may still have to block on the (type-stable) bp in that
	4951	* case, but we must then unlock it and return 0.
	4952	*/
	4953	static int
	4954	getdirtybuf(struct buf **bpp, int waitfor)
	4955	{
	4956	struct buf *bp;
	4957	int error;
	4958
	4959	/*
	4960	* If the contents of *bpp is NULL the caller presumably lost a race.
	4961	*/
	4962	bp = *bpp;
	4963	if (bp == NULL)
	4964	return (0);
	4965
	4966	/*
	4967	* Try to obtain the buffer lock without deadlocking on &lk.
	4968	*/
	4969	KKASSERT(lock_held(&lk) > 0);
	4970	error = BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT);
	4971	if (error == 0) {
	4972	/*
	4973	* If the buffer is no longer dirty the OS already wrote it
	4974	* out, return failure.
	4975	*/
	4976	if ((bp->b_flags & B_DELWRI) == 0) {
	4977	BUF_UNLOCK(bp);
	4978	return (0);
	4979	}
	4980
	4981	/*
	4982	* Finish nominal buffer locking sequence return success.
	4983	*/
	4984	bremfree(bp);
	4985	return (1);
	4986	}
	4987
	4988	/*
	4989	* Failure case.
	4990	*
	4991	* If we are not being asked to wait, return 0 immediately.
	4992	*/
	4993	if (waitfor != MNT_WAIT)
	4994	return (0);
	4995
	4996	/*
	4997	* Once we release the softdep lock we can never return success,
	4998	* but we still have to block on the type-stable buf for the caller
	4999	* to be able to retry without livelocking the system.
	5000	*
	5001	* The caller will normally retry in this case.
	5002	*/
	5003	FREE_LOCK(&lk);
	5004	error = BUF_LOCK(bp, LK_EXCLUSIVE \| LK_SLEEPFAIL);
	5005	ACQUIRE_LOCK(&lk);
	5006	if (error == 0)
	5007	BUF_UNLOCK(bp);
	5008	return (0);
	5009	}
	5010
	5011	/*
	5012	* Wait for pending output on a vnode to complete.
	5013	* Must be called with vnode locked.
	5014	*/
	5015	static void
	5016	drain_output(struct vnode *vp, int islocked)
	5017	{
	5018
	5019	if (!islocked)
	5020	ACQUIRE_LOCK(&lk);
	5021	while (bio_track_active(&vp->v_track_write)) {
	5022	FREE_LOCK(&lk);
	5023	bio_track_wait(&vp->v_track_write, 0, 0);
	5024	ACQUIRE_LOCK(&lk);
	5025	}
	5026	if (!islocked)
	5027	FREE_LOCK(&lk);
	5028	}
	5029
	5030	/*
	5031	* Called whenever a buffer that is being invalidated or reallocated
	5032	* contains dependencies. This should only happen if an I/O error has
	5033	* occurred. The routine is called with the buffer locked.
	5034	*
	5035	* bioops callback - hold io_token
	5036	*/
	5037	static void
	5038	softdep_deallocate_dependencies(struct buf *bp)
	5039	{
	5040	/* nothing to do, mp lock not needed */
	5041	if ((bp->b_flags & B_ERROR) == 0)
	5042	panic("softdep_deallocate_dependencies: dangling deps");
	5043	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntfromname, bp->b_error);
	5044	panic("softdep_deallocate_dependencies: unrecovered I/O error");
	5045	}
	5046
	/*
	 * Function to handle asynchronous write errors in the filesystem.
	 *
	 * For now the error is only reported through kprintf(): func is printed
	 * as the message prefix and error as a decimal number.
	 */
	void
	softdep_error(char *func, int error)
	{
		/* XXX should do something better! */
		kprintf("%s: got error %d while accessing filesystem\n",
		    func, error);
	}