gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
	3	*
	4	* The soft updates code is derived from the appendix of a University
	5	* of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
	6	* "Soft Updates: A Solution to the Metadata Update Problem in File
	7	* Systems", CSE-TR-254-95, August 1995).
	8	*
	9	* Further information about soft updates can be obtained from:
	10	*
	11	* Marshall Kirk McKusick http://www.mckusick.com/softdep/
	12	* 1614 Oxford Street mckusick@mckusick.com
	13	* Berkeley, CA 94709-1608 +1-510-843-9542
	14	* USA
	15	*
	16	* Redistribution and use in source and binary forms, with or without
	17	* modification, are permitted provided that the following conditions
	18	* are met:
	19	*
	20	* 1. Redistributions of source code must retain the above copyright
	21	* notice, this list of conditions and the following disclaimer.
	22	* 2. Redistributions in binary form must reproduce the above copyright
	23	* notice, this list of conditions and the following disclaimer in the
	24	* documentation and/or other materials provided with the distribution.
	25	*
	26	* THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
	27	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	28	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	29	* DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
	30	* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	31	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	32	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	33	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	34	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	35	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	36	* SUCH DAMAGE.
	37	*
	38	* from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
	39	* $FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.57.2.11 2002/02/05 18:46:53 dillon Exp $
	40	* $DragonFly: src/sys/vfs/ufs/ffs_softdep.c,v 1.57 2008/06/28 17:59:51 dillon Exp $
	41	*/
	42
	43	/*
	44	* For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
	45	*/
	46	#ifndef DIAGNOSTIC
	47	#define DIAGNOSTIC
	48	#endif
	49	#ifndef DEBUG
	50	#define DEBUG
	51	#endif
	52
	53	#include <sys/param.h>
	54	#include <sys/kernel.h>
	55	#include <sys/systm.h>
	56	#include <sys/buf.h>
	57	#include <sys/malloc.h>
	58	#include <sys/mount.h>
	59	#include <sys/proc.h>
	60	#include <sys/syslog.h>
	61	#include <sys/vnode.h>
	62	#include <sys/conf.h>
	63	#include <sys/buf2.h>
	64	#include <machine/inttypes.h>
	65	#include "dir.h"
	66	#include "quota.h"
	67	#include "inode.h"
	68	#include "ufsmount.h"
	69	#include "fs.h"
	70	#include "softdep.h"
	71	#include "ffs_extern.h"
	72	#include "ufs_extern.h"
	73
	74	#include <sys/thread2.h>
	75
	76	/*
	77	* These definitions need to be adapted to the system to which
	78	* this file is being ported.
	79	*/
	80	/*
	81	* malloc types defined for the softdep system.
	82	*/
	83	MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
	84	MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
	85	MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
	86	MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
	87	MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
	88	MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
	89	MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
	90	MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
	91	MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
	92	MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
	93	MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
	94	MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
	95	MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
	96
	97	#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)	/* flags passed to MALLOC() for dependency structures (see pagedep_lookup) */
	98
	99	#define D_PAGEDEP 0
	100	#define D_INODEDEP 1
	101	#define D_NEWBLK 2
	102	#define D_BMSAFEMAP 3
	103	#define D_ALLOCDIRECT 4
	104	#define D_INDIRDEP 5
	105	#define D_ALLOCINDIR 6
	106	#define D_FREEFRAG 7
	107	#define D_FREEBLKS 8
	108	#define D_FREEFILE 9
	109	#define D_DIRADD 10
	110	#define D_MKDIR 11
	111	#define D_DIRREM 12
	112	#define D_LAST D_DIRREM
	113
	114	/*
	115	* translate from workitem type to memory type
	116	* MUST match the defines above, such that memtype[D_XXX] == M_XXX
	117	*/
	118	static struct malloc_type *memtype[] = {
	119	M_PAGEDEP,
	120	M_INODEDEP,
	121	M_NEWBLK,
	122	M_BMSAFEMAP,
	123	M_ALLOCDIRECT,
	124	M_INDIRDEP,
	125	M_ALLOCINDIR,
	126	M_FREEFRAG,
	127	M_FREEBLKS,
	128	M_FREEFILE,
	129	M_DIRADD,
	130	M_MKDIR,
	131	M_DIRREM
	132	};
	133
	134	#define DtoM(type) (memtype[type])
	135
	136	/*
	137	* Short description of a work item type; "???" if the type is out of range.
	138	*/
	139	#define TYPENAME(type) \
	140	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
	141	/*
	142	* End system adaptation definitions.
	143	*/
	144
	145	/*
	146	* Internal function prototypes.
	147	*/
	148	static void softdep_error(char *, int);
	149	static void drain_output(struct vnode *, int);
	150	static int getdirtybuf(struct buf **, int);
	151	static void clear_remove(struct thread *);
	152	static void clear_inodedeps(struct thread *);
	153	static int flush_pagedep_deps(struct vnode *, struct mount *,
	154	struct diraddhd *);
	155	static int flush_inodedep_deps(struct fs *, ino_t);
	156	static int handle_written_filepage(struct pagedep *, struct buf *);
	157	static void diradd_inode_written(struct diradd *, struct inodedep *);
	158	static int handle_written_inodeblock(struct inodedep *, struct buf *);
	159	static void handle_allocdirect_partdone(struct allocdirect *);
	160	static void handle_allocindir_partdone(struct allocindir *);
	161	static void initiate_write_filepage(struct pagedep *, struct buf *);
	162	static void handle_written_mkdir(struct mkdir *, int);
	163	static void initiate_write_inodeblock(struct inodedep *, struct buf *);
	164	static void handle_workitem_freefile(struct freefile *);
	165	static void handle_workitem_remove(struct dirrem *);
	166	static struct dirrem *newdirrem(struct buf *, struct inode *,
	167	struct inode *, int, struct dirrem **);
	168	static void free_diradd(struct diradd *);
	169	static void free_allocindir(struct allocindir *, struct inodedep *);
	170	static int indir_trunc (struct inode *, off_t, int, ufs_lbn_t, long *);
	171	static void deallocate_dependencies(struct buf *, struct inodedep *);
	172	static void free_allocdirect(struct allocdirectlst *,
	173	struct allocdirect *, int);
	174	static int check_inode_unwritten(struct inodedep *);
	175	static int free_inodedep(struct inodedep *);
	176	static void handle_workitem_freeblocks(struct freeblks *);
	177	static void merge_inode_lists(struct inodedep *);
	178	static void setup_allocindir_phase2(struct buf *, struct inode *,
	179	struct allocindir *);
	180	static struct allocindir *newallocindir(struct inode *, int, ufs_daddr_t,
	181	ufs_daddr_t);
	182	static void handle_workitem_freefrag(struct freefrag *);
	183	static struct freefrag *newfreefrag(struct inode *, ufs_daddr_t, long);
	184	static void allocdirect_merge(struct allocdirectlst *,
	185	struct allocdirect *, struct allocdirect *);
	186	static struct bmsafemap *bmsafemap_lookup(struct buf *);
	187	static int newblk_lookup(struct fs *, ufs_daddr_t, int,
	188	struct newblk **);
	189	static int inodedep_lookup(struct fs *, ino_t, int, struct inodedep **);
	190	static int pagedep_lookup(struct inode *, ufs_lbn_t, int,
	191	struct pagedep **);
	192	static void pause_timer(void *);
	193	static int request_cleanup(int, int);
	194	static int process_worklist_item(struct mount *, int);
	195	static void add_to_worklist(struct worklist *);
	196
	197	/*
	198	* Exported softdep operations.
	199	*/
	200	static void softdep_disk_io_initiation(struct buf *);
	201	static void softdep_disk_write_complete(struct buf *);
	202	static void softdep_deallocate_dependencies(struct buf *);
	203	static int softdep_fsync(struct vnode *);
	204	static int softdep_process_worklist(struct mount *);
	205	static void softdep_move_dependencies(struct buf *, struct buf *);
	206	static int softdep_count_dependencies(struct buf *bp, int);
	207	static int softdep_checkread(struct buf *bp);
	208	static int softdep_checkwrite(struct buf *bp);
	209
	210	static struct bio_ops softdep_bioops = {	/* set as b_ops on buffers that get work items attached */
	211	.io_start = softdep_disk_io_initiation,
	212	.io_complete = softdep_disk_write_complete,
	213	.io_deallocate = softdep_deallocate_dependencies,
	214	.io_fsync = softdep_fsync,
	215	.io_sync = softdep_process_worklist,
	216	.io_movedeps = softdep_move_dependencies,
	217	.io_countdeps = softdep_count_dependencies,
	218	.io_checkread = softdep_checkread,
	219	.io_checkwrite = softdep_checkwrite
	220	};
	221
	222	/*
	223	* Locking primitives.
	224	*
	225	* For a uniprocessor, all we need to do is protect against disk
	226	* interrupts. For a multiprocessor, this lock would have to be
	227	* a mutex. A single mutex is used throughout this file, though
	228	* finer grain locking could be used if contention warranted it.
	229	*
	230	* For a multiprocessor, the sleep call would accept a lock and
	231	* release it after the sleep processing was complete. In a uniprocessor
	232	* implementation there is no such interlock, so we simply mark
	233	* the places where it needs to be done with the `interlocked' form
	234	* of the lock calls. Since the uniprocessor sleep already interlocks
	235	* the spl, there is nothing that really needs to be done.
	236	*/
	237	#ifndef /* NOT */ DEBUG
	238	static struct lockit {
	239	} lk = { 0 };
	240	#define ACQUIRE_LOCK(lk) crit_enter_id("softupdates")	/* no trailing ';' here: the call site supplies it */
	241	#define FREE_LOCK(lk) crit_exit_id("softupdates")
	242
	243	#else /* DEBUG */
	244	#define NOHOLDER ((struct thread *)-1)
	245	#define SPECIAL_FLAG ((struct thread *)-2)
	246	static struct lockit {
	247	int lkt_spl;
	248	struct thread *lkt_held;
	249	} lk = { 0, NOHOLDER };
	250	static int lockcnt;
	251
	252	static void acquire_lock(struct lockit *);
	253	static void free_lock(struct lockit *);
	254	void softdep_panic(char *);
	255
	256	#define ACQUIRE_LOCK(lk) acquire_lock(lk)
	257	#define FREE_LOCK(lk) free_lock(lk)
	258
	259	static void
	260	acquire_lock(struct lockit *lk)
	261	{
	262	thread_t owner = lk->lkt_held;
	263
	264	/* Debug-build lock: it must be free on entry; otherwise drop it and panic naming the owner. */
	265	if (owner != NOHOLDER) {
	266	FREE_LOCK(lk);
	267	if (owner != curthread)
	268	panic("softdep_lock: lock held by %p", owner);
	269	else
	270	panic("softdep_lock: locking against myself");
	271	}
	272	crit_enter_id("softupdates");
	273	lk->lkt_held = curthread;
	274	lockcnt += 1;
	275	}
	276
	277	static void
	278	free_lock(struct lockit *lk)
	279	{
	280	/* Debug-build unlock: panic if lk is not held, else clear the holder and leave the critical section. */
	281	if (lk->lkt_held == NOHOLDER)
	282	panic("softdep_unlock: lock not held");
	283	lk->lkt_held = NOHOLDER;
	284	crit_exit_id("softupdates");
	285	}
	286
	287	/*
	288	* Release the soft updates lock if it is held, then panic with `msg'.
	289	*/
	290	void
	291	softdep_panic(char *msg)
	292	{
	293	/* msg goes through "%s" so a '%' inside it is never read as a conversion. */
	294	if (lk.lkt_held != NOHOLDER)
	295	FREE_LOCK(&lk);
	296	panic("%s", msg);
	297	}
	298	#endif /* DEBUG */
	299
	300	static int interlocked_sleep(struct lockit *, int, void *, int,
	301	const char *, int);
	302
	303	/*
	304	* When going to sleep, we must save our SPL so that it does
	305	* not get lost if some other process uses the lock while we
	306	* are sleeping. We restore it after we have slept. This routine
	307	* wraps the interlocking with functions that sleep. The list
	308	* below enumerates the available set of operations.
	309	*/
	310	#define UNKNOWN 0
	311	#define SLEEP 1
	312	#define LOCKBUF 2
	313
	314	static int
	315	interlocked_sleep(struct lockit *lk, int op, void *ident, int flags,
	316	const char *wmesg, int timo)
	317	{
	318	thread_t holder;
	319	int s, retval;
	320	/* Give up recorded ownership of lk, block in tsleep() or BUF_LOCK() as selected by op, then reclaim it. */
	321	s = lk->lkt_spl;
	322	# ifdef DEBUG
	323	if (lk->lkt_held == NOHOLDER)
	324	panic("interlocked_sleep: lock not held");
	325	lk->lkt_held = NOHOLDER;
	326	# endif /* DEBUG */
	327	switch (op) {
	328	case SLEEP:
	329	retval = tsleep(ident, flags, wmesg, timo);
	330	break;
	331	case LOCKBUF:
	332	retval = BUF_LOCK((struct buf *)ident, flags);
	333	break;
	334	default:
	335	panic("interlocked_sleep: unknown operation");
	336	}
	337	# ifdef DEBUG
	338	if (lk->lkt_held != NOHOLDER) {
	339	holder = lk->lkt_held;
	340	FREE_LOCK(lk);
	341	if (holder == curthread)
	342	panic("interlocked_sleep: locking against self");
	343	else
	344	panic("interlocked_sleep: lock held by %p", holder);
	345	}
	346	lk->lkt_held = curthread;
	347	lockcnt++;
	348	# endif /* DEBUG */
	349	lk->lkt_spl = s;
	350	return (retval);
	351	}
	352
	353	/*
	354	* Place holder for real semaphores.
	355	*/
	356	struct sema {
	357	int value;
	358	thread_t holder;
	359	char *name;
	360	int prio;
	361	int timo;
	362	};
	363	static void sema_init(struct sema *, char *, int, int);
	364	static int sema_get(struct sema *, struct lockit *);
	365	static void sema_release(struct sema *);
	366
	367	static void
	368	sema_init(struct sema *semap, char *name, int prio, int timo)
	369	{
	370	/* Start out free (no holder, zero count) and remember the sleep parameters used by sema_get(). */
	371	semap->holder = NOHOLDER;
	372	semap->value = 0;
	373	semap->name = name;
	374	semap->prio = prio;
	375	semap->timo = timo;
	376	}
	377
	378	static int
	379	sema_get(struct sema *semap, struct lockit *interlock)
	380	{
	381	/* Returns 1 if acquired, 0 after sleeping (caller must retry); a non-NULL interlock is released either way. */
	382	if (semap->value++ > 0) {
	383	if (interlock != NULL) {
	384	interlocked_sleep(interlock, SLEEP, (caddr_t)semap,
	385	semap->prio, semap->name, semap->timo);
	386	FREE_LOCK(interlock);
	387	} else {
	388	tsleep((caddr_t)semap, semap->prio, semap->name,
	389	semap->timo);
	390	}
	391	return (0);
	392	}
	393	semap->holder = curthread;
	394	if (interlock != NULL)
	395	FREE_LOCK(interlock);
	396	return (1);
	397	}
	398
	399	static void
	400	sema_release(struct sema *semap)
	401	{
	402	/* Only the holding thread may release; sleepers are woken if anyone called sema_get() meanwhile. */
	403	if (semap->value <= 0 || semap->holder != curthread) {
	404	if (lk.lkt_held != NOHOLDER)
	405	FREE_LOCK(&lk);
	406	panic("sema_release: not held");
	407	}
	408	if (--semap->value > 0) {
	409	semap->value = 0;
	410	wakeup(semap);
	411	}
	412	semap->holder = NOHOLDER;
	413	}
	414
	415	/*
	416	* Worklist queue management.
	417	* These routines require that the lock be held.
	418	*/
	419	#ifndef /* NOT */ DEBUG
	420	#define WORKLIST_INSERT(head, item) do { \
	421	(item)->wk_state |= ONWORKLIST; \
	422	LIST_INSERT_HEAD(head, item, wk_list); \
	423	} while (0)
	424	/* Same as WORKLIST_INSERT on the buffer's b_dep list; also points the buffer's b_ops at softdep_bioops. */
	425	#define WORKLIST_INSERT_BP(bp, item) do { \
	426	(item)->wk_state |= ONWORKLIST; \
	427	(bp)->b_ops = &softdep_bioops; \
	428	LIST_INSERT_HEAD(&(bp)->b_dep, item, wk_list); \
	429	} while (0)
	430	/* Clear ONWORKLIST and unlink the item from the list it is on. */
	431	#define WORKLIST_REMOVE(item) do { \
	432	(item)->wk_state &= ~ONWORKLIST; \
	433	LIST_REMOVE(item, wk_list); \
	434	} while (0)
	435	/* Free a work item using the malloc type that corresponds to its D_* type. */
	436	#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))
	437
	438	#else /* DEBUG */
	439	static void worklist_insert(struct workhead *, struct worklist *);
	440	static void worklist_remove(struct worklist *);
	441	static void workitem_free(struct worklist *, int);
	442
	443	#define WORKLIST_INSERT_BP(bp, item) do { \
	444	(bp)->b_ops = &softdep_bioops; \
	445	worklist_insert(&(bp)->b_dep, item); \
	446	} while (0)
	447
	448	#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
	449	#define WORKLIST_REMOVE(item) worklist_remove(item)
	450	#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
	451
	452	static void
	453	worklist_insert(struct workhead *head, struct worklist *item)
	454	{
	455	/* Debug-build insert: panics unless lk is held and the item is not already on a list. */
	456	if (lk.lkt_held == NOHOLDER)
	457	panic("worklist_insert: lock not held");
	458	if (item->wk_state & ONWORKLIST) {
	459	FREE_LOCK(&lk);
	460	panic("worklist_insert: already on list");
	461	}
	462	item->wk_state |= ONWORKLIST;
	463	LIST_INSERT_HEAD(head, item, wk_list);
	464	}
	465
	466	static void
	467	worklist_remove(struct worklist *item)
	468	{
	469	/* Debug-build remove: panics unless lk is held and the item is currently on a list. */
	470	if (lk.lkt_held == NOHOLDER)
	471	panic("worklist_remove: lock not held");
	472	if (!(item->wk_state & ONWORKLIST)) {
	473	FREE_LOCK(&lk);
	474	panic("worklist_remove: not on list");
	475	}
	476	LIST_REMOVE(item, wk_list);
	477	item->wk_state &= ~ONWORKLIST;
	478	}
	479
	480	static void
	481	workitem_free(struct worklist *item, int type)
	482	{
	483	/* Debug-build free: refuses items still on a list or whose wk_type differs from `type'. */
	484	if ((item->wk_state & ONWORKLIST) != 0) {
	485	if (lk.lkt_held != NOHOLDER)
	486	FREE_LOCK(&lk);
	487	panic("workitem_free: still on list");
	488	}
	489	if (type != item->wk_type) {
	490	if (lk.lkt_held != NOHOLDER)
	491	FREE_LOCK(&lk);
	492	panic("workitem_free: type mismatch");
	493	}
	494	FREE(item, DtoM(type));
	495	}
	496	#endif /* DEBUG */
	497
	498	/*
	499	* Workitem queue management
	500	*/
	501	static struct workhead softdep_workitem_pending;
	502	static int num_on_worklist; /* number of worklist items to be processed */
	503	static int softdep_worklist_busy; /* -1 => softdep_flushfiles running, > 0 => worklist being processed */
	504	static int softdep_worklist_req; /* serialized waiters */
	505	static int max_softdeps; /* maximum number of structs before slowdown */
	506	static int tickdelay = 2; /* number of ticks to pause during slowdown */
	507	static int *stat_countp; /* statistic to count in proc_waiting timeout */
	508	static int proc_waiting; /* tracks whether we have a timeout posted */
	509	static struct callout handle; /* handle on posted proc_waiting timeout */
	510	static struct thread *filesys_syncer; /* proc of filesystem syncer process */
	511	static int req_clear_inodedeps; /* syncer process flush some inodedeps */
	512	#define FLUSH_INODES 1
	513	static int req_clear_remove; /* syncer process flush some freeblks */
	514	#define FLUSH_REMOVE 2
	515	/*
	516	* runtime statistics
	517	*/
	518	static int stat_worklist_push; /* number of worklist cleanups */
	519	static int stat_blk_limit_push; /* number of times block limit neared */
	520	static int stat_ino_limit_push; /* number of times inode limit neared */
	521	static int stat_blk_limit_hit; /* number of times block slowdown imposed */
	522	static int stat_ino_limit_hit; /* number of times inode slowdown imposed */
	523	static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
	524	static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
	525	static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
	526	static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
	527	static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
	528	#ifdef DEBUG
	529	#include <vm/vm.h>
	530	#include <sys/sysctl.h>
	531	SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
	532	SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
	533	SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
	534	SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
	535	SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
	536	SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
	537	SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
	538	SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
	539	SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
	540	SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
	541	SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
	542	SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
	543	#endif /* DEBUG */
	544
	545	/*
	546	* Add an item to the end of the work queue.
	547	* This routine requires that the lock be held.
	548	* This is the only routine that adds items to the list.
	549	* The following routine is the only one that removes items
	550	* and does so in order from first to last.
	551	*/
	552	static void
	553	add_to_worklist(struct worklist *wk)
	554	{
	555	static struct worklist *worklist_tail;
	556	/* worklist_tail is consulted only when the pending list is non-empty. */
	557	if (wk->wk_state & ONWORKLIST) {
	558	if (lk.lkt_held != NOHOLDER)
	559	FREE_LOCK(&lk);
	560	panic("add_to_worklist: already on list");
	561	}
	562	wk->wk_state |= ONWORKLIST;
	563	if (LIST_FIRST(&softdep_workitem_pending) == NULL)
	564	LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
	565	else
	566	LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
	567	worklist_tail = wk;
	568	num_on_worklist += 1;
	569	}
	570
	571	/*
	572	* Process that runs once per second to handle items in the background queue.
	573	* Returns the count of processed items matching matchmnt, or -1 if matchmnt is NULL and the run was refused or cut short.
	574	* Note that we ensure that everything is done in the order in which they
	575	* appear in the queue. The code below depends on this property to ensure
	576	* that blocks of a file are freed before the inode itself is freed. This
	577	* ordering ensures that no new <vfsid, inum, lbn> triples will be generated
	578	* until all the old ones have been purged from the dependency lists.
	579	*/
	580	static int
	581	softdep_process_worklist(struct mount *matchmnt)
	582	{
	583	thread_t td = curthread;
	584	int matchcnt, loopcount;
	585	long starttime;
	586
	587	/*
	588	* Record the process identifier of our caller so that we can give
	589	* this process preferential treatment in request_cleanup below.
	590	*/
	591	filesys_syncer = td;
	592	matchcnt = 0;
	593
	594	/*
	595	* There is no danger of having multiple processes run this
	596	* code, but we have to single-thread it when softdep_flushfiles()
	597	* is in operation to get an accurate count of the number of items
	598	* related to its mount point that are in the list.
	599	*/
	600	if (matchmnt == NULL) {
	601	if (softdep_worklist_busy < 0)	/* softdep_flushfiles() owns the list (busy == -1) */
	602	return(-1);
	603	softdep_worklist_busy += 1;
	604	}
	605
	606	/*
	607	* If requested, try removing inode or removal dependencies.
	608	*/
	609	if (req_clear_inodedeps) {
	610	clear_inodedeps(td);
	611	req_clear_inodedeps -= 1;
	612	wakeup_one(&proc_waiting);
	613	}
	614	if (req_clear_remove) {
	615	clear_remove(td);
	616	req_clear_remove -= 1;
	617	wakeup_one(&proc_waiting);
	618	}
	619	loopcount = 1;
	620	starttime = time_second;
	621	while (num_on_worklist > 0) {
	622	matchcnt += process_worklist_item(matchmnt, 0);
	623
	624	/*
	625	* If a umount operation wants to run the worklist
	626	* accurately, abort.
	627	*/
	628	if (softdep_worklist_req && matchmnt == NULL) {
	629	matchcnt = -1;
	630	break;
	631	}
	632
	633	/*
	634	* If requested, try removing inode or removal dependencies.
	635	*/
	636	if (req_clear_inodedeps) {
	637	clear_inodedeps(td);
	638	req_clear_inodedeps -= 1;
	639	wakeup_one(&proc_waiting);
	640	}
	641	if (req_clear_remove) {
	642	clear_remove(td);
	643	req_clear_remove -= 1;
	644	wakeup_one(&proc_waiting);
	645	}
	646	/*
	647	* We do not generally want to stop for buffer space, but if
	648	* we are really being a buffer hog, we will stop and wait.
	649	*/
	650	if (loopcount++ % 128 == 0)
	651	bwillinode(1);
	652	/*
	653	* Never allow processing to run for more than one
	654	* second. Otherwise the other syncer tasks may get
	655	* excessively backlogged.
	656	*/
	657	if (starttime != time_second && matchmnt == NULL) {
	658	matchcnt = -1;
	659	break;
	660	}
	661	}
	662	if (matchmnt == NULL) {
	663	--softdep_worklist_busy;
	664	if (softdep_worklist_req && softdep_worklist_busy == 0)	/* last runner out wakes a waiting softdep_flushfiles() */
	665	wakeup(&softdep_worklist_req);
	666	}
	667	return (matchcnt);
	668	}
	669
	670	/*
	671	* Process one item on the worklist. Returns 1 if the item belonged to matchmnt, else 0.
	672	*/
	673	static int
	674	process_worklist_item(struct mount *matchmnt, int flags)
	675	{
	676	struct worklist *wk;
	677	struct dirrem *dirrem;
	678	struct fs *matchfs;
	679	struct vnode *vp;
	680	int matchcnt = 0;
	681
	682	matchfs = NULL;
	683	if (matchmnt != NULL)
	684	matchfs = VFSTOUFS(matchmnt)->um_fs;
	685	ACQUIRE_LOCK(&lk);
	686	/*
	687	* Normally we just process each item on the worklist in order.
	688	* However, if we are in a situation where we cannot lock any
	689	* inodes, we have to skip over any dirrem requests whose
	690	* vnodes are resident and locked.
	691	*/
	692	LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
	693	if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
	694	break;
	695	dirrem = WK_DIRREM(wk);
	696	vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev,
	697	dirrem->dm_oldinum);
	698	if (vp == NULL || !vn_islocked(vp))
	699	break;
	700	}
	701	if (wk == NULL) {
	702	FREE_LOCK(&lk);
	703	return (0);
	704	}
	705	WORKLIST_REMOVE(wk);
	706	num_on_worklist -= 1;
	707	FREE_LOCK(&lk);
	708	switch (wk->wk_type) {
	709
	710	case D_DIRREM:
	711	/* removal of a directory entry */
	712	if (WK_DIRREM(wk)->dm_mnt == matchmnt)
	713	matchcnt += 1;
	714	handle_workitem_remove(WK_DIRREM(wk));
	715	break;
	716
	717	case D_FREEBLKS:
	718	/* releasing blocks and/or fragments from a file */
	719	if (WK_FREEBLKS(wk)->fb_fs == matchfs)
	720	matchcnt += 1;
	721	handle_workitem_freeblocks(WK_FREEBLKS(wk));
	722	break;
	723
	724	case D_FREEFRAG:
	725	/* releasing a fragment when replaced as a file grows */
	726	if (WK_FREEFRAG(wk)->ff_fs == matchfs)
	727	matchcnt += 1;
	728	handle_workitem_freefrag(WK_FREEFRAG(wk));
	729	break;
	730
	731	case D_FREEFILE:
	732	/* releasing an inode when its link count drops to 0 */
	733	if (WK_FREEFILE(wk)->fx_fs == matchfs)
	734	matchcnt += 1;
	735	handle_workitem_freefile(WK_FREEFILE(wk));
	736	break;
	737
	738	default:
	739	panic("%s_process_worklist: Unknown type %s",
	740	"softdep", TYPENAME(wk->wk_type));
	741	/* NOTREACHED */
	742	}
	743	return (matchcnt);
	744	}
	745
	746	/*
	747	* Move dependencies from one buffer to another.
	748	*/
	749	static void
	750	softdep_move_dependencies(struct buf *oldbp, struct buf *newbp)
	751	{
	752	struct worklist *wk, *wktail;
	753	/* Each item is linked in behind wktail, so newbp ends up with oldbp's ordering. */
	754	if (LIST_FIRST(&newbp->b_dep) != NULL)
	755	panic("softdep_move_dependencies: need merge code");
	756	wktail = NULL;
	757	ACQUIRE_LOCK(&lk);
	758	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
	759	LIST_REMOVE(wk, wk_list);
	760	if (wktail == NULL)
	761	LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
	762	else
	763	LIST_INSERT_AFTER(wktail, wk, wk_list);
	764	wktail = wk;
	765	newbp->b_ops = &softdep_bioops;
	766	}
	767	FREE_LOCK(&lk);
	768	}
	769
	770	/*
	771	* Purge the work list of all items associated with a particular mount point. Returns 0, the first error from the flush/fsync calls, or EBUSY if the retry budget runs out.
	772	*/
	773	int
	774	softdep_flushfiles(struct mount *oldmnt, int flags)
	775	{
	776	struct vnode *devvp;
	777	int error, loopcnt;
	778
	779	/*
	780	* Await our turn to clear out the queue, then serialize access.
	781	*/
	782	while (softdep_worklist_busy != 0) {
	783	softdep_worklist_req += 1;
	784	tsleep(&softdep_worklist_req, 0, "softflush", 0);
	785	softdep_worklist_req -= 1;
	786	}
	787	softdep_worklist_busy = -1;
	788
	789	if ((error = ffs_flushfiles(oldmnt, flags)) != 0) {
	790	softdep_worklist_busy = 0;
	791	if (softdep_worklist_req)
	792	wakeup(&softdep_worklist_req);
	793	return (error);
	794	}
	795	/*
	796	* Alternately flush the block device associated with the mount
	797	* point and process any dependencies that the flushing
	798	* creates. In theory, this loop can happen at most twice,
	799	* but we give it a few extra just to be sure.
	800	*/
	801	devvp = VFSTOUFS(oldmnt)->um_devvp;
	802	for (loopcnt = 10; loopcnt > 0; ) {
	803	if (softdep_process_worklist(oldmnt) == 0) {
	804	loopcnt--;
	805	/*
	806	* Do another flush in case any vnodes were brought in
	807	* as part of the cleanup operations.
	808	*/
	809	if ((error = ffs_flushfiles(oldmnt, flags)) != 0)
	810	break;
	811	/*
	812	* If we still found nothing to do, we are really done.
	813	*/
	814	if (softdep_process_worklist(oldmnt) == 0)
	815	break;
	816	}
	817	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
	818	error = VOP_FSYNC(devvp, MNT_WAIT, 0);
	819	vn_unlock(devvp);
	820	if (error)
	821	break;
	822	}
	823	softdep_worklist_busy = 0;
	824	if (softdep_worklist_req)
	825	wakeup(&softdep_worklist_req);
	826
	827	/*
	828	* If we are unmounting then it is an error to fail. If we
	829	* are simply trying to downgrade to read-only, then filesystem
	830	* activity can keep us busy forever, so we just fail with EBUSY.
	831	*/
	832	if (loopcnt == 0) {
	833	if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
	834	panic("softdep_flushfiles: looping");
	835	error = EBUSY;
	836	}
	837	return (error);
	838	}
	839
	840	/*
	841	* Structure hashing.
	842	*
	843	* There are three types of structures that can be looked up:
	844	* 1) pagedep structures identified by mount point, inode number,
	845	* and logical block.
	846	* 2) inodedep structures identified by mount point and inode number.
	847	* 3) newblk structures identified by mount point and
	848	* physical block number.
	849	*
	850	* The "pagedep" and "inodedep" dependency structures are hashed
	851	* separately from the file blocks and inodes to which they correspond.
	852	* This separation helps when the in-memory copy of an inode or
	853	* file block must be replaced. It also obviates the need to access
	854	* an inode or file page when simply updating (or de-allocating)
	855	* dependency structures. Lookup of newblk structures is needed to
	856	* find newly allocated blocks when trying to associate them with
	857	* their allocdirect or allocindir structure.
	858	*
	859	* The lookup routines optionally create and hash a new instance when
	860	* an existing entry is not found.
	861	*/
	862	#define DEPALLOC 0x0001 /* allocate structure if lookup fails */
	863	#define NODELAY 0x0002 /* cannot do background work */
	864
	865	/*
	866	* Structures and routines associated with pagedep caching.
	867	*/
	868	LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
	869	u_long pagedep_hash; /* size of hash table - 1 */
	870	#define PAGEDEP_HASH(mp, inum, lbn) \
	871	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
	872	pagedep_hash])
	873	static struct sema pagedep_in_progress;
	874
	875	/*
	876	* Scan one hash chain for the pagedep keyed by (mp, ino, lbn); NULL if absent.
	877	*/
	878	static __inline
	879	struct pagedep *
	880	pagedep_find(struct pagedep_hashhead *pagedephd, ino_t ino, ufs_lbn_t lbn,
	881	struct mount *mp)
	882	{
	883	struct pagedep *pd;
	884
	885	LIST_FOREACH(pd, pagedephd, pd_hash) {
	886	if (pd->pd_ino != ino || pd->pd_lbn != lbn)
	887	continue;
	888	if (pd->pd_mnt == mp)
	889	return (pd);
	890	}
	891	/* chain exhausted without a match */
	892	return (NULL);
	893	}
	894
	895	/*
	896	* Look up a pagedep. Return 1 if found, 0 if not found.
	897	* If not found, allocate if DEPALLOC flag is passed.
	898	* Found or allocated entry is returned in pagedeppp.
	899	* This routine must be called with splbio interrupts blocked.
	900	*/
	901	static int
	902	pagedep_lookup(struct inode *ip, ufs_lbn_t lbn, int flags,
	903	struct pagedep **pagedeppp)
	904	{
	905	struct pagedep *pagedep;
	906	struct pagedep_hashhead *pagedephd;
	907	struct mount *mp;
	908	int i;
	909
	910	#ifdef DEBUG
	911	if (lk.lkt_held == NOHOLDER)
	912	panic("pagedep_lookup: lock not held");
	913	#endif
	914	mp = ITOV(ip)->v_mount;
	915	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
	916	top:
	917	*pagedeppp = pagedep_find(pagedephd, ip->i_number, lbn, mp);
	918	if (*pagedeppp)
	919	return(1);
	920	if ((flags & DEPALLOC) == 0)
	921	return (0);
	922	if (sema_get(&pagedep_in_progress, &lk) == 0) {
	923	ACQUIRE_LOCK(&lk);
	924	goto top;
	925	}
	926	MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
	927	M_SOFTDEP_FLAGS | M_ZERO);
	928	/* lk was dropped by sema_get(): re-check the chain before inserting the new entry. */
	929	if (pagedep_find(pagedephd, ip->i_number, lbn, mp)) {
	930	kprintf("pagedep_lookup: blocking race avoided\n");
	931	ACQUIRE_LOCK(&lk);
	932	sema_release(&pagedep_in_progress);
	933	kfree(pagedep, M_PAGEDEP);
	934	goto top;
	935	}
	936
	937	pagedep->pd_list.wk_type = D_PAGEDEP;
	938	pagedep->pd_mnt = mp;
	939	pagedep->pd_ino = ip->i_number;
	940	pagedep->pd_lbn = lbn;
	941	LIST_INIT(&pagedep->pd_dirremhd);
	942	LIST_INIT(&pagedep->pd_pendinghd);
	943	for (i = 0; i < DAHASHSZ; i++)
	944	LIST_INIT(&pagedep->pd_diraddhd[i]);
	945	ACQUIRE_LOCK(&lk);
	946	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
	947	sema_release(&pagedep_in_progress);
	948	*pagedeppp = pagedep;
	949	return (0);
	950	}
	951
	952	/*
	953	* Structures and routines associated with inodedep caching.
	954	*/
	955	LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
	956	static u_long inodedep_hash; /* size of hash table - 1 */
	957	static long num_inodedep; /* number of inodedep allocated */
	958	#define INODEDEP_HASH(fs, inum) \
	959	(&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
	960	static struct sema inodedep_in_progress;
	961
	962	/*
	963	* Helper routine for inodedep_lookup()
	964	*/
	965	static __inline
	966	struct inodedep *
	967	inodedep_find(struct inodedep_hashhead inodedephd, struct fs fs, ino_t inum)
	968	{
	969	struct inodedep *inodedep;
	970
	971	LIST_FOREACH(inodedep, inodedephd, id_hash) {
	972	if (inum == inodedep->id_ino && fs == inodedep->id_fs)
	973	return(inodedep);
	974	}
	975	return (NULL);
	976	}
	977
	978	/*
	979	* Look up a inodedep. Return 1 if found, 0 if not found.
	980	* If not found, allocate if DEPALLOC flag is passed.
	981	* Found or allocated entry is returned in inodedeppp.
	982	* This routine must be called with splbio interrupts blocked.
	983	*/
	984	static int
	985	inodedep_lookup(struct fs *fs, ino_t inum, int flags,
	986	struct inodedep **inodedeppp)
	987	{
	988	struct inodedep *inodedep;
	989	struct inodedep_hashhead *inodedephd;
	990	int firsttry;
	991
	992	#ifdef DEBUG
	993	if (lk.lkt_held == NOHOLDER)
	994	panic("inodedep_lookup: lock not held");
	995	#endif
	996	firsttry = 1;
	997	inodedephd = INODEDEP_HASH(fs, inum);
	998	top:
	999	*inodedeppp = inodedep_find(inodedephd, fs, inum);
	1000	if (*inodedeppp)
	1001	return (1);
	1002	if ((flags & DEPALLOC) == 0)
	1003	return (0);
	1004	/*
	1005	* If we are over our limit, try to improve the situation.
	1006	*/
	1007	if (num_inodedep > max_softdeps && firsttry &&
	1008	speedup_syncer() == 0 && (flags & NODELAY) == 0 &&
	1009	request_cleanup(FLUSH_INODES, 1)) {
	1010	firsttry = 0;
	1011	goto top;
	1012	}
	1013	if (sema_get(&inodedep_in_progress, &lk) == 0) {
	1014	ACQUIRE_LOCK(&lk);
	1015	goto top;
	1016	}
	1017	MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
	1018	M_INODEDEP, M_SOFTDEP_FLAGS \| M_ZERO);
	1019	if (inodedep_find(inodedephd, fs, inum)) {
	1020	kprintf("inodedep_lookup: blocking race avoided\n");
	1021	ACQUIRE_LOCK(&lk);
	1022	sema_release(&inodedep_in_progress);
	1023	kfree(inodedep, M_INODEDEP);
	1024	goto top;
	1025	}
	1026	inodedep->id_list.wk_type = D_INODEDEP;
	1027	inodedep->id_fs = fs;
	1028	inodedep->id_ino = inum;
	1029	inodedep->id_state = ALLCOMPLETE;
	1030	inodedep->id_nlinkdelta = 0;
	1031	inodedep->id_savedino = NULL;
	1032	inodedep->id_savedsize = -1;
	1033	inodedep->id_buf = NULL;
	1034	LIST_INIT(&inodedep->id_pendinghd);
	1035	LIST_INIT(&inodedep->id_inowait);
	1036	LIST_INIT(&inodedep->id_bufwait);
	1037	TAILQ_INIT(&inodedep->id_inoupdt);
	1038	TAILQ_INIT(&inodedep->id_newinoupdt);
	1039	ACQUIRE_LOCK(&lk);
	1040	num_inodedep += 1;
	1041	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
	1042	sema_release(&inodedep_in_progress);
	1043	*inodedeppp = inodedep;
	1044	return (0);
	1045	}
	1046
	1047	/*
	1048	* Structures and routines associated with newblk caching.
	1049	*/
	1050	LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
	1051	u_long newblk_hash; /* size of hash table - 1 */
	1052	#define NEWBLK_HASH(fs, inum) \
	1053	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
	1054	static struct sema newblk_in_progress;
	1055
	1056	/*
	1057	* Helper routine for newblk_lookup()
	1058	*/
	1059	static __inline
	1060	struct newblk *
	1061	newblk_find(struct newblk_hashhead newblkhd, struct fs fs,
	1062	ufs_daddr_t newblkno)
	1063	{
	1064	struct newblk *newblk;
	1065
	1066	LIST_FOREACH(newblk, newblkhd, nb_hash) {
	1067	if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
	1068	return (newblk);
	1069	}
	1070	return(NULL);
	1071	}
	1072
	1073	/*
	1074	* Look up a newblk. Return 1 if found, 0 if not found.
	1075	* If not found, allocate if DEPALLOC flag is passed.
	1076	* Found or allocated entry is returned in newblkpp.
	1077	*/
	1078	static int
	1079	newblk_lookup(struct fs *fs, ufs_daddr_t newblkno, int flags,
	1080	struct newblk **newblkpp)
	1081	{
	1082	struct newblk *newblk;
	1083	struct newblk_hashhead *newblkhd;
	1084
	1085	newblkhd = NEWBLK_HASH(fs, newblkno);
	1086	top:
	1087	*newblkpp = newblk_find(newblkhd, fs, newblkno);
	1088	if (*newblkpp)
	1089	return(1);
	1090	if ((flags & DEPALLOC) == 0)
	1091	return (0);
	1092	if (sema_get(&newblk_in_progress, 0) == 0)
	1093	goto top;
	1094	MALLOC(newblk, struct newblk *, sizeof(struct newblk),
	1095	M_NEWBLK, M_SOFTDEP_FLAGS \| M_ZERO);
	1096
	1097	if (newblk_find(newblkhd, fs, newblkno)) {
	1098	kprintf("newblk_lookup: blocking race avoided\n");
	1099	sema_release(&pagedep_in_progress);
	1100	kfree(newblk, M_NEWBLK);
	1101	goto top;
	1102	}
	1103	newblk->nb_state = 0;
	1104	newblk->nb_fs = fs;
	1105	newblk->nb_newblkno = newblkno;
	1106	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
	1107	sema_release(&newblk_in_progress);
	1108	*newblkpp = newblk;
	1109	return (0);
	1110	}
	1111
	1112	/*
	1113	* Executed during filesystem system initialization before
	1114	* mounting any filesystems.
	1115	*/
	1116	void
	1117	softdep_initialize(void)
	1118	{
	1119	callout_init(&handle);
	1120
	1121	LIST_INIT(&mkdirlisthd);
	1122	LIST_INIT(&softdep_workitem_pending);
	1123	max_softdeps = min(desiredvnodes * 8,
	1124	M_INODEDEP->ks_limit / (2 * sizeof(struct inodedep)));
	1125	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
	1126	&pagedep_hash);
	1127	sema_init(&pagedep_in_progress, "pagedep", 0, 0);
	1128	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
	1129	sema_init(&inodedep_in_progress, "inodedep", 0, 0);
	1130	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
	1131	sema_init(&newblk_in_progress, "newblk", 0, 0);
	1132	add_bio_ops(&softdep_bioops);
	1133	}
	1134
	1135	/*
	1136	* Called at mount time to notify the dependency code that a
	1137	* filesystem wishes to use it.
	1138	*/
	1139	int
	1140	softdep_mount(struct vnode devvp, struct mount mp, struct fs *fs)
	1141	{
	1142	struct csum cstotal;
	1143	struct cg *cgp;
	1144	struct buf *bp;
	1145	int error, cyl;
	1146
	1147	mp->mnt_flag &= ~MNT_ASYNC;
	1148	mp->mnt_flag \|= MNT_SOFTDEP;
	1149	mp->mnt_bioops = &softdep_bioops;
	1150	/*
	1151	* When doing soft updates, the counters in the
	1152	* superblock may have gotten out of sync, so we have
	1153	* to scan the cylinder groups and recalculate them.
	1154	*/
	1155	if (fs->fs_clean != 0)
	1156	return (0);
	1157	bzero(&cstotal, sizeof cstotal);
	1158	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
	1159	if ((error = bread(devvp, fsbtodoff(fs, cgtod(fs, cyl)),
	1160	fs->fs_cgsize, &bp)) != 0) {
	1161	brelse(bp);
	1162	return (error);
	1163	}
	1164	cgp = (struct cg *)bp->b_data;
	1165	cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
	1166	cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
	1167	cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
	1168	cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
	1169	fs->fs_cs(fs, cyl) = cgp->cg_cs;
	1170	brelse(bp);
	1171	}
	1172	#ifdef DEBUG
	1173	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
	1174	kprintf("ffs_mountfs: superblock updated for soft updates\n");
	1175	#endif
	1176	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
	1177	return (0);
	1178	}
	1179
	1180	/*
	1181	* Protecting the freemaps (or bitmaps).
	1182	*
	1183	* To eliminate the need to execute fsck before mounting a filesystem
	1184	* after a power failure, one must (conservatively) guarantee that the
	1185	* on-disk copy of the bitmaps never indicates that a live inode or block is
	1186	* free. So, when a block or inode is allocated, the bitmap should be
	1187	* updated (on disk) before any new pointers. When a block or inode is
	1188	* freed, the bitmap should not be updated until all pointers have been
	1189	* reset. The latter dependency is handled by the delayed de-allocation
	1190	* approach described below for block and inode de-allocation. The former
	1191	* dependency is handled by calling the following procedure when a block or
	1192	* inode is allocated. When an inode is allocated an "inodedep" is created
	1193	* with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
	1194	* Each "inodedep" is also inserted into the hash indexing structure so
	1195	* that any additional link additions can be made dependent on the inode
	1196	* allocation.
	1197	*
	1198	* The ufs filesystem maintains a number of free block counts (e.g., per
	1199	* cylinder group, per cylinder and per <cylinder, rotational position> pair)
	1200	* in addition to the bitmaps. These counts are used to improve efficiency
	1201	* during allocation and therefore must be consistent with the bitmaps.
	1202	* There is no convenient way to guarantee post-crash consistency of these
	1203	* counts with simple update ordering, for two main reasons: (1) The counts
	1204	* and bitmaps for a single cylinder group block are not in the same disk
	1205	* sector. If a disk write is interrupted (e.g., by power failure), one may
	1206	* be written and the other not. (2) Some of the counts are located in the
	1207	* superblock rather than the cylinder group block. So, we focus our soft
	1208	* updates implementation on protecting the bitmaps. When mounting a
	1209	* filesystem, we recompute the auxiliary counts from the bitmaps.
	1210	*/
	1211
	1212	/*
	1213	* Called just after updating the cylinder group block to allocate an inode.
	1214	*
	1215	* Parameters:
	1216	* bp: buffer for cylgroup block with inode map
	1217	* ip: inode related to allocation
	1218	* newinum: new inode number being allocated
	1219	*/
	1220	void
	1221	softdep_setup_inomapdep(struct buf bp, struct inode ip, ino_t newinum)
	1222	{
	1223	struct inodedep *inodedep;
	1224	struct bmsafemap *bmsafemap;
	1225
	1226	/*
	1227	* Create a dependency for the newly allocated inode.
	1228	* Panic if it already exists as something is seriously wrong.
	1229	* Otherwise add it to the dependency list for the buffer holding
	1230	* the cylinder group map from which it was allocated.
	1231	*/
	1232	ACQUIRE_LOCK(&lk);
	1233	if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC\|NODELAY, &inodedep))) {
	1234	FREE_LOCK(&lk);
	1235	panic("softdep_setup_inomapdep: found inode");
	1236	}
	1237	inodedep->id_buf = bp;
	1238	inodedep->id_state &= ~DEPCOMPLETE;
	1239	bmsafemap = bmsafemap_lookup(bp);
	1240	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
	1241	FREE_LOCK(&lk);
	1242	}
	1243
	1244	/*
	1245	* Called just after updating the cylinder group block to
	1246	* allocate block or fragment.
	1247	*
	1248	* Parameters:
	1249	* bp: buffer for cylgroup block with block map
	1250	* fs: filesystem doing allocation
	1251	* newblkno: number of newly allocated block
	1252	*/
	1253	void
	1254	softdep_setup_blkmapdep(struct buf bp, struct fs fs,
	1255	ufs_daddr_t newblkno)
	1256	{
	1257	struct newblk *newblk;
	1258	struct bmsafemap *bmsafemap;
	1259
	1260	/*
	1261	* Create a dependency for the newly allocated block.
	1262	* Add it to the dependency list for the buffer holding
	1263	* the cylinder group map from which it was allocated.
	1264	*/
	1265	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
	1266	panic("softdep_setup_blkmapdep: found block");
	1267	ACQUIRE_LOCK(&lk);
	1268	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
	1269	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
	1270	FREE_LOCK(&lk);
	1271	}
	1272
	1273	/*
	1274	* Find the bmsafemap associated with a cylinder group buffer.
	1275	* If none exists, create one. The buffer must be locked when
	1276	* this routine is called and this routine must be called with
	1277	* splbio interrupts blocked.
	1278	*/
	1279	static struct bmsafemap *
	1280	bmsafemap_lookup(struct buf *bp)
	1281	{
	1282	struct bmsafemap *bmsafemap;
	1283	struct worklist *wk;
	1284
	1285	#ifdef DEBUG
	1286	if (lk.lkt_held == NOHOLDER)
	1287	panic("bmsafemap_lookup: lock not held");
	1288	#endif
	1289	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
	1290	if (wk->wk_type == D_BMSAFEMAP)
	1291	return (WK_BMSAFEMAP(wk));
	1292	}
	1293	FREE_LOCK(&lk);
	1294	MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
	1295	M_BMSAFEMAP, M_SOFTDEP_FLAGS);
	1296	bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
	1297	bmsafemap->sm_list.wk_state = 0;
	1298	bmsafemap->sm_buf = bp;
	1299	LIST_INIT(&bmsafemap->sm_allocdirecthd);
	1300	LIST_INIT(&bmsafemap->sm_allocindirhd);
	1301	LIST_INIT(&bmsafemap->sm_inodedephd);
	1302	LIST_INIT(&bmsafemap->sm_newblkhd);
	1303	ACQUIRE_LOCK(&lk);
	1304	WORKLIST_INSERT_BP(bp, &bmsafemap->sm_list);
	1305	return (bmsafemap);
	1306	}
	1307
	1308	/*
	1309	* Direct block allocation dependencies.
	1310	*
	1311	* When a new block is allocated, the corresponding disk locations must be
	1312	* initialized (with zeros or new data) before the on-disk inode points to
	1313	* them. Also, the freemap from which the block was allocated must be
	1314	* updated (on disk) before the inode's pointer. These two dependencies are
	1315	* independent of each other and are needed for all file blocks and indirect
	1316	* blocks that are pointed to directly by the inode. Just before the
	1317	* "in-core" version of the inode is updated with a newly allocated block
	1318	* number, a procedure (below) is called to setup allocation dependency
	1319	* structures. These structures are removed when the corresponding
	1320	* dependencies are satisfied or when the block allocation becomes obsolete
	1321	* (i.e., the file is deleted, the block is de-allocated, or the block is a
	1322	* fragment that gets upgraded). All of these cases are handled in
	1323	* procedures described later.
	1324	*
	1325	* When a file extension causes a fragment to be upgraded, either to a larger
	1326	* fragment or to a full block, the on-disk location may change (if the
	1327	* previous fragment could not simply be extended). In this case, the old
	1328	* fragment must be de-allocated, but not until after the inode's pointer has
	1329	* been updated. In most cases, this is handled by later procedures, which
	1330	* will construct a "freefrag" structure to be added to the workitem queue
	1331	* when the inode update is complete (or obsolete). The main exception to
	1332	* this is when an allocation occurs while a pending allocation dependency
	1333	* (for the same block pointer) remains. This case is handled in the main
	1334	* allocation dependency setup procedure by immediately freeing the
	1335	* unreferenced fragments.
	1336	*
	1337	* Parameters:
	1338	* ip: inode to which block is being added
	1339	* lbn: block pointer within inode
	1340	* newblkno: disk block number being added
	1341	* oldblkno: previous block number, 0 unless frag
	1342	* newsize: size of new block
	1343	* oldsize: size of previous block
	1344	* bp: bp for allocated block
	1345	*/
	1346	void
	1347	softdep_setup_allocdirect(struct inode *ip, ufs_lbn_t lbn, ufs_daddr_t newblkno,
	1348	ufs_daddr_t oldblkno, long newsize, long oldsize,
	1349	struct buf *bp)
	1350	{
	1351	struct allocdirect adp, oldadp;
	1352	struct allocdirectlst *adphead;
	1353	struct bmsafemap *bmsafemap;
	1354	struct inodedep *inodedep;
	1355	struct pagedep *pagedep;
	1356	struct newblk *newblk;
	1357
	1358	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
	1359	M_ALLOCDIRECT, M_SOFTDEP_FLAGS \| M_ZERO);
	1360	adp->ad_list.wk_type = D_ALLOCDIRECT;
	1361	adp->ad_lbn = lbn;
	1362	adp->ad_newblkno = newblkno;
	1363	adp->ad_oldblkno = oldblkno;
	1364	adp->ad_newsize = newsize;
	1365	adp->ad_oldsize = oldsize;
	1366	adp->ad_state = ATTACHED;
	1367	if (newblkno == oldblkno)
	1368	adp->ad_freefrag = NULL;
	1369	else
	1370	adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
	1371
	1372	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
	1373	panic("softdep_setup_allocdirect: lost block");
	1374
	1375	ACQUIRE_LOCK(&lk);
	1376	inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC \| NODELAY, &inodedep);
	1377	adp->ad_inodedep = inodedep;
	1378
	1379	if (newblk->nb_state == DEPCOMPLETE) {
	1380	adp->ad_state \|= DEPCOMPLETE;
	1381	adp->ad_buf = NULL;
	1382	} else {
	1383	bmsafemap = newblk->nb_bmsafemap;
	1384	adp->ad_buf = bmsafemap->sm_buf;
	1385	LIST_REMOVE(newblk, nb_deps);
	1386	LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
	1387	}
	1388	LIST_REMOVE(newblk, nb_hash);
	1389	FREE(newblk, M_NEWBLK);
	1390
	1391	WORKLIST_INSERT_BP(bp, &adp->ad_list);
	1392	if (lbn >= NDADDR) {
	1393	/* allocating an indirect block */
	1394	if (oldblkno != 0) {
	1395	FREE_LOCK(&lk);
	1396	panic("softdep_setup_allocdirect: non-zero indir");
	1397	}
	1398	} else {
	1399	/*
	1400	* Allocating a direct block.
	1401	*
	1402	* If we are allocating a directory block, then we must
	1403	* allocate an associated pagedep to track additions and
	1404	* deletions.
	1405	*/
	1406	if ((ip->i_mode & IFMT) == IFDIR &&
	1407	pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) {
	1408	WORKLIST_INSERT_BP(bp, &pagedep->pd_list);
	1409	}
	1410	}
	1411	/*
	1412	* The list of allocdirects must be kept in sorted and ascending
	1413	* order so that the rollback routines can quickly determine the
	1414	* first uncommitted block (the size of the file stored on disk
	1415	* ends at the end of the lowest committed fragment, or if there
	1416	* are no fragments, at the end of the highest committed block).
	1417	* Since files generally grow, the typical case is that the new
	1418	* block is to be added at the end of the list. We speed this
	1419	* special case by checking against the last allocdirect in the
	1420	* list before laboriously traversing the list looking for the
	1421	* insertion point.
	1422	*/
	1423	adphead = &inodedep->id_newinoupdt;
	1424	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	1425	if (oldadp == NULL \|\| oldadp->ad_lbn <= lbn) {
	1426	/* insert at end of list */
	1427	TAILQ_INSERT_TAIL(adphead, adp, ad_next);
	1428	if (oldadp != NULL && oldadp->ad_lbn == lbn)
	1429	allocdirect_merge(adphead, adp, oldadp);
	1430	FREE_LOCK(&lk);
	1431	return;
	1432	}
	1433	TAILQ_FOREACH(oldadp, adphead, ad_next) {
	1434	if (oldadp->ad_lbn >= lbn)
	1435	break;
	1436	}
	1437	if (oldadp == NULL) {
	1438	FREE_LOCK(&lk);
	1439	panic("softdep_setup_allocdirect: lost entry");
	1440	}
	1441	/* insert in middle of list */
	1442	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
	1443	if (oldadp->ad_lbn == lbn)
	1444	allocdirect_merge(adphead, adp, oldadp);
	1445	FREE_LOCK(&lk);
	1446	}
	1447
	1448	/*
	1449	* Replace an old allocdirect dependency with a newer one.
	1450	* This routine must be called with splbio interrupts blocked.
	1451	*
	1452	* Parameters:
	1453	* adphead: head of list holding allocdirects
	1454	* newadp: allocdirect being added
	1455	* oldadp: existing allocdirect being checked
	1456	*/
	1457	static void
	1458	allocdirect_merge(struct allocdirectlst *adphead,
	1459	struct allocdirect *newadp,
	1460	struct allocdirect *oldadp)
	1461	{
	1462	struct freefrag *freefrag;
	1463
	1464	#ifdef DEBUG
	1465	if (lk.lkt_held == NOHOLDER)
	1466	panic("allocdirect_merge: lock not held");
	1467	#endif
	1468	if (newadp->ad_oldblkno != oldadp->ad_newblkno \|\|
	1469	newadp->ad_oldsize != oldadp->ad_newsize \|\|
	1470	newadp->ad_lbn >= NDADDR) {
	1471	FREE_LOCK(&lk);
	1472	panic("allocdirect_check: old %d != new %d \|\| lbn %ld >= %d",
	1473	newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
	1474	NDADDR);
	1475	}
	1476	newadp->ad_oldblkno = oldadp->ad_oldblkno;
	1477	newadp->ad_oldsize = oldadp->ad_oldsize;
	1478	/*
	1479	* If the old dependency had a fragment to free or had never
	1480	* previously had a block allocated, then the new dependency
	1481	* can immediately post its freefrag and adopt the old freefrag.
	1482	* This action is done by swapping the freefrag dependencies.
	1483	* The new dependency gains the old one's freefrag, and the
	1484	* old one gets the new one and then immediately puts it on
	1485	* the worklist when it is freed by free_allocdirect. It is
	1486	* not possible to do this swap when the old dependency had a
	1487	* non-zero size but no previous fragment to free. This condition
	1488	* arises when the new block is an extension of the old block.
	1489	* Here, the first part of the fragment allocated to the new
	1490	* dependency is part of the block currently claimed on disk by
	1491	* the old dependency, so cannot legitimately be freed until the
	1492	* conditions for the new dependency are fulfilled.
	1493	*/
	1494	if (oldadp->ad_freefrag != NULL \|\| oldadp->ad_oldblkno == 0) {
	1495	freefrag = newadp->ad_freefrag;
	1496	newadp->ad_freefrag = oldadp->ad_freefrag;
	1497	oldadp->ad_freefrag = freefrag;
	1498	}
	1499	free_allocdirect(adphead, oldadp, 0);
	1500	}
	1501
	1502	/*
	1503	* Allocate a new freefrag structure if needed.
	1504	*/
	1505	static struct freefrag *
	1506	newfreefrag(struct inode *ip, ufs_daddr_t blkno, long size)
	1507	{
	1508	struct freefrag *freefrag;
	1509	struct fs *fs;
	1510
	1511	if (blkno == 0)
	1512	return (NULL);
	1513	fs = ip->i_fs;
	1514	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
	1515	panic("newfreefrag: frag size");
	1516	MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
	1517	M_FREEFRAG, M_SOFTDEP_FLAGS);
	1518	freefrag->ff_list.wk_type = D_FREEFRAG;
	1519	freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */
	1520	freefrag->ff_inum = ip->i_number;
	1521	freefrag->ff_fs = fs;
	1522	freefrag->ff_devvp = ip->i_devvp;
	1523	freefrag->ff_blkno = blkno;
	1524	freefrag->ff_fragsize = size;
	1525	return (freefrag);
	1526	}
	1527
	1528	/*
	1529	* This workitem de-allocates fragments that were replaced during
	1530	* file block allocation.
	1531	*/
	1532	static void
	1533	handle_workitem_freefrag(struct freefrag *freefrag)
	1534	{
	1535	struct inode tip;
	1536
	1537	tip.i_fs = freefrag->ff_fs;
	1538	tip.i_devvp = freefrag->ff_devvp;
	1539	tip.i_dev = freefrag->ff_devvp->v_rdev;
	1540	tip.i_number = freefrag->ff_inum;
	1541	tip.i_uid = freefrag->ff_state & ~ONWORKLIST; /* XXX - set above */
	1542	ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
	1543	FREE(freefrag, M_FREEFRAG);
	1544	}
	1545
	1546	/*
	1547	* Indirect block allocation dependencies.
	1548	*
	1549	* The same dependencies that exist for a direct block also exist when
	1550	* a new block is allocated and pointed to by an entry in a block of
	1551	* indirect pointers. The undo/redo states described above are also
	1552	* used here. Because an indirect block contains many pointers that
	1553	* may have dependencies, a second copy of the entire in-memory indirect
	1554	* block is kept. The buffer cache copy is always completely up-to-date.
	1555	* The second copy, which is used only as a source for disk writes,
	1556	* contains only the safe pointers (i.e., those that have no remaining
	1557	* update dependencies). The second copy is freed when all pointers
	1558	* are safe. The cache is not allowed to replace indirect blocks with
	1559	* pending update dependencies. If a buffer containing an indirect
	1560	* block with dependencies is written, these routines will mark it
	1561	* dirty again. It can only be successfully written once all the
	1562	* dependencies are removed. The ffs_fsync routine in conjunction with
	1563	* softdep_sync_metadata work together to get all the dependencies
	1564	* removed so that a file can be successfully written to disk. Three
	1565	* procedures are used when setting up indirect block pointer
	1566	* dependencies. The division is necessary because of the organization
	1567	* of the "balloc" routine and because of the distinction between file
	1568	* pages and file metadata blocks.
	1569	*/
	1570
	1571	/*
	1572	* Allocate a new allocindir structure.
	1573	*
	1574	* Parameters:
	1575	* ip: inode for file being extended
	1576	* ptrno: offset of pointer in indirect block
	1577	* newblkno: disk block number being added
	1578	* oldblkno: previous block number, 0 if none
	1579	*/
	1580	static struct allocindir *
	1581	newallocindir(struct inode *ip, int ptrno, ufs_daddr_t newblkno,
	1582	ufs_daddr_t oldblkno)
	1583	{
	1584	struct allocindir *aip;
	1585
	1586	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
	1587	M_ALLOCINDIR, M_SOFTDEP_FLAGS \| M_ZERO);
	1588	aip->ai_list.wk_type = D_ALLOCINDIR;
	1589	aip->ai_state = ATTACHED;
	1590	aip->ai_offset = ptrno;
	1591	aip->ai_newblkno = newblkno;
	1592	aip->ai_oldblkno = oldblkno;
	1593	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
	1594	return (aip);
	1595	}
	1596
	1597	/*
	1598	* Called just before setting an indirect block pointer
	1599	* to a newly allocated file page.
	1600	*
	1601	* Parameters:
	1602	* ip: inode for file being extended
	1603	* lbn: allocated block number within file
	1604	* bp: buffer with indirect blk referencing page
	1605	* ptrno: offset of pointer in indirect block
	1606	* newblkno: disk block number being added
	1607	* oldblkno: previous block number, 0 if none
	1608	* nbp: buffer holding allocated page
	1609	*/
	1610	void
	1611	softdep_setup_allocindir_page(struct inode *ip, ufs_lbn_t lbn,
	1612	struct buf *bp, int ptrno,
	1613	ufs_daddr_t newblkno, ufs_daddr_t oldblkno,
	1614	struct buf *nbp)
	1615	{
	1616	struct allocindir *aip;
	1617	struct pagedep *pagedep;
	1618
	1619	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
	1620	ACQUIRE_LOCK(&lk);
	1621	/*
	1622	* If we are allocating a directory page, then we must
	1623	* allocate an associated pagedep to track additions and
	1624	* deletions.
	1625	*/
	1626	if ((ip->i_mode & IFMT) == IFDIR &&
	1627	pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
	1628	WORKLIST_INSERT_BP(nbp, &pagedep->pd_list);
	1629	WORKLIST_INSERT_BP(nbp, &aip->ai_list);
	1630	FREE_LOCK(&lk);
	1631	setup_allocindir_phase2(bp, ip, aip);
	1632	}
	1633
	1634	/*
	1635	* Called just before setting an indirect block pointer to a
	1636	* newly allocated indirect block.
	1637	* Parameters:
	1638	* nbp: newly allocated indirect block
	1639	* ip: inode for file being extended
	1640	* bp: indirect block referencing allocated block
	1641	* ptrno: offset of pointer in indirect block
	1642	* newblkno: disk block number being added
	1643	*/
	1644	void
	1645	softdep_setup_allocindir_meta(struct buf nbp, struct inode ip,
	1646	struct buf *bp, int ptrno,
	1647	ufs_daddr_t newblkno)
	1648	{
	1649	struct allocindir *aip;
	1650
	1651	aip = newallocindir(ip, ptrno, newblkno, 0);
	1652	ACQUIRE_LOCK(&lk);
	1653	WORKLIST_INSERT_BP(nbp, &aip->ai_list);
	1654	FREE_LOCK(&lk);
	1655	setup_allocindir_phase2(bp, ip, aip);
	1656	}
	1657
	1658	/*
	1659	* Called to finish the allocation of the "aip" allocated
	1660	* by one of the two routines above.
	1661	*
	1662	* Parameters:
	1663	* bp: in-memory copy of the indirect block
	1664	* ip: inode for file being extended
	1665	* aip: allocindir allocated by the above routines
	1666	*/
	1667	static void
	1668	setup_allocindir_phase2(struct buf bp, struct inode ip,
	1669	struct allocindir *aip)
	1670	{
	1671	struct worklist *wk;
	1672	struct indirdep indirdep, newindirdep;
	1673	struct bmsafemap *bmsafemap;
	1674	struct allocindir *oldaip;
	1675	struct freefrag *freefrag;
	1676	struct newblk *newblk;
	1677
	1678	if (bp->b_loffset >= 0)
	1679	panic("setup_allocindir_phase2: not indir blk");
	1680	for (indirdep = NULL, newindirdep = NULL; ; ) {
	1681	ACQUIRE_LOCK(&lk);
	1682	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
	1683	if (wk->wk_type != D_INDIRDEP)
	1684	continue;
	1685	indirdep = WK_INDIRDEP(wk);
	1686	break;
	1687	}
	1688	if (indirdep == NULL && newindirdep) {
	1689	indirdep = newindirdep;
	1690	WORKLIST_INSERT_BP(bp, &indirdep->ir_list);
	1691	newindirdep = NULL;
	1692	}
	1693	FREE_LOCK(&lk);
	1694	if (indirdep) {
	1695	if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
	1696	&newblk) == 0)
	1697	panic("setup_allocindir: lost block");
	1698	ACQUIRE_LOCK(&lk);
	1699	if (newblk->nb_state == DEPCOMPLETE) {
	1700	aip->ai_state \|= DEPCOMPLETE;
	1701	aip->ai_buf = NULL;
	1702	} else {
	1703	bmsafemap = newblk->nb_bmsafemap;
	1704	aip->ai_buf = bmsafemap->sm_buf;
	1705	LIST_REMOVE(newblk, nb_deps);
	1706	LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
	1707	aip, ai_deps);
	1708	}
	1709	LIST_REMOVE(newblk, nb_hash);
	1710	FREE(newblk, M_NEWBLK);
	1711	aip->ai_indirdep = indirdep;
	1712	/*
	1713	* Check to see if there is an existing dependency
	1714	* for this block. If there is, merge the old
	1715	* dependency into the new one.
	1716	*/
	1717	if (aip->ai_oldblkno == 0)
	1718	oldaip = NULL;
	1719	else
	1720
	1721	LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
	1722	if (oldaip->ai_offset == aip->ai_offset)
	1723	break;
	1724	if (oldaip != NULL) {
	1725	if (oldaip->ai_newblkno != aip->ai_oldblkno) {
	1726	FREE_LOCK(&lk);
	1727	panic("setup_allocindir_phase2: blkno");
	1728	}
	1729	aip->ai_oldblkno = oldaip->ai_oldblkno;
	1730	freefrag = oldaip->ai_freefrag;
	1731	oldaip->ai_freefrag = aip->ai_freefrag;
	1732	aip->ai_freefrag = freefrag;
	1733	free_allocindir(oldaip, NULL);
	1734	}
	1735	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
	1736	((ufs_daddr_t *)indirdep->ir_savebp->b_data)
	1737	[aip->ai_offset] = aip->ai_oldblkno;
	1738	FREE_LOCK(&lk);
	1739	}
	1740	if (newindirdep) {
	1741	/*
	1742	* Avoid any possibility of data corruption by
	1743	* ensuring that our old version is thrown away.
	1744	*/
	1745	newindirdep->ir_savebp->b_flags \|= B_INVAL \| B_NOCACHE;
	1746	brelse(newindirdep->ir_savebp);
	1747	WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
	1748	}
	1749	if (indirdep)
	1750	break;
	1751	MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
	1752	M_INDIRDEP, M_SOFTDEP_FLAGS);
	1753	newindirdep->ir_list.wk_type = D_INDIRDEP;
	1754	newindirdep->ir_state = ATTACHED;
	1755	LIST_INIT(&newindirdep->ir_deplisthd);
	1756	LIST_INIT(&newindirdep->ir_donehd);
	1757	if (bp->b_bio2.bio_offset == NOOFFSET) {
	1758	VOP_BMAP(bp->b_vp, bp->b_bio1.bio_offset,
	1759	&bp->b_bio2.bio_offset, NULL, NULL,
	1760	BUF_CMD_WRITE);
	1761	}
	1762	KKASSERT(bp->b_bio2.bio_offset != NOOFFSET);
	1763	newindirdep->ir_savebp = getblk(ip->i_devvp,
	1764	bp->b_bio2.bio_offset,
	1765	bp->b_bcount, 0, 0);
	1766	BUF_KERNPROC(newindirdep->ir_savebp);
	1767	bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
	1768	}
	1769	}
	1770
	1771	/*
	1772	* Block de-allocation dependencies.
	1773	*
	1774	* When blocks are de-allocated, the on-disk pointers must be nullified before
	1775	* the blocks are made available for use by other files. (The true
	1776	* requirement is that old pointers must be nullified before new on-disk
	1777	* pointers are set. We chose this slightly more stringent requirement to
	1778	* reduce complexity.) Our implementation handles this dependency by updating
	1779	* the inode (or indirect block) appropriately but delaying the actual block
	1780	* de-allocation (i.e., freemap and free space count manipulation) until
	1781	* after the updated versions reach stable storage. After the disk is
	1782	* updated, the blocks can be safely de-allocated whenever it is convenient.
	1783	* This implementation handles only the common case of reducing a file's
	1784	* length to zero. Other cases are handled by the conventional synchronous
	1785	* write approach.
	1786	*
	1787	* The ffs implementation with which we worked double-checks
	1788	* the state of the block pointers and file size as it reduces
	1789	* a file's length. Some of this code is replicated here in our
	1790	* soft updates implementation. The freeblks->fb_chkcnt field is
	1791	* used to transfer a part of this information to the procedure
	1792	* that eventually de-allocates the blocks.
	1793	*
	1794	* This routine should be called from the routine that shortens
	1795	* a file's length, before the inode's size or block pointers
	1796	* are modified. It will save the block pointer information for
	1797	* later release and zero the inode so that the calling routine
	1798	* can release it.
	1799	*/
	/*
	 * Argument bundle handed to softdep_setup_freeblocks_bp() through
	 * RB_SCAN while softdep_setup_freeblocks() walks a vnode's dirty buffers.
	 */
	struct softdep_setup_freeblocks_info {
		struct fs *fs;		/* filesystem of the inode being truncated */
		struct inode *ip;	/* inode being truncated */
	};

	/*
	 * Per-buffer scan callback.  Both arguments are pointers: the buffer
	 * being visited and the info bundle above.
	 */
	static int softdep_setup_freeblocks_bp(struct buf *bp, void *data);
	1806
	1807	/*
	1808	* Parameters:
	1809	* ip: The inode whose length is to be reduced
	1810	* length: The new length for the file
	1811	*/
	1812	void
	1813	softdep_setup_freeblocks(struct inode *ip, off_t length)
	1814	{
	1815	struct softdep_setup_freeblocks_info info;
	1816	struct freeblks *freeblks;
	1817	struct inodedep *inodedep;
	1818	struct allocdirect *adp;
	1819	struct vnode *vp;
	1820	struct buf *bp;
	1821	struct fs *fs;
	1822	int i, error, delay;
	1823	int count;
	1824
	1825	fs = ip->i_fs;
	1826	if (length != 0)
	1827	panic("softde_setup_freeblocks: non-zero length");
	1828	MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
	1829	M_FREEBLKS, M_SOFTDEP_FLAGS \| M_ZERO);
	1830	freeblks->fb_list.wk_type = D_FREEBLKS;
	1831	freeblks->fb_state = ATTACHED;
	1832	freeblks->fb_uid = ip->i_uid;
	1833	freeblks->fb_previousinum = ip->i_number;
	1834	freeblks->fb_devvp = ip->i_devvp;
	1835	freeblks->fb_fs = fs;
	1836	freeblks->fb_oldsize = ip->i_size;
	1837	freeblks->fb_newsize = length;
	1838	freeblks->fb_chkcnt = ip->i_blocks;
	1839	for (i = 0; i < NDADDR; i++) {
	1840	freeblks->fb_dblks[i] = ip->i_db[i];
	1841	ip->i_db[i] = 0;
	1842	}
	1843	for (i = 0; i < NIADDR; i++) {
	1844	freeblks->fb_iblks[i] = ip->i_ib[i];
	1845	ip->i_ib[i] = 0;
	1846	}
	1847	ip->i_blocks = 0;
	1848	ip->i_size = 0;
	1849	/*
	1850	* Push the zero'ed inode to to its disk buffer so that we are free
	1851	* to delete its dependencies below. Once the dependencies are gone
	1852	* the buffer can be safely released.
	1853	*/
	1854	if ((error = bread(ip->i_devvp,
	1855	fsbtodoff(fs, ino_to_fsba(fs, ip->i_number)),
	1856	(int)fs->fs_bsize, &bp)) != 0)
	1857	softdep_error("softdep_setup_freeblocks", error);
	1858	((struct ufs1_dinode )bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
	1859	ip->i_din;
	1860	/*
	1861	* Find and eliminate any inode dependencies.
	1862	*/
	1863	ACQUIRE_LOCK(&lk);
	1864	(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
	1865	if ((inodedep->id_state & IOSTARTED) != 0) {
	1866	FREE_LOCK(&lk);
	1867	panic("softdep_setup_freeblocks: inode busy");
	1868	}
	1869	/*
	1870	* Add the freeblks structure to the list of operations that
	1871	* must await the zero'ed inode being written to disk. If we
	1872	* still have a bitmap dependency (delay == 0), then the inode
	1873	* has never been written to disk, so we can process the
	1874	* freeblks below once we have deleted the dependencies.
	1875	*/
	1876	delay = (inodedep->id_state & DEPCOMPLETE);
	1877	if (delay)
	1878	WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
	1879	/*
	1880	* Because the file length has been truncated to zero, any
	1881	* pending block allocation dependency structures associated
	1882	* with this inode are obsolete and can simply be de-allocated.
	1883	* We must first merge the two dependency lists to get rid of
	1884	* any duplicate freefrag structures, then purge the merged list.
	1885	*/
	1886	merge_inode_lists(inodedep);
	1887	while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
	1888	free_allocdirect(&inodedep->id_inoupdt, adp, 1);
	1889	FREE_LOCK(&lk);
	1890	bdwrite(bp);
	1891	/*
	1892	* We must wait for any I/O in progress to finish so that
	1893	* all potential buffers on the dirty list will be visible.
	1894	* Once they are all there, walk the list and get rid of
	1895	* any dependencies.
	1896	*/
	1897	vp = ITOV(ip);
	1898	ACQUIRE_LOCK(&lk);
	1899	drain_output(vp, 1);
	1900
	1901	info.fs = fs;
	1902	info.ip = ip;
	1903	lwkt_gettoken(&vp->v_token);
	1904	do {
	1905	count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
	1906	softdep_setup_freeblocks_bp, &info);
	1907	} while (count != 0);
	1908	lwkt_reltoken(&vp->v_token);
	1909
	1910	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
	1911	(void)free_inodedep(inodedep);
	1912
	1913	if (delay) {
	1914	freeblks->fb_state \|= DEPCOMPLETE;
	1915	/*
	1916	* If the inode with zeroed block pointers is now on disk
	1917	* we can start freeing blocks. Add freeblks to the worklist
	1918	* instead of calling handle_workitem_freeblocks directly as
	1919	* it is more likely that additional IO is needed to complete
	1920	* the request here than in the !delay case.
	1921	*/
	1922	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
	1923	add_to_worklist(&freeblks->fb_list);
	1924	}
	1925
	1926	FREE_LOCK(&lk);
	1927	/*
	1928	* If the inode has never been written to disk (delay == 0),
	1929	* then we can process the freeblks now that we have deleted
	1930	* the dependencies.
	1931	*/
	1932	if (!delay)
	1933	handle_workitem_freeblocks(freeblks);
	1934	}
	1935
	1936	static int
	1937	softdep_setup_freeblocks_bp(struct buf bp, void data)
	1938	{
	1939	struct softdep_setup_freeblocks_info *info = data;
	1940	struct inodedep *inodedep;
	1941
	1942	if (getdirtybuf(&bp, MNT_WAIT) == 0) {
	1943	kprintf("softdep_setup_freeblocks_bp(1): caught bp %p going away\n", bp);
	1944	return(-1);
	1945	}
	1946	if (bp->b_vp != ITOV(info->ip) \|\| (bp->b_flags & B_DELWRI) == 0) {
	1947	kprintf("softdep_setup_freeblocks_bp(2): caught bp %p going away\n", bp);
	1948	BUF_UNLOCK(bp);
	1949	return(-1);
	1950	}
	1951	(void) inodedep_lookup(info->fs, info->ip->i_number, 0, &inodedep);
	1952	deallocate_dependencies(bp, inodedep);
	1953	bp->b_flags \|= B_INVAL \| B_NOCACHE;
	1954	FREE_LOCK(&lk);
	1955	brelse(bp);
	1956	ACQUIRE_LOCK(&lk);
	1957	return(1);
	1958	}
	1959
	1960	/*
	1961	* Reclaim any dependency structures from a buffer that is about to
	1962	* be reallocated to a new vnode. The buffer must be locked, thus,
	1963	* no I/O completion operations can occur while we are manipulating
	1964	* its associated dependencies. The mutex is held so that other I/O's
	1965	* associated with related dependencies do not occur.
	1966	*/
	1967	static void
	1968	deallocate_dependencies(struct buf bp, struct inodedep inodedep)
	1969	{
	1970	struct worklist *wk;
	1971	struct indirdep *indirdep;
	1972	struct allocindir *aip;
	1973	struct pagedep *pagedep;
	1974	struct dirrem *dirrem;
	1975	struct diradd *dap;
	1976	int i;
	1977
	1978	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
	1979	switch (wk->wk_type) {
	1980
	1981	case D_INDIRDEP:
	1982	indirdep = WK_INDIRDEP(wk);
	1983	/*
	1984	* None of the indirect pointers will ever be visible,
	1985	* so they can simply be tossed. GOINGAWAY ensures
	1986	* that allocated pointers will be saved in the buffer
	1987	* cache until they are freed. Note that they will
	1988	* only be able to be found by their physical address
	1989	* since the inode mapping the logical address will
	1990	* be gone. The save buffer used for the safe copy
	1991	* was allocated in setup_allocindir_phase2 using
	1992	* the physical address so it could be used for this
	1993	* purpose. Hence we swap the safe copy with the real
	1994	* copy, allowing the safe copy to be freed and holding
	1995	* on to the real copy for later use in indir_trunc.
	1996	*
	1997	* NOTE: ir_savebp is relative to the block device
	1998	* so b_bio1 contains the device block number.
	1999	*/
	2000	if (indirdep->ir_state & GOINGAWAY) {
	2001	FREE_LOCK(&lk);
	2002	panic("deallocate_dependencies: already gone");
	2003	}
	2004	indirdep->ir_state \|= GOINGAWAY;
	2005	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
	2006	free_allocindir(aip, inodedep);
	2007	if (bp->b_bio1.bio_offset >= 0 \|\|
	2008	bp->b_bio2.bio_offset != indirdep->ir_savebp->b_bio1.bio_offset) {
	2009	FREE_LOCK(&lk);
	2010	panic("deallocate_dependencies: not indir");
	2011	}
	2012	bcopy(bp->b_data, indirdep->ir_savebp->b_data,
	2013	bp->b_bcount);
	2014	WORKLIST_REMOVE(wk);
	2015	WORKLIST_INSERT_BP(indirdep->ir_savebp, wk);
	2016	continue;
	2017
	2018	case D_PAGEDEP:
	2019	pagedep = WK_PAGEDEP(wk);
	2020	/*
	2021	* None of the directory additions will ever be
	2022	* visible, so they can simply be tossed.
	2023	*/
	2024	for (i = 0; i < DAHASHSZ; i++)
	2025	while ((dap =
	2026	LIST_FIRST(&pagedep->pd_diraddhd[i])))
	2027	free_diradd(dap);
	2028	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
	2029	free_diradd(dap);
	2030	/*
	2031	* Copy any directory remove dependencies to the list
	2032	* to be processed after the zero'ed inode is written.
	2033	* If the inode has already been written, then they
	2034	* can be dumped directly onto the work list.
	2035	*/
	2036	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
	2037	LIST_REMOVE(dirrem, dm_next);
	2038	dirrem->dm_dirinum = pagedep->pd_ino;
	2039	if (inodedep == NULL \|\|
	2040	(inodedep->id_state & ALLCOMPLETE) ==
	2041	ALLCOMPLETE)
	2042	add_to_worklist(&dirrem->dm_list);
	2043	else
	2044	WORKLIST_INSERT(&inodedep->id_bufwait,
	2045	&dirrem->dm_list);
	2046	}
	2047	WORKLIST_REMOVE(&pagedep->pd_list);
	2048	LIST_REMOVE(pagedep, pd_hash);
	2049	WORKITEM_FREE(pagedep, D_PAGEDEP);
	2050	continue;
	2051
	2052	case D_ALLOCINDIR:
	2053	free_allocindir(WK_ALLOCINDIR(wk), inodedep);
	2054	continue;
	2055
	2056	case D_ALLOCDIRECT:
	2057	case D_INODEDEP:
	2058	FREE_LOCK(&lk);
	2059	panic("deallocate_dependencies: Unexpected type %s",
	2060	TYPENAME(wk->wk_type));
	2061	/* NOTREACHED */
	2062
	2063	default:
	2064	FREE_LOCK(&lk);
	2065	panic("deallocate_dependencies: Unknown type %s",
	2066	TYPENAME(wk->wk_type));
	2067	/* NOTREACHED */
	2068	}
	2069	}
	2070	}
	2071
	2072	/*
	2073	* Free an allocdirect. Generate a new freefrag work request if appropriate.
	2074	* This routine must be called with splbio interrupts blocked.
	2075	*/
	2076	static void
	2077	free_allocdirect(struct allocdirectlst *adphead,
	2078	struct allocdirect *adp, int delay)
	2079	{
	2080
	2081	#ifdef DEBUG
	2082	if (lk.lkt_held == NOHOLDER)
	2083	panic("free_allocdirect: lock not held");
	2084	#endif
	2085	if ((adp->ad_state & DEPCOMPLETE) == 0)
	2086	LIST_REMOVE(adp, ad_deps);
	2087	TAILQ_REMOVE(adphead, adp, ad_next);
	2088	if ((adp->ad_state & COMPLETE) == 0)
	2089	WORKLIST_REMOVE(&adp->ad_list);
	2090	if (adp->ad_freefrag != NULL) {
	2091	if (delay)
	2092	WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
	2093	&adp->ad_freefrag->ff_list);
	2094	else
	2095	add_to_worklist(&adp->ad_freefrag->ff_list);
	2096	}
	2097	WORKITEM_FREE(adp, D_ALLOCDIRECT);
	2098	}
	2099
	2100	/*
	2101	* Prepare an inode to be freed. The actual free operation is not
	2102	* done until the zero'ed inode has been written to disk.
	2103	*/
	2104	void
	2105	softdep_freefile(struct vnode *pvp, ino_t ino, int mode)
	2106	{
	2107	struct inode *ip = VTOI(pvp);
	2108	struct inodedep *inodedep;
	2109	struct freefile *freefile;
	2110
	2111	/*
	2112	* This sets up the inode de-allocation dependency.
	2113	*/
	2114	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
	2115	M_FREEFILE, M_SOFTDEP_FLAGS);
	2116	freefile->fx_list.wk_type = D_FREEFILE;
	2117	freefile->fx_list.wk_state = 0;
	2118	freefile->fx_mode = mode;
	2119	freefile->fx_oldinum = ino;
	2120	freefile->fx_devvp = ip->i_devvp;
	2121	freefile->fx_fs = ip->i_fs;
	2122
	2123	/*
	2124	* If the inodedep does not exist, then the zero'ed inode has
	2125	* been written to disk. If the allocated inode has never been
	2126	* written to disk, then the on-disk inode is zero'ed. In either
	2127	* case we can free the file immediately.
	2128	*/
	2129	ACQUIRE_LOCK(&lk);
	2130	if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 \|\|
	2131	check_inode_unwritten(inodedep)) {
	2132	FREE_LOCK(&lk);
	2133	handle_workitem_freefile(freefile);
	2134	return;
	2135	}
	2136	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
	2137	FREE_LOCK(&lk);
	2138	}
	2139
	2140	/*
	2141	* Check to see if an inode has never been written to disk. If
	2142	* so free the inodedep and return success, otherwise return failure.
	2143	* This routine must be called with splbio interrupts blocked.
	2144	*
	2145	* If we still have a bitmap dependency, then the inode has never
	2146	* been written to disk. Drop the dependency as it is no longer
	2147	* necessary since the inode is being deallocated. We set the
	2148	* ALLCOMPLETE flags since the bitmap now properly shows that the
	2149	* inode is not allocated. Even if the inode is actively being
	2150	* written, it has been rolled back to its zero'ed state, so we
	2151	* are ensured that a zero inode is what is on the disk. For short
	2152	* lived files, this change will usually result in removing all the
	2153	* dependencies from the inode so that it can be freed immediately.
	2154	*/
	2155	static int
	2156	check_inode_unwritten(struct inodedep *inodedep)
	2157	{
	2158
	2159	if ((inodedep->id_state & DEPCOMPLETE) != 0 \|\|
	2160	LIST_FIRST(&inodedep->id_pendinghd) != NULL \|\|
	2161	LIST_FIRST(&inodedep->id_bufwait) != NULL \|\|
	2162	LIST_FIRST(&inodedep->id_inowait) != NULL \|\|
	2163	TAILQ_FIRST(&inodedep->id_inoupdt) != NULL \|\|
	2164	TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL \|\|
	2165	inodedep->id_nlinkdelta != 0)
	2166	return (0);
	2167
	2168	/*
	2169	* Another process might be in initiate_write_inodeblock
	2170	* trying to allocate memory without holding "Softdep Lock".
	2171	*/
	2172	if ((inodedep->id_state & IOSTARTED) != 0 &&
	2173	inodedep->id_savedino == NULL)
	2174	return(0);
	2175
	2176	inodedep->id_state \|= ALLCOMPLETE;
	2177	LIST_REMOVE(inodedep, id_deps);
	2178	inodedep->id_buf = NULL;
	2179	if (inodedep->id_state & ONWORKLIST)
	2180	WORKLIST_REMOVE(&inodedep->id_list);
	2181	if (inodedep->id_savedino != NULL) {
	2182	FREE(inodedep->id_savedino, M_INODEDEP);
	2183	inodedep->id_savedino = NULL;
	2184	}
	2185	if (free_inodedep(inodedep) == 0) {
	2186	FREE_LOCK(&lk);
	2187	panic("check_inode_unwritten: busy inode");
	2188	}
	2189	return (1);
	2190	}
	2191
	2192	/*
	2193	* Try to free an inodedep structure. Return 1 if it could be freed.
	2194	*/
	2195	static int
	2196	free_inodedep(struct inodedep *inodedep)
	2197	{
	2198
	2199	if ((inodedep->id_state & ONWORKLIST) != 0 \|\|
	2200	(inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE \|\|
	2201	LIST_FIRST(&inodedep->id_pendinghd) != NULL \|\|
	2202	LIST_FIRST(&inodedep->id_bufwait) != NULL \|\|
	2203	LIST_FIRST(&inodedep->id_inowait) != NULL \|\|
	2204	TAILQ_FIRST(&inodedep->id_inoupdt) != NULL \|\|
	2205	TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL \|\|
	2206	inodedep->id_nlinkdelta != 0 \|\| inodedep->id_savedino != NULL)
	2207	return (0);
	2208	LIST_REMOVE(inodedep, id_hash);
	2209	WORKITEM_FREE(inodedep, D_INODEDEP);
	2210	num_inodedep -= 1;
	2211	return (1);
	2212	}
	2213
	2214	/*
	2215	* This workitem routine performs the block de-allocation.
	2216	* The workitem is added to the pending list after the updated
	2217	* inode block has been written to disk. As mentioned above,
	2218	* checks regarding the number of blocks de-allocated (compared
	2219	* to the number of blocks allocated for the file) are also
	2220	* performed in this function.
	2221	*/
	2222	static void
	2223	handle_workitem_freeblocks(struct freeblks *freeblks)
	2224	{
	2225	struct inode tip;
	2226	ufs_daddr_t bn;
	2227	struct fs *fs;
	2228	int i, level, bsize;
	2229	long nblocks, blocksreleased = 0;
	2230	int error, allerror = 0;
	2231	ufs_lbn_t baselbns[NIADDR], tmpval;
	2232
	2233	tip.i_number = freeblks->fb_previousinum;
	2234	tip.i_devvp = freeblks->fb_devvp;
	2235	tip.i_dev = freeblks->fb_devvp->v_rdev;
	2236	tip.i_fs = freeblks->fb_fs;
	2237	tip.i_size = freeblks->fb_oldsize;
	2238	tip.i_uid = freeblks->fb_uid;
	2239	fs = freeblks->fb_fs;
	2240	tmpval = 1;
	2241	baselbns[0] = NDADDR;
	2242	for (i = 1; i < NIADDR; i++) {
	2243	tmpval *= NINDIR(fs);
	2244	baselbns[i] = baselbns[i - 1] + tmpval;
	2245	}
	2246	nblocks = btodb(fs->fs_bsize);
	2247	blocksreleased = 0;
	2248	/*
	2249	* Indirect blocks first.
	2250	*/
	2251	for (level = (NIADDR - 1); level >= 0; level--) {
	2252	if ((bn = freeblks->fb_iblks[level]) == 0)
	2253	continue;
	2254	if ((error = indir_trunc(&tip, fsbtodoff(fs, bn), level,
	2255	baselbns[level], &blocksreleased)) == 0)
	2256	allerror = error;
	2257	ffs_blkfree(&tip, bn, fs->fs_bsize);
	2258	blocksreleased += nblocks;
	2259	}
	2260	/*
	2261	* All direct blocks or frags.
	2262	*/
	2263	for (i = (NDADDR - 1); i >= 0; i--) {
	2264	if ((bn = freeblks->fb_dblks[i]) == 0)
	2265	continue;
	2266	bsize = blksize(fs, &tip, i);
	2267	ffs_blkfree(&tip, bn, bsize);
	2268	blocksreleased += btodb(bsize);
	2269	}
	2270
	2271	#ifdef DIAGNOSTIC
	2272	if (freeblks->fb_chkcnt != blocksreleased)
	2273	kprintf("handle_workitem_freeblocks: block count\n");
	2274	if (allerror)
	2275	softdep_error("handle_workitem_freeblks", allerror);
	2276	#endif /* DIAGNOSTIC */
	2277	WORKITEM_FREE(freeblks, D_FREEBLKS);
	2278	}
	2279
	2280	/*
	2281	* Release blocks associated with the inode ip and stored in the indirect
	2282	* block at doffset. If level is greater than SINGLE, the block is an
	2283	* indirect block and recursive calls to indirtrunc must be used to
	2284	* cleanse other indirect blocks.
	2285	*/
	2286	static int
	2287	indir_trunc(struct inode *ip, off_t doffset, int level, ufs_lbn_t lbn,
	2288	long *countp)
	2289	{
	2290	struct buf *bp;
	2291	ufs_daddr_t *bap;
	2292	ufs_daddr_t nb;
	2293	struct fs *fs;
	2294	struct worklist *wk;
	2295	struct indirdep *indirdep;
	2296	int i, lbnadd, nblocks;
	2297	int error, allerror = 0;
	2298
	2299	fs = ip->i_fs;
	2300	lbnadd = 1;
	2301	for (i = level; i > 0; i--)
	2302	lbnadd *= NINDIR(fs);
	2303	/*
	2304	* Get buffer of block pointers to be freed. This routine is not
	2305	* called until the zero'ed inode has been written, so it is safe
	2306	* to free blocks as they are encountered. Because the inode has
	2307	* been zero'ed, calls to bmap on these blocks will fail. So, we
	2308	* have to use the on-disk address and the block device for the
	2309	* filesystem to look them up. If the file was deleted before its
	2310	* indirect blocks were all written to disk, the routine that set
	2311	* us up (deallocate_dependencies) will have arranged to leave
	2312	* a complete copy of the indirect block in memory for our use.
	2313	* Otherwise we have to read the blocks in from the disk.
	2314	*/
	2315	ACQUIRE_LOCK(&lk);
	2316	if ((bp = findblk(ip->i_devvp, doffset, FINDBLK_TEST)) != NULL &&
	2317	(wk = LIST_FIRST(&bp->b_dep)) != NULL) {
	2318	/*
	2319	* bp must be ir_savebp, which is held locked for our use.
	2320	*/
	2321	if (wk->wk_type != D_INDIRDEP \|\|
	2322	(indirdep = WK_INDIRDEP(wk))->ir_savebp != bp \|\|
	2323	(indirdep->ir_state & GOINGAWAY) == 0) {
	2324	FREE_LOCK(&lk);
	2325	panic("indir_trunc: lost indirdep");
	2326	}
	2327	WORKLIST_REMOVE(wk);
	2328	WORKITEM_FREE(indirdep, D_INDIRDEP);
	2329	if (LIST_FIRST(&bp->b_dep) != NULL) {
	2330	FREE_LOCK(&lk);
	2331	panic("indir_trunc: dangling dep");
	2332	}
	2333	FREE_LOCK(&lk);
	2334	} else {
	2335	FREE_LOCK(&lk);
	2336	error = bread(ip->i_devvp, doffset, (int)fs->fs_bsize, &bp);
	2337	if (error)
	2338	return (error);
	2339	}
	2340	/*
	2341	* Recursively free indirect blocks.
	2342	*/
	2343	bap = (ufs_daddr_t *)bp->b_data;
	2344	nblocks = btodb(fs->fs_bsize);
	2345	for (i = NINDIR(fs) - 1; i >= 0; i--) {
	2346	if ((nb = bap[i]) == 0)
	2347	continue;
	2348	if (level != 0) {
	2349	if ((error = indir_trunc(ip, fsbtodoff(fs, nb),
	2350	level - 1, lbn + (i * lbnadd), countp)) != 0)
	2351	allerror = error;
	2352	}
	2353	ffs_blkfree(ip, nb, fs->fs_bsize);
	2354	*countp += nblocks;
	2355	}
	2356	bp->b_flags \|= B_INVAL \| B_NOCACHE;
	2357	brelse(bp);
	2358	return (allerror);
	2359	}
	2360
	2361	/*
	2362	* Free an allocindir.
	2363	* This routine must be called with splbio interrupts blocked.
	2364	*/
	2365	static void
	2366	free_allocindir(struct allocindir aip, struct inodedep inodedep)
	2367	{
	2368	struct freefrag *freefrag;
	2369
	2370	#ifdef DEBUG
	2371	if (lk.lkt_held == NOHOLDER)
	2372	panic("free_allocindir: lock not held");
	2373	#endif
	2374	if ((aip->ai_state & DEPCOMPLETE) == 0)
	2375	LIST_REMOVE(aip, ai_deps);
	2376	if (aip->ai_state & ONWORKLIST)
	2377	WORKLIST_REMOVE(&aip->ai_list);
	2378	LIST_REMOVE(aip, ai_next);
	2379	if ((freefrag = aip->ai_freefrag) != NULL) {
	2380	if (inodedep == NULL)
	2381	add_to_worklist(&freefrag->ff_list);
	2382	else
	2383	WORKLIST_INSERT(&inodedep->id_bufwait,
	2384	&freefrag->ff_list);
	2385	}
	2386	WORKITEM_FREE(aip, D_ALLOCINDIR);
	2387	}
	2388
	2389	/*
	2390	* Directory entry addition dependencies.
	2391	*
	2392	* When adding a new directory entry, the inode (with its incremented link
	2393	* count) must be written to disk before the directory entry's pointer to it.
	2394	* Also, if the inode is newly allocated, the corresponding freemap must be
	2395	* updated (on disk) before the directory entry's pointer. These requirements
	2396	* are met via undo/redo on the directory entry's pointer, which consists
	2397	* simply of the inode number.
	2398	*
	2399	* As directory entries are added and deleted, the free space within a
	2400	* directory block can become fragmented. The ufs filesystem will compact
	2401	* a fragmented directory block to make space for a new entry. When this
	2402	* occurs, the offsets of previously added entries change. Any "diradd"
	2403	* dependency structures corresponding to these entries must be updated with
	2404	* the new offsets.
	2405	*/
	2406
	2407	/*
	2408	* This routine is called after the in-memory inode's link
	2409	* count has been incremented, but before the directory entry's
	2410	* pointer to the inode has been set.
	2411	*
	2412	* Parameters:
	2413	* bp: buffer containing directory block
	2414	* dp: inode for directory
	2415	* diroffset: offset of new entry in directory
	2416	* newinum: inode referenced by new directory entry
	2417	* newdirbp: non-NULL => contents of new mkdir
	2418	*/
	2419	void
	2420	softdep_setup_directory_add(struct buf bp, struct inode dp, off_t diroffset,
	2421	ino_t newinum, struct buf *newdirbp)
	2422	{
	2423	int offset; /* offset of new entry within directory block */
	2424	ufs_lbn_t lbn; /* block in directory containing new entry */
	2425	struct fs *fs;
	2426	struct diradd *dap;
	2427	struct pagedep *pagedep;
	2428	struct inodedep *inodedep;
	2429	struct mkdir mkdir1, mkdir2;
	2430
	2431	/*
	2432	* Whiteouts have no dependencies.
	2433	*/
	2434	if (newinum == WINO) {
	2435	if (newdirbp != NULL)
	2436	bdwrite(newdirbp);
	2437	return;
	2438	}
	2439
	2440	fs = dp->i_fs;
	2441	lbn = lblkno(fs, diroffset);
	2442	offset = blkoff(fs, diroffset);
	2443	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
	2444	M_SOFTDEP_FLAGS \| M_ZERO);
	2445	dap->da_list.wk_type = D_DIRADD;
	2446	dap->da_offset = offset;
	2447	dap->da_newinum = newinum;
	2448	dap->da_state = ATTACHED;
	2449	if (newdirbp == NULL) {
	2450	dap->da_state \|= DEPCOMPLETE;
	2451	ACQUIRE_LOCK(&lk);
	2452	} else {
	2453	dap->da_state \|= MKDIR_BODY \| MKDIR_PARENT;
	2454	MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
	2455	M_SOFTDEP_FLAGS);
	2456	mkdir1->md_list.wk_type = D_MKDIR;
	2457	mkdir1->md_state = MKDIR_BODY;
	2458	mkdir1->md_diradd = dap;
	2459	MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
	2460	M_SOFTDEP_FLAGS);
	2461	mkdir2->md_list.wk_type = D_MKDIR;
	2462	mkdir2->md_state = MKDIR_PARENT;
	2463	mkdir2->md_diradd = dap;
	2464	/*
	2465	* Dependency on "." and ".." being written to disk.
	2466	*/
	2467	mkdir1->md_buf = newdirbp;
	2468	ACQUIRE_LOCK(&lk);
	2469	LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
	2470	WORKLIST_INSERT_BP(newdirbp, &mkdir1->md_list);
	2471	FREE_LOCK(&lk);
	2472	bdwrite(newdirbp);
	2473	/*
	2474	* Dependency on link count increase for parent directory
	2475	*/
	2476	ACQUIRE_LOCK(&lk);
	2477	if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
	2478	\|\| (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
	2479	dap->da_state &= ~MKDIR_PARENT;
	2480	WORKITEM_FREE(mkdir2, D_MKDIR);
	2481	} else {
	2482	LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
	2483	WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
	2484	}
	2485	}
	2486	/*
	2487	* Link into parent directory pagedep to await its being written.
	2488	*/
	2489	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
	2490	WORKLIST_INSERT_BP(bp, &pagedep->pd_list);
	2491	dap->da_pagedep = pagedep;
	2492	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
	2493	da_pdlist);
	2494	/*
	2495	* Link into its inodedep. Put it on the id_bufwait list if the inode
	2496	* is not yet written. If it is written, do the post-inode write
	2497	* processing to put it on the id_pendinghd list.
	2498	*/
	2499	(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
	2500	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
	2501	diradd_inode_written(dap, inodedep);
	2502	else
	2503	WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	2504	FREE_LOCK(&lk);
	2505	}
	2506
	2507	/*
	2508	* This procedure is called to change the offset of a directory
	2509	* entry when compacting a directory block which must be owned
	2510	* exclusively by the caller. Note that the actual entry movement
	2511	* must be done in this procedure to ensure that no I/O completions
	2512	* occur while the move is in progress.
	2513	*
	2514	* Parameters:
	2515	* dp: inode for directory
	2516	* base: address of dp->i_offset
	2517	* oldloc: address of old directory location
	2518	* newloc: address of new directory location
	2519	* entrysize: size of directory entry
	2520	*/
	2521	void
	2522	softdep_change_directoryentry_offset(struct inode *dp, caddr_t base,
	2523	caddr_t oldloc, caddr_t newloc,
	2524	int entrysize)
	2525	{
	2526	int offset, oldoffset, newoffset;
	2527	struct pagedep *pagedep;
	2528	struct diradd *dap;
	2529	ufs_lbn_t lbn;
	2530
	2531	ACQUIRE_LOCK(&lk);
	2532	lbn = lblkno(dp->i_fs, dp->i_offset);
	2533	offset = blkoff(dp->i_fs, dp->i_offset);
	2534	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
	2535	goto done;
	2536	oldoffset = offset + (oldloc - base);
	2537	newoffset = offset + (newloc - base);
	2538
	2539	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
	2540	if (dap->da_offset != oldoffset)
	2541	continue;
	2542	dap->da_offset = newoffset;
	2543	if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
	2544	break;
	2545	LIST_REMOVE(dap, da_pdlist);
	2546	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
	2547	dap, da_pdlist);
	2548	break;
	2549	}
	2550	if (dap == NULL) {
	2551
	2552	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
	2553	if (dap->da_offset == oldoffset) {
	2554	dap->da_offset = newoffset;
	2555	break;
	2556	}
	2557	}
	2558	}
	2559	done:
	2560	bcopy(oldloc, newloc, entrysize);
	2561	FREE_LOCK(&lk);
	2562	}
	2563
	2564	/*
	2565	* Free a diradd dependency structure. This routine must be called
	2566	* with splbio interrupts blocked.
	2567	*/
	2568	static void
	2569	free_diradd(struct diradd *dap)
	2570	{
	2571	struct dirrem *dirrem;
	2572	struct pagedep *pagedep;
	2573	struct inodedep *inodedep;
	2574	struct mkdir mkdir, nextmd;
	2575
	2576	#ifdef DEBUG
	2577	if (lk.lkt_held == NOHOLDER)
	2578	panic("free_diradd: lock not held");
	2579	#endif
	2580	WORKLIST_REMOVE(&dap->da_list);
	2581	LIST_REMOVE(dap, da_pdlist);
	2582	if ((dap->da_state & DIRCHG) == 0) {
	2583	pagedep = dap->da_pagedep;
	2584	} else {
	2585	dirrem = dap->da_previous;
	2586	pagedep = dirrem->dm_pagedep;
	2587	dirrem->dm_dirinum = pagedep->pd_ino;
	2588	add_to_worklist(&dirrem->dm_list);
	2589	}
	2590	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
	2591	0, &inodedep) != 0)
	2592	(void) free_inodedep(inodedep);
	2593	if ((dap->da_state & (MKDIR_PARENT \| MKDIR_BODY)) != 0) {
	2594	for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
	2595	nextmd = LIST_NEXT(mkdir, md_mkdirs);
	2596	if (mkdir->md_diradd != dap)
	2597	continue;
	2598	dap->da_state &= ~mkdir->md_state;
	2599	WORKLIST_REMOVE(&mkdir->md_list);
	2600	LIST_REMOVE(mkdir, md_mkdirs);
	2601	WORKITEM_FREE(mkdir, D_MKDIR);
	2602	}
	2603	if ((dap->da_state & (MKDIR_PARENT \| MKDIR_BODY)) != 0) {
	2604	FREE_LOCK(&lk);
	2605	panic("free_diradd: unfound ref");
	2606	}
	2607	}
	2608	WORKITEM_FREE(dap, D_DIRADD);
	2609	}
	2610
	2611	/*
	2612	* Directory entry removal dependencies.
	2613	*
	2614	* When removing a directory entry, the entry's inode pointer must be
	2615	* zero'ed on disk before the corresponding inode's link count is decremented
	2616	* (possibly freeing the inode for re-use). This dependency is handled by
	2617	* updating the directory entry but delaying the inode count reduction until
	2618	* after the directory block has been written to disk. After this point, the
	2619	* inode count can be decremented whenever it is convenient.
	2620	*/
	2621
	2622	/*
	2623	* This routine should be called immediately after removing
	2624	* a directory entry. The inode's link count should not be
	2625	* decremented by the calling procedure -- the soft updates
	2626	* code will do this task when it is safe.
	2627	*
	2628	* Parameters:
	2629	* bp: buffer containing directory block
	2630	* dp: inode for the directory being modified
	2631	* ip: inode for directory entry being removed
	2632	* isrmdir: indicates if doing RMDIR
	2633	*/
	2634	void
	2635	softdep_setup_remove(struct buf bp, struct inode dp, struct inode *ip,
	2636	int isrmdir)
	2637	{
	2638	struct dirrem dirrem, prevdirrem;
	2639
	2640	/*
	2641	* Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
	2642	*/
	2643	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
	2644
	2645	/*
	2646	* If the COMPLETE flag is clear, then there were no active
	2647	* entries and we want to roll back to a zeroed entry until
	2648	* the new inode is committed to disk. If the COMPLETE flag is
	2649	* set then we have deleted an entry that never made it to
	2650	* disk. If the entry we deleted resulted from a name change,
	2651	* then the old name still resides on disk. We cannot delete
	2652	* its inode (returned to us in prevdirrem) until the zeroed
	2653	* directory entry gets to disk. The new inode has never been
	2654	* referenced on the disk, so can be deleted immediately.
	2655	*/
	2656	if ((dirrem->dm_state & COMPLETE) == 0) {
	2657	LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
	2658	dm_next);
	2659	FREE_LOCK(&lk);
	2660	} else {
	2661	if (prevdirrem != NULL)
	2662	LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
	2663	prevdirrem, dm_next);
	2664	dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
	2665	FREE_LOCK(&lk);
	2666	handle_workitem_remove(dirrem);
	2667	}
	2668	}
	2669
	2670	/*
	2671	* Allocate a new dirrem if appropriate and return it along with
	2672	* its associated pagedep. Called without a lock, returns with lock.
	2673	*/
	2674	static long num_dirrem; /* number of dirrem allocated */
	2675
	2676	/*
	2677	* Parameters:
	2678	* bp: buffer containing directory block
	2679	* dp: inode for the directory being modified
	2680	* ip: inode for directory entry being removed
	2681	* isrmdir: indicates if doing RMDIR
	2682	* prevdirremp: previously referenced inode, if any
	2683	*/
	2684	static struct dirrem *
	2685	newdirrem(struct buf bp, struct inode dp, struct inode *ip,
	2686	int isrmdir, struct dirrem **prevdirremp)
	2687	{
	2688	int offset;
	2689	ufs_lbn_t lbn;
	2690	struct diradd *dap;
	2691	struct dirrem *dirrem;
	2692	struct pagedep *pagedep;
	2693
	2694	/*
	2695	* Whiteouts have no deletion dependencies.
	2696	*/
	2697	if (ip == NULL)
	2698	panic("newdirrem: whiteout");
	2699	/*
	2700	* If we are over our limit, try to improve the situation.
	2701	* Limiting the number of dirrem structures will also limit
	2702	* the number of freefile and freeblks structures.
	2703	*/
	2704	if (num_dirrem > max_softdeps / 2 && speedup_syncer() == 0)
	2705	(void) request_cleanup(FLUSH_REMOVE, 0);
	2706	num_dirrem += 1;
	2707	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
	2708	M_DIRREM, M_SOFTDEP_FLAGS \| M_ZERO);
	2709	dirrem->dm_list.wk_type = D_DIRREM;
	2710	dirrem->dm_state = isrmdir ? RMDIR : 0;
	2711	dirrem->dm_mnt = ITOV(ip)->v_mount;
	2712	dirrem->dm_oldinum = ip->i_number;
	2713	*prevdirremp = NULL;
	2714
	2715	ACQUIRE_LOCK(&lk);
	2716	lbn = lblkno(dp->i_fs, dp->i_offset);
	2717	offset = blkoff(dp->i_fs, dp->i_offset);
	2718	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
	2719	WORKLIST_INSERT_BP(bp, &pagedep->pd_list);
	2720	dirrem->dm_pagedep = pagedep;
	2721	/*
	2722	* Check for a diradd dependency for the same directory entry.
	2723	* If present, then both dependencies become obsolete and can
	2724	* be de-allocated. Check for an entry on both the pd_dirraddhd
	2725	* list and the pd_pendinghd list.
	2726	*/
	2727
	2728	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
	2729	if (dap->da_offset == offset)
	2730	break;
	2731	if (dap == NULL) {
	2732
	2733	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
	2734	if (dap->da_offset == offset)
	2735	break;
	2736	if (dap == NULL)
	2737	return (dirrem);
	2738	}
	2739	/*
	2740	* Must be ATTACHED at this point.
	2741	*/
	2742	if ((dap->da_state & ATTACHED) == 0) {
	2743	FREE_LOCK(&lk);
	2744	panic("newdirrem: not ATTACHED");
	2745	}
	2746	if (dap->da_newinum != ip->i_number) {
	2747	FREE_LOCK(&lk);
	2748	panic("newdirrem: inum %"PRId64" should be %"PRId64,
	2749	ip->i_number, dap->da_newinum);
	2750	}
	2751	/*
	2752	* If we are deleting a changed name that never made it to disk,
	2753	* then return the dirrem describing the previous inode (which
	2754	* represents the inode currently referenced from this entry on disk).
	2755	*/
	2756	if ((dap->da_state & DIRCHG) != 0) {
	2757	*prevdirremp = dap->da_previous;
	2758	dap->da_state &= ~DIRCHG;
	2759	dap->da_pagedep = pagedep;
	2760	}
	2761	/*
	2762	* We are deleting an entry that never made it to disk.
	2763	* Mark it COMPLETE so we can delete its inode immediately.
	2764	*/
	2765	dirrem->dm_state \|= COMPLETE;
	2766	free_diradd(dap);
	2767	return (dirrem);
	2768	}
	2769
	2770	/*
	2771	* Directory entry change dependencies.
	2772	*
	2773	* Changing an existing directory entry requires that an add operation
	2774	* be completed first followed by a deletion. The semantics for the addition
	2775	* are identical to the description of adding a new entry above except
	2776	* that the rollback is to the old inode number rather than zero. Once
	2777	* the addition dependency is completed, the removal is done as described
	2778	* in the removal routine above.
	2779	*/
	2780
	2781	/*
	2782	* This routine should be called immediately after changing
	2783	* a directory entry. The inode's link count should not be
	2784	* decremented by the calling procedure -- the soft updates
	2785	* code will perform this task when it is safe.
	2786	*
	2787	* Parameters:
	2788	* bp: buffer containing directory block
	2789	* dp: inode for the directory being modified
	2790	* ip: inode for directory entry being removed
	2791	* newinum: new inode number for changed entry
	2792	* isrmdir: indicates if doing RMDIR
	2793	*/
	2794	void
	2795	softdep_setup_directory_change(struct buf bp, struct inode dp,
	2796	struct inode *ip, ino_t newinum,
	2797	int isrmdir)
	2798	{
	2799	int offset;
	2800	struct diradd *dap = NULL;
	2801	struct dirrem dirrem, prevdirrem;
	2802	struct pagedep *pagedep;
	2803	struct inodedep *inodedep;
	2804
	2805	offset = blkoff(dp->i_fs, dp->i_offset);
	2806
	2807	/*
	2808	* Whiteouts do not need diradd dependencies.
	2809	*/
	2810	if (newinum != WINO) {
	2811	MALLOC(dap, struct diradd *, sizeof(struct diradd),
	2812	M_DIRADD, M_SOFTDEP_FLAGS \| M_ZERO);
	2813	dap->da_list.wk_type = D_DIRADD;
	2814	dap->da_state = DIRCHG \| ATTACHED \| DEPCOMPLETE;
	2815	dap->da_offset = offset;
	2816	dap->da_newinum = newinum;
	2817	}
	2818
	2819	/*
	2820	* Allocate a new dirrem and ACQUIRE_LOCK.
	2821	*/
	2822	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
	2823	pagedep = dirrem->dm_pagedep;
	2824	/*
	2825	* The possible values for isrmdir:
	2826	* 0 - non-directory file rename
	2827	* 1 - directory rename within same directory
	2828	* inum - directory rename to new directory of given inode number
	2829	* When renaming to a new directory, we are both deleting and
	2830	* creating a new directory entry, so the link count on the new
	2831	* directory should not change. Thus we do not need the followup
	2832	* dirrem which is usually done in handle_workitem_remove. We set
	2833	* the DIRCHG flag to tell handle_workitem_remove to skip the
	2834	* followup dirrem.
	2835	*/
	2836	if (isrmdir > 1)
	2837	dirrem->dm_state \|= DIRCHG;
	2838
	2839	/*
	2840	* Whiteouts have no additional dependencies,
	2841	* so just put the dirrem on the correct list.
	2842	*/
	2843	if (newinum == WINO) {
	2844	if ((dirrem->dm_state & COMPLETE) == 0) {
	2845	LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
	2846	dm_next);
	2847	} else {
	2848	dirrem->dm_dirinum = pagedep->pd_ino;
	2849	add_to_worklist(&dirrem->dm_list);
	2850	}
	2851	FREE_LOCK(&lk);
	2852	return;
	2853	}
	2854
	2855	/*
	2856	* If the COMPLETE flag is clear, then there were no active
	2857	* entries and we want to roll back to the previous inode until
	2858	* the new inode is committed to disk. If the COMPLETE flag is
	2859	* set, then we have deleted an entry that never made it to disk.
	2860	* If the entry we deleted resulted from a name change, then the old
	2861	* inode reference still resides on disk. Any rollback that we do
	2862	* needs to be to that old inode (returned to us in prevdirrem). If
	2863	* the entry we deleted resulted from a create, then there is
	2864	* no entry on the disk, so we want to roll back to zero rather
	2865	* than the uncommitted inode. In either of the COMPLETE cases we
	2866	* want to immediately free the unwritten and unreferenced inode.
	2867	*/
	2868	if ((dirrem->dm_state & COMPLETE) == 0) {
	2869	dap->da_previous = dirrem;
	2870	} else {
	2871	if (prevdirrem != NULL) {
	2872	dap->da_previous = prevdirrem;
	2873	} else {
	2874	dap->da_state &= ~DIRCHG;
	2875	dap->da_pagedep = pagedep;
	2876	}
	2877	dirrem->dm_dirinum = pagedep->pd_ino;
	2878	add_to_worklist(&dirrem->dm_list);
	2879	}
	2880	/*
	2881	* Link into its inodedep. Put it on the id_bufwait list if the inode
	2882	* is not yet written. If it is written, do the post-inode write
	2883	* processing to put it on the id_pendinghd list.
	2884	*/
	2885	if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 \|\|
	2886	(inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
	2887	dap->da_state \|= COMPLETE;
	2888	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	2889	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
	2890	} else {
	2891	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
	2892	dap, da_pdlist);
	2893	WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	2894	}
	2895	FREE_LOCK(&lk);
	2896	}
	2897
	2898	/*
	2899	* Called whenever the link count on an inode is changed.
	2900	* It creates an inode dependency so that the new reference(s)
	2901	* to the inode cannot be committed to disk until the updated
	2902	* inode has been written.
	2903	*
	2904	* Parameters:
	2905	* ip: the inode with the increased link count
	2906	*/
	2907	void
	2908	softdep_change_linkcnt(struct inode *ip)
	2909	{
	2910	struct inodedep *inodedep;
	2911
	2912	ACQUIRE_LOCK(&lk);
	2913	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
	2914	if (ip->i_nlink < ip->i_effnlink) {
	2915	FREE_LOCK(&lk);
	2916	panic("softdep_change_linkcnt: bad delta");
	2917	}
	2918	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	2919	FREE_LOCK(&lk);
	2920	}
	2921
	2922	/*
	2923	* This workitem decrements the inode's link count.
	2924	* If the link count reaches zero, the file is removed.
	2925	*/
	2926	static void
	2927	handle_workitem_remove(struct dirrem *dirrem)
	2928	{
	2929	struct inodedep *inodedep;
	2930	struct vnode *vp;
	2931	struct inode *ip;
	2932	ino_t oldinum;
	2933	int error;
	2934
	2935	error = VFS_VGET(dirrem->dm_mnt, NULL, dirrem->dm_oldinum, &vp);
	2936	if (error) {
	2937	softdep_error("handle_workitem_remove: vget", error);
	2938	return;
	2939	}
	2940	ip = VTOI(vp);
	2941	ACQUIRE_LOCK(&lk);
	2942	if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0){
	2943	FREE_LOCK(&lk);
	2944	panic("handle_workitem_remove: lost inodedep");
	2945	}
	2946	/*
	2947	* Normal file deletion.
	2948	*/
	2949	if ((dirrem->dm_state & RMDIR) == 0) {
	2950	ip->i_nlink--;
	2951	ip->i_flag \|= IN_CHANGE;
	2952	if (ip->i_nlink < ip->i_effnlink) {
	2953	FREE_LOCK(&lk);
	2954	panic("handle_workitem_remove: bad file delta");
	2955	}
	2956	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	2957	FREE_LOCK(&lk);
	2958	vput(vp);
	2959	num_dirrem -= 1;
	2960	WORKITEM_FREE(dirrem, D_DIRREM);
	2961	return;
	2962	}
	2963	/*
	2964	* Directory deletion. Decrement reference count for both the
	2965	* just deleted parent directory entry and the reference for ".".
	2966	* Next truncate the directory to length zero. When the
	2967	* truncation completes, arrange to have the reference count on
	2968	* the parent decremented to account for the loss of "..".
	2969	*/
	2970	ip->i_nlink -= 2;
	2971	ip->i_flag \|= IN_CHANGE;
	2972	if (ip->i_nlink < ip->i_effnlink) {
	2973	FREE_LOCK(&lk);
	2974	panic("handle_workitem_remove: bad dir delta");
	2975	}
	2976	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	2977	FREE_LOCK(&lk);
	2978	if ((error = ffs_truncate(vp, (off_t)0, 0, proc0.p_ucred)) != 0)
	2979	softdep_error("handle_workitem_remove: truncate", error);
	2980	/*
	2981	* Rename a directory to a new parent. Since, we are both deleting
	2982	* and creating a new directory entry, the link count on the new
	2983	* directory should not change. Thus we skip the followup dirrem.
	2984	*/
	2985	if (dirrem->dm_state & DIRCHG) {
	2986	vput(vp);
	2987	num_dirrem -= 1;
	2988	WORKITEM_FREE(dirrem, D_DIRREM);
	2989	return;
	2990	}
	2991	/*
	2992	* If the inodedep does not exist, then the zero'ed inode has
	2993	* been written to disk. If the allocated inode has never been
	2994	* written to disk, then the on-disk inode is zero'ed. In either
	2995	* case we can remove the file immediately.
	2996	*/
	2997	ACQUIRE_LOCK(&lk);
	2998	dirrem->dm_state = 0;
	2999	oldinum = dirrem->dm_oldinum;
	3000	dirrem->dm_oldinum = dirrem->dm_dirinum;
	3001	if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 \|\|
	3002	check_inode_unwritten(inodedep)) {
	3003	FREE_LOCK(&lk);
	3004	vput(vp);
	3005	handle_workitem_remove(dirrem);
	3006	return;
	3007	}
	3008	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
	3009	FREE_LOCK(&lk);
	3010	ip->i_flag \|= IN_CHANGE;
	3011	ffs_update(vp, 0);
	3012	vput(vp);
	3013	}
	3014
	3015	/*
	3016	* Inode de-allocation dependencies.
	3017	*
	3018	* When an inode's link count is reduced to zero, it can be de-allocated. We
	3019	* found it convenient to postpone de-allocation until after the inode is
	3020	* written to disk with its new link count (zero). At this point, all of the
	3021	* on-disk inode's block pointers are nullified and, with careful dependency
	3022	* list ordering, all dependencies related to the inode will be satisfied and
	3023	* the corresponding dependency structures de-allocated. So, if/when the
	3024	* inode is reused, there will be no mixing of old dependencies with new
	3025	* ones. This artificial dependency is set up by the block de-allocation
	3026	* procedure above (softdep_setup_freeblocks) and completed by the
	3027	* following procedure.
	3028	*/
	3029	static void
	3030	handle_workitem_freefile(struct freefile *freefile)
	3031	{
	3032	struct vnode vp;
	3033	struct inode tip;
	3034	struct inodedep *idp;
	3035	int error;
	3036
	3037	#ifdef DEBUG
	3038	ACQUIRE_LOCK(&lk);
	3039	error = inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp);
	3040	FREE_LOCK(&lk);
	3041	if (error)
	3042	panic("handle_workitem_freefile: inodedep survived");
	3043	#endif
	3044	tip.i_devvp = freefile->fx_devvp;
	3045	tip.i_dev = freefile->fx_devvp->v_rdev;
	3046	tip.i_fs = freefile->fx_fs;
	3047	vp.v_data = &tip;
	3048	if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
	3049	softdep_error("handle_workitem_freefile", error);
	3050	WORKITEM_FREE(freefile, D_FREEFILE);
	3051	}
	3052
	3053	/*
	3054	* Helper function which unlinks marker element from work list and returns
	3055	* the next element on the list.
	3056	*/
	3057	static __inline struct worklist *
	3058	markernext(struct worklist *marker)
	3059	{
	3060	struct worklist *next;
	3061
	3062	next = LIST_NEXT(marker, wk_list);
	3063	LIST_REMOVE(marker, wk_list);
	3064	return next;
	3065	}
	3066
	3067	/*
	3068	* checkread, checkwrite
	3069	*
	3070	*/
	3071	static int
	3072	softdep_checkread(struct buf *bp)
	3073	{
	3074	return(0);
	3075	}
	3076
	3077	static int
	3078	softdep_checkwrite(struct buf *bp)
	3079	{
	3080	return(0);
	3081	}
	3082
	3083	/*
	3084	* Disk writes.
	3085	*
	3086	* The dependency structures constructed above are most actively used when file
	3087	* system blocks are written to disk. No constraints are placed on when a
	3088	* block can be written, but unsatisfied update dependencies are made safe by
	3089	* modifying (or replacing) the source memory for the duration of the disk
	3090	* write. When the disk write completes, the memory block is again brought
	3091	* up-to-date.
	3092	*
	3093	* In-core inode structure reclamation.
	3094	*
	3095	* Because there are a finite number of "in-core" inode structures, they are
	3096	* reused regularly. By transferring all inode-related dependencies to the
	3097	* in-memory inode block and indexing them separately (via "inodedep"s), we
	3098	* can allow "in-core" inode structures to be reused at any time and avoid
	3099	* any increase in contention.
	3100	*
	3101	* Called just before entering the device driver to initiate a new disk I/O.
	3102	* The buffer must be locked, thus, no I/O completion operations can occur
	3103	* while we are manipulating its associated dependencies.
	3104	*
	3105	* Parameters:
	3106	* bp: structure describing disk write to occur
	3107	*/
	3108	static void
	3109	softdep_disk_io_initiation(struct buf *bp)
	3110	{
	3111	struct worklist *wk;
	3112	struct worklist marker;
	3113	struct indirdep *indirdep;
	3114
	3115	/*
	3116	* We only care about write operations. There should never
	3117	* be dependencies for reads.
	3118	*/
	3119	if (bp->b_cmd == BUF_CMD_READ)
	3120	panic("softdep_disk_io_initiation: read");
	3121
	3122	marker.wk_type = D_LAST + 1; /* Not a normal workitem */
	3123
	3124	/*
	3125	* Do any necessary pre-I/O processing.
	3126	*/
	3127	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = markernext(&marker)) {
	3128	LIST_INSERT_AFTER(wk, &marker, wk_list);
	3129
	3130	switch (wk->wk_type) {
	3131
	3132	case D_PAGEDEP:
	3133	initiate_write_filepage(WK_PAGEDEP(wk), bp);
	3134	continue;
	3135
	3136	case D_INODEDEP:
	3137	initiate_write_inodeblock(WK_INODEDEP(wk), bp);
	3138	continue;
	3139
	3140	case D_INDIRDEP:
	3141	indirdep = WK_INDIRDEP(wk);
	3142	if (indirdep->ir_state & GOINGAWAY)
	3143	panic("disk_io_initiation: indirdep gone");
	3144	/*
	3145	* If there are no remaining dependencies, this
	3146	* will be writing the real pointers, so the
	3147	* dependency can be freed.
	3148	*/
	3149	if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
	3150	indirdep->ir_savebp->b_flags \|= B_INVAL \| B_NOCACHE;
	3151	brelse(indirdep->ir_savebp);
	3152	/* inline expand WORKLIST_REMOVE(wk); */
	3153	wk->wk_state &= ~ONWORKLIST;
	3154	LIST_REMOVE(wk, wk_list);
	3155	WORKITEM_FREE(indirdep, D_INDIRDEP);
	3156	continue;
	3157	}
	3158	/*
	3159	* Replace up-to-date version with safe version.
	3160	*/
	3161	MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
	3162	M_INDIRDEP, M_SOFTDEP_FLAGS);
	3163	ACQUIRE_LOCK(&lk);
	3164	indirdep->ir_state &= ~ATTACHED;
	3165	indirdep->ir_state \|= UNDONE;
	3166	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
	3167	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
	3168	bp->b_bcount);
	3169	FREE_LOCK(&lk);
	3170	continue;
	3171
	3172	case D_MKDIR:
	3173	case D_BMSAFEMAP:
	3174	case D_ALLOCDIRECT:
	3175	case D_ALLOCINDIR:
	3176	continue;
	3177
	3178	default:
	3179	panic("handle_disk_io_initiation: Unexpected type %s",
	3180	TYPENAME(wk->wk_type));
	3181	/* NOTREACHED */
	3182	}
	3183	}
	3184	}
	3185
	3186	/*
	3187	* Called from within the procedure above to deal with unsatisfied
	3188	* allocation dependencies in a directory. The buffer must be locked,
	3189	* thus, no I/O completion operations can occur while we are
	3190	* manipulating its associated dependencies.
	3191	*/
	3192	static void
	3193	initiate_write_filepage(struct pagedep pagedep, struct buf bp)
	3194	{
	3195	struct diradd *dap;
	3196	struct direct *ep;
	3197	int i;
	3198
	3199	if (pagedep->pd_state & IOSTARTED) {
	3200	/*
	3201	* This can only happen if there is a driver that does not
	3202	* understand chaining. Here biodone will reissue the call
	3203	* to strategy for the incomplete buffers.
	3204	*/
	3205	kprintf("initiate_write_filepage: already started\n");
	3206	return;
	3207	}
	3208	pagedep->pd_state \|= IOSTARTED;
	3209	ACQUIRE_LOCK(&lk);
	3210	for (i = 0; i < DAHASHSZ; i++) {
	3211	LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
	3212	ep = (struct direct *)
	3213	((char *)bp->b_data + dap->da_offset);
	3214	if (ep->d_ino != dap->da_newinum) {
	3215	FREE_LOCK(&lk);
	3216	panic("%s: dir inum %d != new %"PRId64,
	3217	"initiate_write_filepage",
	3218	ep->d_ino, dap->da_newinum);
	3219	}
	3220	if (dap->da_state & DIRCHG)
	3221	ep->d_ino = dap->da_previous->dm_oldinum;
	3222	else
	3223	ep->d_ino = 0;
	3224	dap->da_state &= ~ATTACHED;
	3225	dap->da_state \|= UNDONE;
	3226	}
	3227	}
	3228	FREE_LOCK(&lk);
	3229	}
	3230
	3231	/*
	3232	* Called from within the procedure above to deal with unsatisfied
	3233	* allocation dependencies in an inodeblock. The buffer must be
	3234	* locked, thus, no I/O completion operations can occur while we
	3235	* are manipulating its associated dependencies.
	3236	*
	3237	* Parameters:
	3238	* bp: The inode block
	3239	*/
	3240	static void
	3241	initiate_write_inodeblock(struct inodedep inodedep, struct buf bp)
	3242	{
	3243	struct allocdirect adp, lastadp;
	3244	struct ufs1_dinode *dp;
	3245	struct ufs1_dinode *sip;
	3246	struct fs *fs;
	3247	ufs_lbn_t prevlbn = 0;
	3248	int i, deplist;
	3249
	3250	if (inodedep->id_state & IOSTARTED)
	3251	panic("initiate_write_inodeblock: already started");
	3252	inodedep->id_state \|= IOSTARTED;
	3253	fs = inodedep->id_fs;
	3254	dp = (struct ufs1_dinode *)bp->b_data +
	3255	ino_to_fsbo(fs, inodedep->id_ino);
	3256	/*
	3257	* If the bitmap is not yet written, then the allocated
	3258	* inode cannot be written to disk.
	3259	*/
	3260	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
	3261	if (inodedep->id_savedino != NULL)
	3262	panic("initiate_write_inodeblock: already doing I/O");
	3263	MALLOC(sip, struct ufs1_dinode *,
	3264	sizeof(struct ufs1_dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
	3265	inodedep->id_savedino = sip;
	3266	inodedep->id_savedino = dp;
	3267	bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
	3268	dp->di_gen = inodedep->id_savedino->di_gen;
	3269	return;
	3270	}
	3271	/*
	3272	* If no dependencies, then there is nothing to roll back.
	3273	*/
	3274	inodedep->id_savedsize = dp->di_size;
	3275	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
	3276	return;
	3277	/*
	3278	* Set the dependencies to busy.
	3279	*/
	3280	ACQUIRE_LOCK(&lk);
	3281	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	3282	adp = TAILQ_NEXT(adp, ad_next)) {
	3283	#ifdef DIAGNOSTIC
	3284	if (deplist != 0 && prevlbn >= adp->ad_lbn) {
	3285	FREE_LOCK(&lk);
	3286	panic("softdep_write_inodeblock: lbn order");
	3287	}
	3288	prevlbn = adp->ad_lbn;
	3289	if (adp->ad_lbn < NDADDR &&
	3290	dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
	3291	FREE_LOCK(&lk);
	3292	panic("%s: direct pointer #%ld mismatch %d != %d",
	3293	"softdep_write_inodeblock", adp->ad_lbn,
	3294	dp->di_db[adp->ad_lbn], adp->ad_newblkno);
	3295	}
	3296	if (adp->ad_lbn >= NDADDR &&
	3297	dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
	3298	FREE_LOCK(&lk);
	3299	panic("%s: indirect pointer #%ld mismatch %d != %d",
	3300	"softdep_write_inodeblock", adp->ad_lbn - NDADDR,
	3301	dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
	3302	}
	3303	deplist \|= 1 << adp->ad_lbn;
	3304	if ((adp->ad_state & ATTACHED) == 0) {
	3305	FREE_LOCK(&lk);
	3306	panic("softdep_write_inodeblock: Unknown state 0x%x",
	3307	adp->ad_state);
	3308	}
	3309	#endif /* DIAGNOSTIC */
	3310	adp->ad_state &= ~ATTACHED;
	3311	adp->ad_state \|= UNDONE;
	3312	}
	3313	/*
	3314	* The on-disk inode cannot claim to be any larger than the last
	3315	* fragment that has been written. Otherwise, the on-disk inode
	3316	* might have fragments that were not the last block in the file
	3317	* which would corrupt the filesystem.
	3318	*/
	3319	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	3320	lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
	3321	if (adp->ad_lbn >= NDADDR)
	3322	break;
	3323	dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
	3324	/* keep going until hitting a rollback to a frag */
	3325	if (adp->ad_oldsize == 0 \|\| adp->ad_oldsize == fs->fs_bsize)
	3326	continue;
	3327	dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
	3328	for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
	3329	#ifdef DIAGNOSTIC
	3330	if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
	3331	FREE_LOCK(&lk);
	3332	panic("softdep_write_inodeblock: lost dep1");
	3333	}
	3334	#endif /* DIAGNOSTIC */
	3335	dp->di_db[i] = 0;
	3336	}
	3337	for (i = 0; i < NIADDR; i++) {
	3338	#ifdef DIAGNOSTIC
	3339	if (dp->di_ib[i] != 0 &&
	3340	(deplist & ((1 << NDADDR) << i)) == 0) {
	3341	FREE_LOCK(&lk);
	3342	panic("softdep_write_inodeblock: lost dep2");
	3343	}
	3344	#endif /* DIAGNOSTIC */
	3345	dp->di_ib[i] = 0;
	3346	}
	3347	FREE_LOCK(&lk);
	3348	return;
	3349	}
	3350	/*
	3351	* If we have zero'ed out the last allocated block of the file,
	3352	* roll back the size to the last currently allocated block.
	3353	* We know that this last allocated block is a full-sized as
	3354	* we already checked for fragments in the loop above.
	3355	*/
	3356	if (lastadp != NULL &&
	3357	dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
	3358	for (i = lastadp->ad_lbn; i >= 0; i--)
	3359	if (dp->di_db[i] != 0)
	3360	break;
	3361	dp->di_size = (i + 1) * fs->fs_bsize;
	3362	}
	3363	/*
	3364	* The only dependencies are for indirect blocks.
	3365	*
	3366	* The file size for indirect block additions is not guaranteed.
	3367	* Such a guarantee would be non-trivial to achieve. The conventional
	3368	* synchronous write implementation also does not make this guarantee.
	3369	* Fsck should catch and fix discrepancies. Arguably, the file size
	3370	* can be over-estimated without destroying integrity when the file
	3371	* moves into the indirect blocks (i.e., is large). If we want to
	3372	* postpone fsck, we are stuck with this argument.
	3373	*/
	3374	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
	3375	dp->di_ib[adp->ad_lbn - NDADDR] = 0;
	3376	FREE_LOCK(&lk);
	3377	}
	3378
	3379	/*
	3380	* This routine is called during the completion interrupt
	3381	* service routine for a disk write (from the procedure called
	3382	* by the device driver to inform the filesystem caches of
	3383	* a request completion). It should be called early in this
	3384	* procedure, before the block is made available to other
	3385	* processes or other routines are called.
	3386	*
	3387	* Parameters:
	3388	* bp: describes the completed disk write
	3389	*/
	3390	static void
	3391	softdep_disk_write_complete(struct buf *bp)
	3392	{
	3393	struct worklist *wk;
	3394	struct workhead reattach;
	3395	struct newblk *newblk;
	3396	struct allocindir *aip;
	3397	struct allocdirect *adp;
	3398	struct indirdep *indirdep;
	3399	struct inodedep *inodedep;
	3400	struct bmsafemap *bmsafemap;
	3401
	3402	#ifdef DEBUG
	3403	if (lk.lkt_held != NOHOLDER)
	3404	panic("softdep_disk_write_complete: lock is held");
	3405	lk.lkt_held = SPECIAL_FLAG;
	3406	#endif
	3407	LIST_INIT(&reattach);
	3408	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
	3409	WORKLIST_REMOVE(wk);
	3410	switch (wk->wk_type) {
	3411
	3412	case D_PAGEDEP:
	3413	if (handle_written_filepage(WK_PAGEDEP(wk), bp))
	3414	WORKLIST_INSERT(&reattach, wk);
	3415	continue;
	3416
	3417	case D_INODEDEP:
	3418	if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
	3419	WORKLIST_INSERT(&reattach, wk);
	3420	continue;
	3421
	3422	case D_BMSAFEMAP:
	3423	bmsafemap = WK_BMSAFEMAP(wk);
	3424	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
	3425	newblk->nb_state \|= DEPCOMPLETE;
	3426	newblk->nb_bmsafemap = NULL;
	3427	LIST_REMOVE(newblk, nb_deps);
	3428	}
	3429	while ((adp =
	3430	LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
	3431	adp->ad_state \|= DEPCOMPLETE;
	3432	adp->ad_buf = NULL;
	3433	LIST_REMOVE(adp, ad_deps);
	3434	handle_allocdirect_partdone(adp);
	3435	}
	3436	while ((aip =
	3437	LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
	3438	aip->ai_state \|= DEPCOMPLETE;
	3439	aip->ai_buf = NULL;
	3440	LIST_REMOVE(aip, ai_deps);
	3441	handle_allocindir_partdone(aip);
	3442	}
	3443	while ((inodedep =
	3444	LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
	3445	inodedep->id_state \|= DEPCOMPLETE;
	3446	LIST_REMOVE(inodedep, id_deps);
	3447	inodedep->id_buf = NULL;
	3448	}
	3449	WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
	3450	continue;
	3451
	3452	case D_MKDIR:
	3453	handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
	3454	continue;
	3455
	3456	case D_ALLOCDIRECT:
	3457	adp = WK_ALLOCDIRECT(wk);
	3458	adp->ad_state \|= COMPLETE;
	3459	handle_allocdirect_partdone(adp);
	3460	continue;
	3461
	3462	case D_ALLOCINDIR:
	3463	aip = WK_ALLOCINDIR(wk);
	3464	aip->ai_state \|= COMPLETE;
	3465	handle_allocindir_partdone(aip);
	3466	continue;
	3467
	3468	case D_INDIRDEP:
	3469	indirdep = WK_INDIRDEP(wk);
	3470	if (indirdep->ir_state & GOINGAWAY) {
	3471	lk.lkt_held = NOHOLDER;
	3472	panic("disk_write_complete: indirdep gone");
	3473	}
	3474	bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
	3475	FREE(indirdep->ir_saveddata, M_INDIRDEP);
	3476	indirdep->ir_saveddata = 0;
	3477	indirdep->ir_state &= ~UNDONE;
	3478	indirdep->ir_state \|= ATTACHED;
	3479	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
	3480	handle_allocindir_partdone(aip);
	3481	if (aip == LIST_FIRST(&indirdep->ir_donehd)) {
	3482	lk.lkt_held = NOHOLDER;
	3483	panic("disk_write_complete: not gone");
	3484	}
	3485	}
	3486	WORKLIST_INSERT(&reattach, wk);
	3487	if ((bp->b_flags & B_DELWRI) == 0)
	3488	stat_indir_blk_ptrs++;
	3489	bdirty(bp);
	3490	continue;
	3491
	3492	default:
	3493	lk.lkt_held = NOHOLDER;
	3494	panic("handle_disk_write_complete: Unknown type %s",
	3495	TYPENAME(wk->wk_type));
	3496	/* NOTREACHED */
	3497	}
	3498	}
	3499	/*
	3500	* Reattach any requests that must be redone.
	3501	*/
	3502	while ((wk = LIST_FIRST(&reattach)) != NULL) {
	3503	WORKLIST_REMOVE(wk);
	3504	WORKLIST_INSERT_BP(bp, wk);
	3505	}
	3506	#ifdef DEBUG
	3507	if (lk.lkt_held != SPECIAL_FLAG)
	3508	panic("softdep_disk_write_complete: lock lost");
	3509	lk.lkt_held = NOHOLDER;
	3510	#endif
	3511	}
	3512
	3513	/*
	3514	* Called from within softdep_disk_write_complete above. Note that
	3515	* this routine is always called from interrupt level with further
	3516	* splbio interrupts blocked.
	3517	*
	3518	* Parameters:
	3519	* adp: the completed allocdirect
	3520	*/
	3521	static void
	3522	handle_allocdirect_partdone(struct allocdirect *adp)
	3523	{
	3524	struct allocdirect *listadp;
	3525	struct inodedep *inodedep;
	3526	long bsize;
	3527
	3528	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
	3529	return;
	3530	if (adp->ad_buf != NULL) {
	3531	lk.lkt_held = NOHOLDER;
	3532	panic("handle_allocdirect_partdone: dangling dep");
	3533	}
	3534	/*
	3535	* The on-disk inode cannot claim to be any larger than the last
	3536	* fragment that has been written. Otherwise, the on-disk inode
	3537	* might have fragments that were not the last block in the file
	3538	* which would corrupt the filesystem. Thus, we cannot free any
	3539	* allocdirects after one whose ad_oldblkno claims a fragment as
	3540	* these blocks must be rolled back to zero before writing the inode.
	3541	* We check the currently active set of allocdirects in id_inoupdt.
	3542	*/
	3543	inodedep = adp->ad_inodedep;
	3544	bsize = inodedep->id_fs->fs_bsize;
	3545	TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) {
	3546	/* found our block */
	3547	if (listadp == adp)
	3548	break;
	3549	/* continue if ad_oldlbn is not a fragment */
	3550	if (listadp->ad_oldsize == 0 \|\|
	3551	listadp->ad_oldsize == bsize)
	3552	continue;
	3553	/* hit a fragment */
	3554	return;
	3555	}
	3556	/*
	3557	* If we have reached the end of the current list without
	3558	* finding the just finished dependency, then it must be
	3559	* on the future dependency list. Future dependencies cannot
	3560	* be freed until they are moved to the current list.
	3561	*/
	3562	if (listadp == NULL) {
	3563	#ifdef DEBUG
	3564	TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next)
	3565	/* found our block */
	3566	if (listadp == adp)
	3567	break;
	3568	if (listadp == NULL) {
	3569	lk.lkt_held = NOHOLDER;
	3570	panic("handle_allocdirect_partdone: lost dep");
	3571	}
	3572	#endif /* DEBUG */
	3573	return;
	3574	}
	3575	/*
	3576	* If we have found the just finished dependency, then free
	3577	* it along with anything that follows it that is complete.
	3578	*/
	3579	for (; adp; adp = listadp) {
	3580	listadp = TAILQ_NEXT(adp, ad_next);
	3581	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
	3582	return;
	3583	free_allocdirect(&inodedep->id_inoupdt, adp, 1);
	3584	}
	3585	}
	3586
	3587	/*
	3588	* Called from within softdep_disk_write_complete above. Note that
	3589	* this routine is always called from interrupt level with further
	3590	* splbio interrupts blocked.
	3591	*
	3592	* Parameters:
	3593	* aip: the completed allocindir
	3594	*/
	3595	static void
	3596	handle_allocindir_partdone(struct allocindir *aip)
	3597	{
	3598	struct indirdep *indirdep;
	3599
	3600	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
	3601	return;
	3602	if (aip->ai_buf != NULL) {
	3603	lk.lkt_held = NOHOLDER;
	3604	panic("handle_allocindir_partdone: dangling dependency");
	3605	}
	3606	indirdep = aip->ai_indirdep;
	3607	if (indirdep->ir_state & UNDONE) {
	3608	LIST_REMOVE(aip, ai_next);
	3609	LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
	3610	return;
	3611	}
	3612	((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
	3613	aip->ai_newblkno;
	3614	LIST_REMOVE(aip, ai_next);
	3615	if (aip->ai_freefrag != NULL)
	3616	add_to_worklist(&aip->ai_freefrag->ff_list);
	3617	WORKITEM_FREE(aip, D_ALLOCINDIR);
	3618	}
	3619
	3620	/*
	3621	* Called from within softdep_disk_write_complete above to restore
	3622	* in-memory inode block contents to their most up-to-date state. Note
	3623	* that this routine is always called from interrupt level with further
	3624	* splbio interrupts blocked.
	3625	*
	3626	* Parameters:
	3627	* bp: buffer containing the inode block
	3628	*/
	3629	static int
	3630	handle_written_inodeblock(struct inodedep inodedep, struct buf bp)
	3631	{
	3632	struct worklist wk, filefree;
	3633	struct allocdirect adp, nextadp;
	3634	struct ufs1_dinode *dp;
	3635	int hadchanges;
	3636
	3637	if ((inodedep->id_state & IOSTARTED) == 0) {
	3638	lk.lkt_held = NOHOLDER;
	3639	panic("handle_written_inodeblock: not started");
	3640	}
	3641	inodedep->id_state &= ~IOSTARTED;
	3642	dp = (struct ufs1_dinode *)bp->b_data +
	3643	ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
	3644	/*
	3645	* If we had to rollback the inode allocation because of
	3646	* bitmaps being incomplete, then simply restore it.
	3647	* Keep the block dirty so that it will not be reclaimed until
	3648	* all associated dependencies have been cleared and the
	3649	* corresponding updates written to disk.
	3650	*/
	3651	if (inodedep->id_savedino != NULL) {
	3652	dp = inodedep->id_savedino;
	3653	FREE(inodedep->id_savedino, M_INODEDEP);
	3654	inodedep->id_savedino = NULL;
	3655	if ((bp->b_flags & B_DELWRI) == 0)
	3656	stat_inode_bitmap++;
	3657	bdirty(bp);
	3658	return (1);
	3659	}
	3660	inodedep->id_state \|= COMPLETE;
	3661	/*
	3662	* Roll forward anything that had to be rolled back before
	3663	* the inode could be updated.
	3664	*/
	3665	hadchanges = 0;
	3666	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
	3667	nextadp = TAILQ_NEXT(adp, ad_next);
	3668	if (adp->ad_state & ATTACHED) {
	3669	lk.lkt_held = NOHOLDER;
	3670	panic("handle_written_inodeblock: new entry");
	3671	}
	3672	if (adp->ad_lbn < NDADDR) {
	3673	if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno) {
	3674	lk.lkt_held = NOHOLDER;
	3675	panic("%s: %s #%ld mismatch %d != %d",
	3676	"handle_written_inodeblock",
	3677	"direct pointer", adp->ad_lbn,
	3678	dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
	3679	}
	3680	dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
	3681	} else {
	3682	if (dp->di_ib[adp->ad_lbn - NDADDR] != 0) {
	3683	lk.lkt_held = NOHOLDER;
	3684	panic("%s: %s #%ld allocated as %d",
	3685	"handle_written_inodeblock",
	3686	"indirect pointer", adp->ad_lbn - NDADDR,
	3687	dp->di_ib[adp->ad_lbn - NDADDR]);
	3688	}
	3689	dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
	3690	}
	3691	adp->ad_state &= ~UNDONE;
	3692	adp->ad_state \|= ATTACHED;
	3693	hadchanges = 1;
	3694	}
	3695	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
	3696	stat_direct_blk_ptrs++;
	3697	/*
	3698	* Reset the file size to its most up-to-date value.
	3699	*/
	3700	if (inodedep->id_savedsize == -1) {
	3701	lk.lkt_held = NOHOLDER;
	3702	panic("handle_written_inodeblock: bad size");
	3703	}
	3704	if (dp->di_size != inodedep->id_savedsize) {
	3705	dp->di_size = inodedep->id_savedsize;
	3706	hadchanges = 1;
	3707	}
	3708	inodedep->id_savedsize = -1;
	3709	/*
	3710	* If there were any rollbacks in the inode block, then it must be
	3711	* marked dirty so that its will eventually get written back in
	3712	* its correct form.
	3713	*/
	3714	if (hadchanges)
	3715	bdirty(bp);
	3716	/*
	3717	* Process any allocdirects that completed during the update.
	3718	*/
	3719	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
	3720	handle_allocdirect_partdone(adp);
	3721	/*
	3722	* Process deallocations that were held pending until the
	3723	* inode had been written to disk. Freeing of the inode
	3724	* is delayed until after all blocks have been freed to
	3725	* avoid creation of new <vfsid, inum, lbn> triples
	3726	* before the old ones have been deleted.
	3727	*/
	3728	filefree = NULL;
	3729	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
	3730	WORKLIST_REMOVE(wk);
	3731	switch (wk->wk_type) {
	3732
	3733	case D_FREEFILE:
	3734	/*
	3735	* We defer adding filefree to the worklist until
	3736	* all other additions have been made to ensure
	3737	* that it will be done after all the old blocks
	3738	* have been freed.
	3739	*/
	3740	if (filefree != NULL) {
	3741	lk.lkt_held = NOHOLDER;
	3742	panic("handle_written_inodeblock: filefree");
	3743	}
	3744	filefree = wk;
	3745	continue;
	3746
	3747	case D_MKDIR:
	3748	handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
	3749	continue;
	3750
	3751	case D_DIRADD:
	3752	diradd_inode_written(WK_DIRADD(wk), inodedep);
	3753	continue;
	3754
	3755	case D_FREEBLKS:
	3756	wk->wk_state \|= COMPLETE;
	3757	if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE)
	3758	continue;
	3759	/* -- fall through -- */
	3760	case D_FREEFRAG:
	3761	case D_DIRREM:
	3762	add_to_worklist(wk);
	3763	continue;
	3764
	3765	default:
	3766	lk.lkt_held = NOHOLDER;
	3767	panic("handle_written_inodeblock: Unknown type %s",
	3768	TYPENAME(wk->wk_type));
	3769	/* NOTREACHED */
	3770	}
	3771	}
	3772	if (filefree != NULL) {
	3773	if (free_inodedep(inodedep) == 0) {
	3774	lk.lkt_held = NOHOLDER;
	3775	panic("handle_written_inodeblock: live inodedep");
	3776	}
	3777	add_to_worklist(filefree);
	3778	return (0);
	3779	}
	3780
	3781	/*
	3782	* If no outstanding dependencies, free it.
	3783	*/
	3784	if (free_inodedep(inodedep) \|\| TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
	3785	return (0);
	3786	return (hadchanges);
	3787	}
	3788
	3789	/*
	3790	* Process a diradd entry after its dependent inode has been written.
	3791	* This routine must be called with splbio interrupts blocked.
	3792	*/
	3793	static void
	3794	diradd_inode_written(struct diradd dap, struct inodedep inodedep)
	3795	{
	3796	struct pagedep *pagedep;
	3797
	3798	dap->da_state \|= COMPLETE;
	3799	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
	3800	if (dap->da_state & DIRCHG)
	3801	pagedep = dap->da_previous->dm_pagedep;
	3802	else
	3803	pagedep = dap->da_pagedep;
	3804	LIST_REMOVE(dap, da_pdlist);
	3805	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	3806	}
	3807	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
	3808	}
	3809
	3810	/*
	3811	* Handle the completion of a mkdir dependency.
	3812	*/
	3813	static void
	3814	handle_written_mkdir(struct mkdir *mkdir, int type)
	3815	{
	3816	struct diradd *dap;
	3817	struct pagedep *pagedep;
	3818
	3819	if (mkdir->md_state != type) {
	3820	lk.lkt_held = NOHOLDER;
	3821	panic("handle_written_mkdir: bad type");
	3822	}
	3823	dap = mkdir->md_diradd;
	3824	dap->da_state &= ~type;
	3825	if ((dap->da_state & (MKDIR_PARENT \| MKDIR_BODY)) == 0)
	3826	dap->da_state \|= DEPCOMPLETE;
	3827	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
	3828	if (dap->da_state & DIRCHG)
	3829	pagedep = dap->da_previous->dm_pagedep;
	3830	else
	3831	pagedep = dap->da_pagedep;
	3832	LIST_REMOVE(dap, da_pdlist);
	3833	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	3834	}
	3835	LIST_REMOVE(mkdir, md_mkdirs);
	3836	WORKITEM_FREE(mkdir, D_MKDIR);
	3837	}
	3838
	3839	/*
	3840	* Called from within softdep_disk_write_complete above.
	3841	* A write operation was just completed. Removed inodes can
	3842	* now be freed and associated block pointers may be committed.
	3843	* Note that this routine is always called from interrupt level
	3844	* with further splbio interrupts blocked.
	3845	*
	3846	* Parameters:
	3847	* bp: buffer containing the written page
	3848	*/
	3849	static int
	3850	handle_written_filepage(struct pagedep pagedep, struct buf bp)
	3851	{
	3852	struct dirrem *dirrem;
	3853	struct diradd dap, nextdap;
	3854	struct direct *ep;
	3855	int i, chgs;
	3856
	3857	if ((pagedep->pd_state & IOSTARTED) == 0) {
	3858	lk.lkt_held = NOHOLDER;
	3859	panic("handle_written_filepage: not started");
	3860	}
	3861	pagedep->pd_state &= ~IOSTARTED;
	3862	/*
	3863	* Process any directory removals that have been committed.
	3864	*/
	3865	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
	3866	LIST_REMOVE(dirrem, dm_next);
	3867	dirrem->dm_dirinum = pagedep->pd_ino;
	3868	add_to_worklist(&dirrem->dm_list);
	3869	}
	3870	/*
	3871	* Free any directory additions that have been committed.
	3872	*/
	3873	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
	3874	free_diradd(dap);
	3875	/*
	3876	* Uncommitted directory entries must be restored.
	3877	*/
	3878	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
	3879	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
	3880	dap = nextdap) {
	3881	nextdap = LIST_NEXT(dap, da_pdlist);
	3882	if (dap->da_state & ATTACHED) {
	3883	lk.lkt_held = NOHOLDER;
	3884	panic("handle_written_filepage: attached");
	3885	}
	3886	ep = (struct direct *)
	3887	((char *)bp->b_data + dap->da_offset);
	3888	ep->d_ino = dap->da_newinum;
	3889	dap->da_state &= ~UNDONE;
	3890	dap->da_state \|= ATTACHED;
	3891	chgs = 1;
	3892	/*
	3893	* If the inode referenced by the directory has
	3894	* been written out, then the dependency can be
	3895	* moved to the pending list.
	3896	*/
	3897	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
	3898	LIST_REMOVE(dap, da_pdlist);
	3899	LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
	3900	da_pdlist);
	3901	}
	3902	}
	3903	}
	3904	/*
	3905	* If there were any rollbacks in the directory, then it must be
	3906	* marked dirty so that its will eventually get written back in
	3907	* its correct form.
	3908	*/
	3909	if (chgs) {
	3910	if ((bp->b_flags & B_DELWRI) == 0)
	3911	stat_dir_entry++;
	3912	bdirty(bp);
	3913	}
	3914	/*
	3915	* If no dependencies remain, the pagedep will be freed.
	3916	* Otherwise it will remain to update the page before it
	3917	* is written back to disk.
	3918	*/
	3919	if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
	3920	for (i = 0; i < DAHASHSZ; i++)
	3921	if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
	3922	break;
	3923	if (i == DAHASHSZ) {
	3924	LIST_REMOVE(pagedep, pd_hash);
	3925	WORKITEM_FREE(pagedep, D_PAGEDEP);
	3926	return (0);
	3927	}
	3928	}
	3929	return (1);
	3930	}
	3931
	3932	/*
	3933	* Writing back in-core inode structures.
	3934	*
	3935	* The filesystem only accesses an inode's contents when it occupies an
	3936	* "in-core" inode structure. These "in-core" structures are separate from
	3937	* the page frames used to cache inode blocks. Only the latter are
	3938	* transferred to/from the disk. So, when the updated contents of the
	3939	* "in-core" inode structure are copied to the corresponding in-memory inode
	3940	* block, the dependencies are also transferred. The following procedure is
	3941	* called when copying a dirty "in-core" inode to a cached inode block.
	3942	*/
	3943
	3944	/*
	3945	* Called when an inode is loaded from disk. If the effective link count
	3946	* differed from the actual link count when it was last flushed, then we
	3947	* need to ensure that the correct effective link count is put back.
	3948	*
	3949	* Parameters:
	3950	* ip: the "in_core" copy of the inode
	3951	*/
	3952	void
	3953	softdep_load_inodeblock(struct inode *ip)
	3954	{
	3955	struct inodedep *inodedep;
	3956
	3957	/*
	3958	* Check for alternate nlink count.
	3959	*/
	3960	ip->i_effnlink = ip->i_nlink;
	3961	ACQUIRE_LOCK(&lk);
	3962	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
	3963	FREE_LOCK(&lk);
	3964	return;
	3965	}
	3966	ip->i_effnlink -= inodedep->id_nlinkdelta;
	3967	FREE_LOCK(&lk);
	3968	}
	3969
	3970	/*
	3971	* This routine is called just before the "in-core" inode
	3972	* information is to be copied to the in-memory inode block.
	3973	* Recall that an inode block contains several inodes. If
	3974	* the force flag is set, then the dependencies will be
	3975	* cleared so that the update can always be made. Note that
	3976	* the buffer is locked when this routine is called, so we
	3977	* will never be in the middle of writing the inode block
	3978	* to disk.
	3979	*
	3980	* Parameters:
	3981	* ip: the "in_core" copy of the inode
	3982	* bp: the buffer containing the inode block
	3983	* waitfor: nonzero => update must be allowed
	3984	*/
	3985	void
	3986	softdep_update_inodeblock(struct inode ip, struct buf bp,
	3987	int waitfor)
	3988	{
	3989	struct inodedep *inodedep;
	3990	struct worklist *wk;
	3991	int error, gotit;
	3992
	3993	/*
	3994	* If the effective link count is not equal to the actual link
	3995	* count, then we must track the difference in an inodedep while
	3996	* the inode is (potentially) tossed out of the cache. Otherwise,
	3997	* if there is no existing inodedep, then there are no dependencies
	3998	* to track.
	3999	*/
	4000	ACQUIRE_LOCK(&lk);
	4001	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
	4002	FREE_LOCK(&lk);
	4003	if (ip->i_effnlink != ip->i_nlink)
	4004	panic("softdep_update_inodeblock: bad link count");
	4005	return;
	4006	}
	4007	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) {
	4008	FREE_LOCK(&lk);
	4009	panic("softdep_update_inodeblock: bad delta");
	4010	}
	4011	/*
	4012	* Changes have been initiated. Anything depending on these
	4013	* changes cannot occur until this inode has been written.
	4014	*/
	4015	inodedep->id_state &= ~COMPLETE;
	4016	if ((inodedep->id_state & ONWORKLIST) == 0)
	4017	WORKLIST_INSERT_BP(bp, &inodedep->id_list);
	4018	/*
	4019	* Any new dependencies associated with the incore inode must
	4020	* now be moved to the list associated with the buffer holding
	4021	* the in-memory copy of the inode. Once merged process any
	4022	* allocdirects that are completed by the merger.
	4023	*/
	4024	merge_inode_lists(inodedep);
	4025	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
	4026	handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
	4027	/*
	4028	* Now that the inode has been pushed into the buffer, the
	4029	* operations dependent on the inode being written to disk
	4030	* can be moved to the id_bufwait so that they will be
	4031	* processed when the buffer I/O completes.
	4032	*/
	4033	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
	4034	WORKLIST_REMOVE(wk);
	4035	WORKLIST_INSERT(&inodedep->id_bufwait, wk);
	4036	}
	4037	/*
	4038	* Newly allocated inodes cannot be written until the bitmap
	4039	* that allocates them have been written (indicated by
	4040	* DEPCOMPLETE being set in id_state). If we are doing a
	4041	* forced sync (e.g., an fsync on a file), we force the bitmap
	4042	* to be written so that the update can be done.
	4043	*/
	4044	if ((inodedep->id_state & DEPCOMPLETE) != 0 \|\| waitfor == 0) {
	4045	FREE_LOCK(&lk);
	4046	return;
	4047	}
	4048	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
	4049	FREE_LOCK(&lk);
	4050	if (gotit &&
	4051	(error = bwrite(inodedep->id_buf)) != 0)
	4052	softdep_error("softdep_update_inodeblock: bwrite", error);
	4053	}
	4054
	4055	/*
	4056	* Merge the new inode dependency list (id_newinoupdt) into the old
	4057	* inode dependency list (id_inoupdt). This routine must be called
	4058	* with splbio interrupts blocked.
	4059	*/
	4060	static void
	4061	merge_inode_lists(struct inodedep *inodedep)
	4062	{
	4063	struct allocdirect listadp, newadp;
	4064
	4065	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
	4066	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
	4067	if (listadp->ad_lbn < newadp->ad_lbn) {
	4068	listadp = TAILQ_NEXT(listadp, ad_next);
	4069	continue;
	4070	}
	4071	TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
	4072	TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
	4073	if (listadp->ad_lbn == newadp->ad_lbn) {
	4074	allocdirect_merge(&inodedep->id_inoupdt, newadp,
	4075	listadp);
	4076	listadp = newadp;
	4077	}
	4078	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
	4079	}
	4080	while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
	4081	TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
	4082	TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
	4083	}
	4084	}
	4085
	4086	/*
	4087	* If we are doing an fsync, then we must ensure that any directory
	4088	* entries for the inode have been written after the inode gets to disk.
	4089	*
	4090	* Parameters:
	4091	* vp: the "in_core" copy of the inode
	4092	*/
	4093	static int
	4094	softdep_fsync(struct vnode *vp)
	4095	{
	4096	struct inodedep *inodedep;
	4097	struct pagedep *pagedep;
	4098	struct worklist *wk;
	4099	struct diradd *dap;
	4100	struct mount *mnt;
	4101	struct vnode *pvp;
	4102	struct inode *ip;
	4103	struct buf *bp;
	4104	struct fs *fs;
	4105	int error, flushparent;
	4106	ino_t parentino;
	4107	ufs_lbn_t lbn;
	4108
	4109	/*
	4110	* Move check from original kernel code, possibly not needed any
	4111	* more with the per-mount bioops.
	4112	*/
	4113	if ((vp->v_mount->mnt_flag & MNT_SOFTDEP) == 0)
	4114	return (0);
	4115
	4116	ip = VTOI(vp);
	4117	fs = ip->i_fs;
	4118	ACQUIRE_LOCK(&lk);
	4119	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
	4120	FREE_LOCK(&lk);
	4121	return (0);
	4122	}
	4123	if (LIST_FIRST(&inodedep->id_inowait) != NULL \|\|
	4124	LIST_FIRST(&inodedep->id_bufwait) != NULL \|\|
	4125	TAILQ_FIRST(&inodedep->id_inoupdt) != NULL \|\|
	4126	TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
	4127	FREE_LOCK(&lk);
	4128	panic("softdep_fsync: pending ops");
	4129	}
	4130	for (error = 0, flushparent = 0; ; ) {
	4131	if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
	4132	break;
	4133	if (wk->wk_type != D_DIRADD) {
	4134	FREE_LOCK(&lk);
	4135	panic("softdep_fsync: Unexpected type %s",
	4136	TYPENAME(wk->wk_type));
	4137	}
	4138	dap = WK_DIRADD(wk);
	4139	/*
	4140	* Flush our parent if this directory entry
	4141	* has a MKDIR_PARENT dependency.
	4142	*/
	4143	if (dap->da_state & DIRCHG)
	4144	pagedep = dap->da_previous->dm_pagedep;
	4145	else
	4146	pagedep = dap->da_pagedep;
	4147	mnt = pagedep->pd_mnt;
	4148	parentino = pagedep->pd_ino;
	4149	lbn = pagedep->pd_lbn;
	4150	if ((dap->da_state & (MKDIR_BODY \| COMPLETE)) != COMPLETE) {
	4151	FREE_LOCK(&lk);
	4152	panic("softdep_fsync: dirty");
	4153	}
	4154	flushparent = dap->da_state & MKDIR_PARENT;
	4155	/*
	4156	* If we are being fsync'ed as part of vgone'ing this vnode,
	4157	* then we will not be able to release and recover the
	4158	* vnode below, so we just have to give up on writing its
	4159	* directory entry out. It will eventually be written, just
	4160	* not now, but then the user was not asking to have it
	4161	* written, so we are not breaking any promises.
	4162	*/
	4163	if (vp->v_flag & VRECLAIMED)
	4164	break;
	4165	/*
	4166	* We prevent deadlock by always fetching inodes from the
	4167	* root, moving down the directory tree. Thus, when fetching
	4168	* our parent directory, we must unlock ourselves before
	4169	* requesting the lock on our parent. See the comment in
	4170	* ufs_lookup for details on possible races.
	4171	*/
	4172	FREE_LOCK(&lk);
	4173	vn_unlock(vp);
	4174	error = VFS_VGET(mnt, NULL, parentino, &pvp);
	4175	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
	4176	if (error != 0)
	4177	return (error);
	4178	if (flushparent) {
	4179	if ((error = ffs_update(pvp, 1)) != 0) {
	4180	vput(pvp);
	4181	return (error);
	4182	}
	4183	}
	4184	/*
	4185	* Flush directory page containing the inode's name.
	4186	*/
	4187	error = bread(pvp, lblktodoff(fs, lbn), blksize(fs, VTOI(pvp), lbn), &bp);
	4188	if (error == 0)
	4189	error = bwrite(bp);
	4190	vput(pvp);
	4191	if (error != 0)
	4192	return (error);
	4193	ACQUIRE_LOCK(&lk);
	4194	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
	4195	break;
	4196	}
	4197	FREE_LOCK(&lk);
	4198	return (0);
	4199	}
	4200
	4201	/*
	4202	* Flush all the dirty bitmaps associated with the block device
	4203	* before flushing the rest of the dirty blocks so as to reduce
	4204	* the number of dependencies that will have to be rolled back.
	4205	*/
	4206	static int softdep_fsync_mountdev_bp(struct buf bp, void data);
	4207
	4208	void
	4209	softdep_fsync_mountdev(struct vnode *vp)
	4210	{
	4211	if (!vn_isdisk(vp, NULL))
	4212	panic("softdep_fsync_mountdev: vnode not a disk");
	4213	ACQUIRE_LOCK(&lk);
	4214	lwkt_gettoken(&vp->v_token);
	4215	RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
	4216	softdep_fsync_mountdev_bp, vp);
	4217	lwkt_reltoken(&vp->v_token);
	4218	drain_output(vp, 1);
	4219	FREE_LOCK(&lk);
	4220	}
	4221
	4222	static int
	4223	softdep_fsync_mountdev_bp(struct buf bp, void data)
	4224	{
	4225	struct worklist *wk;
	4226	struct vnode *vp = data;
	4227
	4228	/*
	4229	* If it is already scheduled, skip to the next buffer.
	4230	*/
	4231	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT))
	4232	return(0);
	4233	if (bp->b_vp != vp \|\| (bp->b_flags & B_DELWRI) == 0) {
	4234	BUF_UNLOCK(bp);
	4235	kprintf("softdep_fsync_mountdev_bp: warning, buffer %p ripped out from under vnode %p\n", bp, vp);
	4236	return(0);
	4237	}
	4238	/*
	4239	* We are only interested in bitmaps with outstanding
	4240	* dependencies.
	4241	*/
	4242	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL \|\|
	4243	wk->wk_type != D_BMSAFEMAP) {
	4244	BUF_UNLOCK(bp);
	4245	return(0);
	4246	}
	4247	bremfree(bp);
	4248	FREE_LOCK(&lk);
	4249	(void) bawrite(bp);
	4250	ACQUIRE_LOCK(&lk);
	4251	return(0);
	4252	}
	4253
	4254	/*
	4255	* This routine is called when we are trying to synchronously flush a
	4256	* file. This routine must eliminate any filesystem metadata dependencies
	4257	* so that the syncing routine can succeed by pushing the dirty blocks
	4258	* associated with the file. If any I/O errors occur, they are returned.
	4259	*/
	4260	struct softdep_sync_metadata_info {
	4261	struct vnode *vp;
	4262	int waitfor;
	4263	};
	4264
	4265	static int softdep_sync_metadata_bp(struct buf bp, void data);
	4266
	4267	int
	4268	softdep_sync_metadata(struct vnode vp, struct thread td)
	4269	{
	4270	struct softdep_sync_metadata_info info;
	4271	int error, waitfor;
	4272
	4273	/*
	4274	* Check whether this vnode is involved in a filesystem
	4275	* that is doing soft dependency processing.
	4276	*/
	4277	if (!vn_isdisk(vp, NULL)) {
	4278	if (!DOINGSOFTDEP(vp))
	4279	return (0);
	4280	} else
	4281	if (vp->v_rdev->si_mountpoint == NULL \|\|
	4282	(vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP) == 0)
	4283	return (0);
	4284	/*
	4285	* Ensure that any direct block dependencies have been cleared.
	4286	*/
	4287	ACQUIRE_LOCK(&lk);
	4288	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
	4289	FREE_LOCK(&lk);
	4290	return (error);
	4291	}
	4292	/*
	4293	* For most files, the only metadata dependencies are the
	4294	* cylinder group maps that allocate their inode or blocks.
	4295	* The block allocation dependencies can be found by traversing
	4296	* the dependency lists for any buffers that remain on their
	4297	* dirty buffer list. The inode allocation dependency will
	4298	* be resolved when the inode is updated with MNT_WAIT.
	4299	* This work is done in two passes. The first pass grabs most
	4300	* of the buffers and begins asynchronously writing them. The
	4301	* only way to wait for these asynchronous writes is to sleep
	4302	* on the filesystem vnode which may stay busy for a long time
	4303	* if the filesystem is active. So, instead, we make a second
	4304	* pass over the dependencies blocking on each write. In the
	4305	* usual case we will be blocking against a write that we
	4306	* initiated, so when it is done the dependency will have been
	4307	* resolved. Thus the second pass is expected to end quickly.
	4308	*/
	4309	waitfor = MNT_NOWAIT;
	4310	top:
	4311	/*
	4312	* We must wait for any I/O in progress to finish so that
	4313	* all potential buffers on the dirty list will be visible.
	4314	*/
	4315	drain_output(vp, 1);
	4316
	4317	info.vp = vp;
	4318	info.waitfor = waitfor;
	4319	lwkt_gettoken(&vp->v_token);
	4320	error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
	4321	softdep_sync_metadata_bp, &info);
	4322	lwkt_reltoken(&vp->v_token);
	4323	if (error < 0) {
	4324	FREE_LOCK(&lk);
	4325	return(-error); /* error code */
	4326	}
	4327
	4328	/*
	4329	* The brief unlock is to allow any pent up dependency
	4330	* processing to be done. Then proceed with the second pass.
	4331	*/
	4332	if (waitfor == MNT_NOWAIT) {
	4333	waitfor = MNT_WAIT;
	4334	FREE_LOCK(&lk);
	4335	ACQUIRE_LOCK(&lk);
	4336	goto top;
	4337	}
	4338
	4339	/*
	4340	* If we have managed to get rid of all the dirty buffers,
	4341	* then we are done. For certain directories and block
	4342	* devices, we may need to do further work.
	4343	*
	4344	* We must wait for any I/O in progress to finish so that
	4345	* all potential buffers on the dirty list will be visible.
	4346	*/
	4347	drain_output(vp, 1);
	4348	if (RB_EMPTY(&vp->v_rbdirty_tree)) {
	4349	FREE_LOCK(&lk);
	4350	return (0);
	4351	}
	4352
	4353	FREE_LOCK(&lk);
	4354	/*
	4355	* If we are trying to sync a block device, some of its buffers may
	4356	* contain metadata that cannot be written until the contents of some
	4357	* partially written files have been written to disk. The only easy
	4358	* way to accomplish this is to sync the entire filesystem (luckily
	4359	* this happens rarely).
	4360	*/
	4361	if (vn_isdisk(vp, NULL) &&
	4362	vp->v_rdev &&
	4363	vp->v_rdev->si_mountpoint && !vn_islocked(vp) &&
	4364	(error = VFS_SYNC(vp->v_rdev->si_mountpoint, MNT_WAIT)) != 0)
	4365	return (error);
	4366	return (0);
	4367	}
	4368
	4369	static int
	4370	softdep_sync_metadata_bp(struct buf bp, void data)
	4371	{
	4372	struct softdep_sync_metadata_info *info = data;
	4373	struct pagedep *pagedep;
	4374	struct allocdirect *adp;
	4375	struct allocindir *aip;
	4376	struct worklist *wk;
	4377	struct buf *nbp;
	4378	int error;
	4379	int i;
	4380
	4381	if (getdirtybuf(&bp, MNT_WAIT) == 0) {
	4382	kprintf("softdep_sync_metadata_bp(1): caught buf %p going away\n", bp);
	4383	return (1);
	4384	}
	4385	if (bp->b_vp != info->vp \|\| (bp->b_flags & B_DELWRI) == 0) {
	4386	kprintf("softdep_sync_metadata_bp(2): caught buf %p going away vp %p\n", bp, info->vp);
	4387	BUF_UNLOCK(bp);
	4388	return(1);
	4389	}
	4390
	4391	/*
	4392	* As we hold the buffer locked, none of its dependencies
	4393	* will disappear.
	4394	*/
	4395	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
	4396	switch (wk->wk_type) {
	4397
	4398	case D_ALLOCDIRECT:
	4399	adp = WK_ALLOCDIRECT(wk);
	4400	if (adp->ad_state & DEPCOMPLETE)
	4401	break;
	4402	nbp = adp->ad_buf;
	4403	if (getdirtybuf(&nbp, info->waitfor) == 0)
	4404	break;
	4405	FREE_LOCK(&lk);
	4406	if (info->waitfor == MNT_NOWAIT) {
	4407	bawrite(nbp);
	4408	} else if ((error = bwrite(nbp)) != 0) {
	4409	bawrite(bp);
	4410	ACQUIRE_LOCK(&lk);
	4411	return (-error);
	4412	}
	4413	ACQUIRE_LOCK(&lk);
	4414	break;
	4415
	4416	case D_ALLOCINDIR:
	4417	aip = WK_ALLOCINDIR(wk);
	4418	if (aip->ai_state & DEPCOMPLETE)
	4419	break;
	4420	nbp = aip->ai_buf;
	4421	if (getdirtybuf(&nbp, info->waitfor) == 0)
	4422	break;
	4423	FREE_LOCK(&lk);
	4424	if (info->waitfor == MNT_NOWAIT) {
	4425	bawrite(nbp);
	4426	} else if ((error = bwrite(nbp)) != 0) {
	4427	bawrite(bp);
	4428	ACQUIRE_LOCK(&lk);
	4429	return (-error);
	4430	}
	4431	ACQUIRE_LOCK(&lk);
	4432	break;
	4433
	4434	case D_INDIRDEP:
	4435	restart:
	4436
	4437	LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
	4438	if (aip->ai_state & DEPCOMPLETE)
	4439	continue;
	4440	nbp = aip->ai_buf;
	4441	if (getdirtybuf(&nbp, MNT_WAIT) == 0)
	4442	goto restart;
	4443	FREE_LOCK(&lk);
	4444	if ((error = bwrite(nbp)) != 0) {
	4445	bawrite(bp);
	4446	ACQUIRE_LOCK(&lk);
	4447	return (-error);
	4448	}
	4449	ACQUIRE_LOCK(&lk);
	4450	goto restart;
	4451	}
	4452	break;
	4453
	4454	case D_INODEDEP:
	4455	if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
	4456	WK_INODEDEP(wk)->id_ino)) != 0) {
	4457	FREE_LOCK(&lk);
	4458	bawrite(bp);
	4459	ACQUIRE_LOCK(&lk);
	4460	return (-error);
	4461	}
	4462	break;
	4463
	4464	case D_PAGEDEP:
	4465	/*
	4466	* We are trying to sync a directory that may
	4467	* have dependencies on both its own metadata
	4468	* and/or dependencies on the inodes of any
	4469	* recently allocated files. We walk its diradd
	4470	* lists pushing out the associated inode.
	4471	*/
	4472	pagedep = WK_PAGEDEP(wk);
	4473	for (i = 0; i < DAHASHSZ; i++) {
	4474	if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
	4475	continue;
	4476	if ((error =
	4477	flush_pagedep_deps(info->vp,
	4478	pagedep->pd_mnt,
	4479	&pagedep->pd_diraddhd[i]))) {
	4480	FREE_LOCK(&lk);
	4481	bawrite(bp);
	4482	ACQUIRE_LOCK(&lk);
	4483	return (-error);
	4484	}
	4485	}
	4486	break;
	4487
	4488	case D_MKDIR:
	4489	/*
	4490	* This case should never happen if the vnode has
	4491	* been properly sync'ed. However, if this function
	4492	* is used at a place where the vnode has not yet
	4493	* been sync'ed, this dependency can show up. So,
	4494	* rather than panic, just flush it.
	4495	*/
	4496	nbp = WK_MKDIR(wk)->md_buf;
	4497	if (getdirtybuf(&nbp, info->waitfor) == 0)
	4498	break;
	4499	FREE_LOCK(&lk);
	4500	if (info->waitfor == MNT_NOWAIT) {
	4501	bawrite(nbp);
	4502	} else if ((error = bwrite(nbp)) != 0) {
	4503	bawrite(bp);
	4504	ACQUIRE_LOCK(&lk);
	4505	return (-error);
	4506	}
	4507	ACQUIRE_LOCK(&lk);
	4508	break;
	4509
	4510	case D_BMSAFEMAP:
	4511	/*
	4512	* This case should never happen if the vnode has
	4513	* been properly sync'ed. However, if this function
	4514	* is used at a place where the vnode has not yet
	4515	* been sync'ed, this dependency can show up. So,
	4516	* rather than panic, just flush it.
	4517	*
	4518	* nbp can wind up == bp if a device node for the
	4519	* same filesystem is being fsynced at the same time,
	4520	* leading to a panic if we don't catch the case.
	4521	*/
	4522	nbp = WK_BMSAFEMAP(wk)->sm_buf;
	4523	if (nbp == bp)
	4524	break;
	4525	if (getdirtybuf(&nbp, info->waitfor) == 0)
	4526	break;
	4527	FREE_LOCK(&lk);
	4528	if (info->waitfor == MNT_NOWAIT) {
	4529	bawrite(nbp);
	4530	} else if ((error = bwrite(nbp)) != 0) {
	4531	bawrite(bp);
	4532	ACQUIRE_LOCK(&lk);
	4533	return (-error);
	4534	}
	4535	ACQUIRE_LOCK(&lk);
	4536	break;
	4537
	4538	default:
	4539	FREE_LOCK(&lk);
	4540	panic("softdep_sync_metadata: Unknown type %s",
	4541	TYPENAME(wk->wk_type));
	4542	/* NOTREACHED */
	4543	}
	4544	}
	4545	FREE_LOCK(&lk);
	4546	bawrite(bp);
	4547	ACQUIRE_LOCK(&lk);
	4548	return(0);
	4549	}
	4550
	4551	/*
	4552	* Flush the dependencies associated with an inodedep.
	4553	* Called with splbio blocked.
	4554	*/
	4555	static int
	4556	flush_inodedep_deps(struct fs *fs, ino_t ino)
	4557	{
	4558	struct inodedep *inodedep;
	4559	struct allocdirect *adp;
	4560	int error, waitfor;
	4561	struct buf *bp;
	4562
	4563	/*
	4564	* This work is done in two passes. The first pass grabs most
	4565	* of the buffers and begins asynchronously writing them. The
	4566	* only way to wait for these asynchronous writes is to sleep
	4567	* on the filesystem vnode which may stay busy for a long time
	4568	* if the filesystem is active. So, instead, we make a second
	4569	* pass over the dependencies blocking on each write. In the
	4570	* usual case we will be blocking against a write that we
	4571	* initiated, so when it is done the dependency will have been
	4572	* resolved. Thus the second pass is expected to end quickly.
	4573	* We give a brief window at the top of the loop to allow
	4574	* any pending I/O to complete.
	4575	*/
	4576	for (waitfor = MNT_NOWAIT; ; ) {
	4577	FREE_LOCK(&lk);
	4578	ACQUIRE_LOCK(&lk);
	4579	if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
	4580	return (0);
	4581	TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) {
	4582	if (adp->ad_state & DEPCOMPLETE)
	4583	continue;
	4584	bp = adp->ad_buf;
	4585	if (getdirtybuf(&bp, waitfor) == 0) {
	4586	if (waitfor == MNT_NOWAIT)
	4587	continue;
	4588	break;
	4589	}
	4590	FREE_LOCK(&lk);
	4591	if (waitfor == MNT_NOWAIT) {
	4592	bawrite(bp);
	4593	} else if ((error = bwrite(bp)) != 0) {
	4594	ACQUIRE_LOCK(&lk);
	4595	return (error);
	4596	}
	4597	ACQUIRE_LOCK(&lk);
	4598	break;
	4599	}
	4600	if (adp != NULL)
	4601	continue;
	4602	TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) {
	4603	if (adp->ad_state & DEPCOMPLETE)
	4604	continue;
	4605	bp = adp->ad_buf;
	4606	if (getdirtybuf(&bp, waitfor) == 0) {
	4607	if (waitfor == MNT_NOWAIT)
	4608	continue;
	4609	break;
	4610	}
	4611	FREE_LOCK(&lk);
	4612	if (waitfor == MNT_NOWAIT) {
	4613	bawrite(bp);
	4614	} else if ((error = bwrite(bp)) != 0) {
	4615	ACQUIRE_LOCK(&lk);
	4616	return (error);
	4617	}
	4618	ACQUIRE_LOCK(&lk);
	4619	break;
	4620	}
	4621	if (adp != NULL)
	4622	continue;
	4623	/*
	4624	* If pass2, we are done, otherwise do pass 2.
	4625	*/
	4626	if (waitfor == MNT_WAIT)
	4627	break;
	4628	waitfor = MNT_WAIT;
	4629	}
	4630	/*
	4631	* Try freeing inodedep in case all dependencies have been removed.
	4632	*/
	4633	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
	4634	(void) free_inodedep(inodedep);
	4635	return (0);
	4636	}
	4637
	4638	/*
	4639	* Eliminate a pagedep dependency by flushing out all its diradd dependencies.
	4640	* Called with splbio blocked.
	4641	*/
	4642	static int
	4643	flush_pagedep_deps(struct vnode pvp, struct mount mp,
	4644	struct diraddhd *diraddhdp)
	4645	{
	4646	struct inodedep *inodedep;
	4647	struct ufsmount *ump;
	4648	struct diradd *dap;
	4649	struct vnode *vp;
	4650	int gotit, error = 0;
	4651	struct buf *bp;
	4652	ino_t inum;
	4653
	4654	ump = VFSTOUFS(mp);
	4655	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
	4656	/*
	4657	* Flush ourselves if this directory entry
	4658	* has a MKDIR_PARENT dependency.
	4659	*/
	4660	if (dap->da_state & MKDIR_PARENT) {
	4661	FREE_LOCK(&lk);
	4662	if ((error = ffs_update(pvp, 1)) != 0)
	4663	break;
	4664	ACQUIRE_LOCK(&lk);
	4665	/*
	4666	* If that cleared dependencies, go on to next.
	4667	*/
	4668	if (dap != LIST_FIRST(diraddhdp))
	4669	continue;
	4670	if (dap->da_state & MKDIR_PARENT) {
	4671	FREE_LOCK(&lk);
	4672	panic("flush_pagedep_deps: MKDIR_PARENT");
	4673	}
	4674	}
	4675	/*
	4676	* A newly allocated directory must have its "." and
	4677	* ".." entries written out before its name can be
	4678	* committed in its parent. We do not want or need
	4679	* the full semantics of a synchronous VOP_FSYNC as
	4680	* that may end up here again, once for each directory
	4681	* level in the filesystem. Instead, we push the blocks
	4682	* and wait for them to clear. We have to fsync twice
	4683	* because the first call may choose to defer blocks
	4684	* that still have dependencies, but deferral will
	4685	* happen at most once.
	4686	*/
	4687	inum = dap->da_newinum;
	4688	if (dap->da_state & MKDIR_BODY) {
	4689	FREE_LOCK(&lk);
	4690	if ((error = VFS_VGET(mp, NULL, inum, &vp)) != 0)
	4691	break;
	4692	if ((error=VOP_FSYNC(vp, MNT_NOWAIT, 0)) \|\|
	4693	(error=VOP_FSYNC(vp, MNT_NOWAIT, 0))) {
	4694	vput(vp);
	4695	break;
	4696	}
	4697	drain_output(vp, 0);
	4698	vput(vp);
	4699	ACQUIRE_LOCK(&lk);
	4700	/*
	4701	* If that cleared dependencies, go on to next.
	4702	*/
	4703	if (dap != LIST_FIRST(diraddhdp))
	4704	continue;
	4705	if (dap->da_state & MKDIR_BODY) {
	4706	FREE_LOCK(&lk);
	4707	panic("flush_pagedep_deps: MKDIR_BODY");
	4708	}
	4709	}
	4710	/*
	4711	* Flush the inode on which the directory entry depends.
	4712	* Having accounted for MKDIR_PARENT and MKDIR_BODY above,
	4713	* the only remaining dependency is that the updated inode
	4714	* count must get pushed to disk. The inode has already
	4715	* been pushed into its inode buffer (via VOP_UPDATE) at
	4716	* the time of the reference count change. So we need only
	4717	* locate that buffer, ensure that there will be no rollback
	4718	* caused by a bitmap dependency, then write the inode buffer.
	4719	*/
	4720	if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
	4721	FREE_LOCK(&lk);
	4722	panic("flush_pagedep_deps: lost inode");
	4723	}
	4724	/*
	4725	* If the inode still has bitmap dependencies,
	4726	* push them to disk.
	4727	*/
	4728	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
	4729	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
	4730	FREE_LOCK(&lk);
	4731	if (gotit && (error = bwrite(inodedep->id_buf)) != 0)
	4732	break;
	4733	ACQUIRE_LOCK(&lk);
	4734	if (dap != LIST_FIRST(diraddhdp))
	4735	continue;
	4736	}
	4737	/*
	4738	* If the inode is still sitting in a buffer waiting
	4739	* to be written, push it to disk.
	4740	*/
	4741	FREE_LOCK(&lk);
	4742	if ((error = bread(ump->um_devvp,
	4743	fsbtodoff(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
	4744	(int)ump->um_fs->fs_bsize, &bp)) != 0)
	4745	break;
	4746	if ((error = bwrite(bp)) != 0)
	4747	break;
	4748	ACQUIRE_LOCK(&lk);
	4749	/*
	4750	* If we have failed to get rid of all the dependencies
	4751	* then something is seriously wrong.
	4752	*/
	4753	if (dap == LIST_FIRST(diraddhdp)) {
	4754	FREE_LOCK(&lk);
	4755	panic("flush_pagedep_deps: flush failed");
	4756	}
	4757	}
	4758	if (error)
	4759	ACQUIRE_LOCK(&lk);
	4760	return (error);
	4761	}
	4762
	4763	/*
	4764	* A large burst of file addition or deletion activity can drive the
	4765	* memory load excessively high. First attempt to slow things down
	4766	* using the techniques below. If that fails, this routine requests
	4767	* the offending operations to fall back to running synchronously
	4768	* until the memory load returns to a reasonable level.
	4769	*/
	4770	int
	4771	softdep_slowdown(struct vnode *vp)
	4772	{
	4773	int max_softdeps_hard;
	4774
	4775	max_softdeps_hard = max_softdeps * 11 / 10;
	4776	if (num_dirrem < max_softdeps_hard / 2 &&
	4777	num_inodedep < max_softdeps_hard)
	4778	return (0);
	4779	stat_sync_limit_hit += 1;
	4780	return (1);
	4781	}
	4782
	4783	/*
	4784	* If memory utilization has gotten too high, deliberately slow things
	4785	* down and speed up the I/O processing.
	4786	*/
	4787	static int
	4788	request_cleanup(int resource, int islocked)
	4789	{
	4790	struct thread td = curthread; / XXX */
	4791
	4792	/*
	4793	* We never hold up the filesystem syncer process.
	4794	*/
	4795	if (td == filesys_syncer)
	4796	return (0);
	4797	/*
	4798	* First check to see if the work list has gotten backlogged.
	4799	* If it has, co-opt this process to help clean up two entries.
	4800	* Because this process may hold inodes locked, we cannot
	4801	* handle any remove requests that might block on a locked
	4802	* inode as that could lead to deadlock.
	4803	*/
	4804	if (num_on_worklist > max_softdeps / 10) {
	4805	if (islocked)
	4806	FREE_LOCK(&lk);
	4807	process_worklist_item(NULL, LK_NOWAIT);
	4808	process_worklist_item(NULL, LK_NOWAIT);
	4809	stat_worklist_push += 2;
	4810	if (islocked)
	4811	ACQUIRE_LOCK(&lk);
	4812	return(1);
	4813	}
	4814
	4815	/*
	4816	* If we are resource constrained on inode dependencies, try
	4817	* flushing some dirty inodes. Otherwise, we are constrained
	4818	* by file deletions, so try accelerating flushes of directories
	4819	* with removal dependencies. We would like to do the cleanup
	4820	* here, but we probably hold an inode locked at this point and
	4821	* that might deadlock against one that we try to clean. So,
	4822	* the best that we can do is request the syncer daemon to do
	4823	* the cleanup for us.
	4824	*/
	4825	switch (resource) {
	4826
	4827	case FLUSH_INODES:
	4828	stat_ino_limit_push += 1;
	4829	req_clear_inodedeps += 1;
	4830	stat_countp = &stat_ino_limit_hit;
	4831	break;
	4832
	4833	case FLUSH_REMOVE:
	4834	stat_blk_limit_push += 1;
	4835	req_clear_remove += 1;
	4836	stat_countp = &stat_blk_limit_hit;
	4837	break;
	4838
	4839	default:
	4840	if (islocked)
	4841	FREE_LOCK(&lk);
	4842	panic("request_cleanup: unknown type");
	4843	}
	4844	/*
	4845	* Hopefully the syncer daemon will catch up and awaken us.
	4846	* We wait at most tickdelay before proceeding in any case.
	4847	*/
	4848	if (islocked == 0)
	4849	ACQUIRE_LOCK(&lk);
	4850	proc_waiting += 1;
	4851	if (!callout_active(&handle))
	4852	callout_reset(&handle, tickdelay > 2 ? tickdelay : 2,
	4853	pause_timer, NULL);
	4854	interlocked_sleep(&lk, SLEEP, (caddr_t)&proc_waiting, 0,
	4855	"softupdate", 0);
	4856	proc_waiting -= 1;
	4857	if (islocked == 0)
	4858	FREE_LOCK(&lk);
	4859	return (1);
	4860	}
	4861
	4862	/*
	4863	* Awaken processes pausing in request_cleanup and clear proc_waiting
	4864	* to indicate that there is no longer a timer running.
	4865	*/
	4866	void
	4867	pause_timer(void *arg)
	4868	{
	4869	*stat_countp += 1;
	4870	wakeup_one(&proc_waiting);
	4871	if (proc_waiting > 0)
	4872	callout_reset(&handle, tickdelay > 2 ? tickdelay : 2,
	4873	pause_timer, NULL);
	4874	else
	4875	callout_deactivate(&handle);
	4876	}
	4877
	4878	/*
	4879	* Flush out a directory with at least one removal dependency in an effort to
	4880	* reduce the number of dirrem, freefile, and freeblks dependency structures.
	4881	*/
	4882	static void
	4883	clear_remove(struct thread *td)
	4884	{
	4885	struct pagedep_hashhead *pagedephd;
	4886	struct pagedep *pagedep;
	4887	static int next = 0;
	4888	struct mount *mp;
	4889	struct vnode *vp;
	4890	int error, cnt;
	4891	ino_t ino;
	4892
	4893	ACQUIRE_LOCK(&lk);
	4894	for (cnt = 0; cnt < pagedep_hash; cnt++) {
	4895	pagedephd = &pagedep_hashtbl[next++];
	4896	if (next >= pagedep_hash)
	4897	next = 0;
	4898	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
	4899	if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
	4900	continue;
	4901	mp = pagedep->pd_mnt;
	4902	ino = pagedep->pd_ino;
	4903	FREE_LOCK(&lk);
	4904	if ((error = VFS_VGET(mp, NULL, ino, &vp)) != 0) {
	4905	softdep_error("clear_remove: vget", error);
	4906	return;
	4907	}
	4908	if ((error = VOP_FSYNC(vp, MNT_NOWAIT, 0)))
	4909	softdep_error("clear_remove: fsync", error);
	4910	drain_output(vp, 0);
	4911	vput(vp);
	4912	return;
	4913	}
	4914	}
	4915	FREE_LOCK(&lk);
	4916	}
	4917
	/*
	 * Support for clear_inodedeps(), which clears out a block of dirty
	 * inodes in an effort to reduce the number of inodedep dependency
	 * structures.
	 *
	 * This is the search state handed to the mountlist_scan() callback:
	 * the caller fills in fs and the callback stores the matching mount
	 * in mp.
	 */
	struct clear_inodedeps_info {
		struct fs	*fs;	/* superblock being searched for */
		struct mount	*mp;	/* mount whose um_fs matched fs */
	};
	4926
	4927	static int
	4928	clear_inodedeps_mountlist_callback(struct mount mp, void data)
	4929	{
	4930	struct clear_inodedeps_info *info = data;
	4931
	4932	if ((mp->mnt_flag & MNT_SOFTDEP) && info->fs == VFSTOUFS(mp)->um_fs) {
	4933	info->mp = mp;
	4934	return(-1);
	4935	}
	4936	return(0);
	4937	}
	4938
	4939	static void
	4940	clear_inodedeps(struct thread *td)
	4941	{
	4942	struct clear_inodedeps_info info;
	4943	struct inodedep_hashhead *inodedephd;
	4944	struct inodedep *inodedep;
	4945	static int next = 0;
	4946	struct vnode *vp;
	4947	struct fs *fs;
	4948	int error, cnt;
	4949	ino_t firstino, lastino, ino;
	4950
	4951	ACQUIRE_LOCK(&lk);
	4952	/*
	4953	* Pick a random inode dependency to be cleared.
	4954	* We will then gather up all the inodes in its block
	4955	* that have dependencies and flush them out.
	4956	*/
	4957	for (cnt = 0; cnt < inodedep_hash; cnt++) {
	4958	inodedephd = &inodedep_hashtbl[next++];
	4959	if (next >= inodedep_hash)
	4960	next = 0;
	4961	if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
	4962	break;
	4963	}
	4964	if (inodedep == NULL) {
	4965	FREE_LOCK(&lk);
	4966	return;
	4967	}
	4968	/*
	4969	* Ugly code to find mount point given pointer to superblock.
	4970	*/
	4971	fs = inodedep->id_fs;
	4972	info.mp = NULL;
	4973	info.fs = fs;
	4974	mountlist_scan(clear_inodedeps_mountlist_callback,
	4975	&info, MNTSCAN_FORWARD\|MNTSCAN_NOBUSY);
	4976	/*
	4977	* Find the last inode in the block with dependencies.
	4978	*/
	4979	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
	4980	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
	4981	if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
	4982	break;
	4983	/*
	4984	* Asynchronously push all but the last inode with dependencies.
	4985	* Synchronously push the last inode with dependencies to ensure
	4986	* that the inode block gets written to free up the inodedeps.
	4987	*/
	4988	for (ino = firstino; ino <= lastino; ino++) {
	4989	if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
	4990	continue;
	4991	FREE_LOCK(&lk);
	4992	if ((error = VFS_VGET(info.mp, NULL, ino, &vp)) != 0) {
	4993	softdep_error("clear_inodedeps: vget", error);
	4994	return;
	4995	}
	4996	if (ino == lastino) {
	4997	if ((error = VOP_FSYNC(vp, MNT_WAIT, 0)))
	4998	softdep_error("clear_inodedeps: fsync1", error);
	4999	} else {
	5000	if ((error = VOP_FSYNC(vp, MNT_NOWAIT, 0)))
	5001	softdep_error("clear_inodedeps: fsync2", error);
	5002	drain_output(vp, 0);
	5003	}
	5004	vput(vp);
	5005	ACQUIRE_LOCK(&lk);
	5006	}
	5007	FREE_LOCK(&lk);
	5008	}
	5009
	5010	/*
	5011	* Function to determine if the buffer has outstanding dependencies
	5012	* that will cause a roll-back if the buffer is written. If wantcount
	5013	* is set, return number of dependencies, otherwise just yes or no.
	5014	*/
	5015	static int
	5016	softdep_count_dependencies(struct buf *bp, int wantcount)
	5017	{
	5018	struct worklist *wk;
	5019	struct inodedep *inodedep;
	5020	struct indirdep *indirdep;
	5021	struct allocindir *aip;
	5022	struct pagedep *pagedep;
	5023	struct diradd *dap;
	5024	int i, retval;
	5025
	5026	retval = 0;
	5027	ACQUIRE_LOCK(&lk);
	5028	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
	5029	switch (wk->wk_type) {
	5030
	5031	case D_INODEDEP:
	5032	inodedep = WK_INODEDEP(wk);
	5033	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
	5034	/* bitmap allocation dependency */
	5035	retval += 1;
	5036	if (!wantcount)
	5037	goto out;
	5038	}
	5039	if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
	5040	/* direct block pointer dependency */
	5041	retval += 1;
	5042	if (!wantcount)
	5043	goto out;
	5044	}
	5045	continue;
	5046
	5047	case D_INDIRDEP:
	5048	indirdep = WK_INDIRDEP(wk);
	5049
	5050	LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
	5051	/* indirect block pointer dependency */
	5052	retval += 1;
	5053	if (!wantcount)
	5054	goto out;
	5055	}
	5056	continue;
	5057
	5058	case D_PAGEDEP:
	5059	pagedep = WK_PAGEDEP(wk);
	5060	for (i = 0; i < DAHASHSZ; i++) {
	5061
	5062	LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
	5063	/* directory entry dependency */
	5064	retval += 1;
	5065	if (!wantcount)
	5066	goto out;
	5067	}
	5068	}
	5069	continue;
	5070
	5071	case D_BMSAFEMAP:
	5072	case D_ALLOCDIRECT:
	5073	case D_ALLOCINDIR:
	5074	case D_MKDIR:
	5075	/* never a dependency on these blocks */
	5076	continue;
	5077
	5078	default:
	5079	FREE_LOCK(&lk);
	5080	panic("softdep_check_for_rollback: Unexpected type %s",
	5081	TYPENAME(wk->wk_type));
	5082	/* NOTREACHED */
	5083	}
	5084	}
	5085	out:
	5086	FREE_LOCK(&lk);
	5087	return retval;
	5088	}
	5089
	5090	/*
	5091	* Acquire exclusive access to a buffer.
	5092	* Must be called with splbio blocked.
	5093	* Return 1 if buffer was acquired.
	5094	*/
	5095	static int
	5096	getdirtybuf(struct buf **bpp, int waitfor)
	5097	{
	5098	struct buf *bp;
	5099	int error;
	5100
	5101	for (;;) {
	5102	if ((bp = *bpp) == NULL)
	5103	return (0);
	5104	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT) == 0)
	5105	break;
	5106	if (waitfor != MNT_WAIT)
	5107	return (0);
	5108	error = interlocked_sleep(&lk, LOCKBUF, bp,
	5109	LK_EXCLUSIVE \| LK_SLEEPFAIL, 0, 0);
	5110	if (error != ENOLCK) {
	5111	FREE_LOCK(&lk);
	5112	panic("getdirtybuf: inconsistent lock");
	5113	}
	5114	}
	5115	if ((bp->b_flags & B_DELWRI) == 0) {
	5116	BUF_UNLOCK(bp);
	5117	return (0);
	5118	}
	5119	bremfree(bp);
	5120	return (1);
	5121	}
	5122
	5123	/*
	5124	* Wait for pending output on a vnode to complete.
	5125	* Must be called with vnode locked.
	5126	*/
	5127	static void
	5128	drain_output(struct vnode *vp, int islocked)
	5129	{
	5130
	5131	if (!islocked)
	5132	ACQUIRE_LOCK(&lk);
	5133	while (bio_track_active(&vp->v_track_write)) {
	5134	FREE_LOCK(&lk);
	5135	bio_track_wait(&vp->v_track_write, 0, 0);
	5136	ACQUIRE_LOCK(&lk);
	5137	}
	5138	if (!islocked)
	5139	FREE_LOCK(&lk);
	5140	}
	5141
	5142	/*
	5143	* Called whenever a buffer that is being invalidated or reallocated
	5144	* contains dependencies. This should only happen if an I/O error has
	5145	* occurred. The routine is called with the buffer locked.
	5146	*/
	5147	static void
	5148	softdep_deallocate_dependencies(struct buf *bp)
	5149	{
	5150	if ((bp->b_flags & B_ERROR) == 0)
	5151	panic("softdep_deallocate_dependencies: dangling deps");
	5152	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntfromname, bp->b_error);
	5153	panic("softdep_deallocate_dependencies: unrecovered I/O error");
	5154	}
	5155
	/*
	 * Report an error encountered while accessing the filesystem.
	 *
	 * func is the text printed in front of the message (callers pass either
	 * a "function: operation" tag or a mount name) and error is the error
	 * number.  The message is only printed; no recovery is attempted.
	 */
	void
	softdep_error(char *func, int error)
	{
		/* XXX should do something better! */
		kprintf("%s: got error %d while accessing filesystem\n",
		    func, error);
	}