gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1994,1997 John S. Dyson
	3	* All rights reserved.
	4	*
	5	* Redistribution and use in source and binary forms, with or without
	6	* modification, are permitted provided that the following conditions
	7	* are met:
	8	* 1. Redistributions of source code must retain the above copyright
	9	* notice immediately at the beginning of the file, without modification,
	10	* this list of conditions, and the following disclaimer.
	11	* 2. Absolutely no warranty of function or purpose is made by the author
	12	* John S. Dyson.
	13	*
	14	* $FreeBSD: src/sys/kern/vfs_bio.c,v 1.242.2.20 2003/05/28 18:38:10 alc Exp $
	15	* $DragonFly: src/sys/kern/vfs_bio.c,v 1.115 2008/08/13 11:02:31 swildner Exp $
	16	*/
	17
	18	/*
	19	* this file contains a new buffer I/O scheme implementing a coherent
	20	* VM object and buffer cache scheme. Pains have been taken to make
	21	* sure that the performance degradation associated with schemes such
	22	* as this is not realized.
	23	*
	24	* Author: John S. Dyson
	25	* Significant help during the development and debugging phases
	26	* had been provided by David Greenman, also of the FreeBSD core team.
	27	*
	28	* see man buf(9) for more info.
	29	*/
	30
	31	#include <sys/param.h>
	32	#include <sys/systm.h>
	33	#include <sys/buf.h>
	34	#include <sys/conf.h>
	35	#include <sys/eventhandler.h>
	36	#include <sys/lock.h>
	37	#include <sys/malloc.h>
	38	#include <sys/mount.h>
	39	#include <sys/kernel.h>
	40	#include <sys/kthread.h>
	41	#include <sys/proc.h>
	42	#include <sys/reboot.h>
	43	#include <sys/resourcevar.h>
	44	#include <sys/sysctl.h>
	45	#include <sys/vmmeter.h>
	46	#include <sys/vnode.h>
	47	#include <sys/dsched.h>
	48	#include <sys/proc.h>
	49	#include <vm/vm.h>
	50	#include <vm/vm_param.h>
	51	#include <vm/vm_kern.h>
	52	#include <vm/vm_pageout.h>
	53	#include <vm/vm_page.h>
	54	#include <vm/vm_object.h>
	55	#include <vm/vm_extern.h>
	56	#include <vm/vm_map.h>
	57	#include <vm/vm_pager.h>
	58	#include <vm/swap_pager.h>
	59
	60	#include <sys/buf2.h>
	61	#include <sys/thread2.h>
	62	#include <sys/spinlock2.h>
	63	#include <sys/mplock2.h>
	64	#include <vm/vm_page2.h>
	65
	66	#include "opt_ddb.h"
	67	#ifdef DDB
	68	#include <ddb/ddb.h>
	69	#endif
	70
	/*
	 * Buffer queue identifiers.  A buffer header records the queue it is
	 * currently on in b_qindex; BUFFER_QUEUES is not a queue but the number
	 * of queues, and sizes the bufqueues[] array.
	 */
	typedef enum bufq_type {
		BQUEUE_NONE,		/* not on any queue */
		BQUEUE_LOCKED,		/* locked buffers */
		BQUEUE_CLEAN,		/* non-B_DELWRI buffers */
		BQUEUE_DIRTY,		/* B_DELWRI buffers */
		BQUEUE_DIRTY_HW,	/* B_DELWRI buffers - heavy weight */
		BQUEUE_EMPTYKVA,	/* empty buffer headers with KVA assignment */
		BQUEUE_EMPTY,		/* empty buffer headers */

		BUFFER_QUEUES		/* number of buffer queues */
	} bufq_type_t;
	87
	88	#define BD_WAKE_SIZE 16384
	89	#define BD_WAKE_MASK (BD_WAKE_SIZE - 1)
	90
	91	TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES];
	92	struct spinlock bufspin = SPINLOCK_INITIALIZER(&bufspin);
	93
	94	static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
	95
	96	struct buf buf; / buffer header pool */
	97
	98	static void vfs_clean_pages(struct buf *bp);
	99	static void vfs_clean_one_page(struct buf *bp, int pageno, vm_page_t m);
	100	static void vfs_dirty_one_page(struct buf *bp, int pageno, vm_page_t m);
	101	static void vfs_vmio_release(struct buf *bp);
	102	static int flushbufqueues(bufq_type_t q);
	103	static vm_page_t bio_page_alloc(vm_object_t obj, vm_pindex_t pg, int deficit);
	104
	105	static void bd_signal(int totalspace);
	106	static void buf_daemon(void);
	107	static void buf_daemon_hw(void);
	108
	109	/*
	110	* bogus page -- for I/O to/from partially complete buffers
	111	* this is a temporary solution to the problem, but it is not
	112	* really that bad. it would be better to split the buffer
	113	* for input in the case of buffers partially already in memory,
	114	* but the code is intricate enough already.
	115	*/
	116	vm_page_t bogus_page;
	117
	118	/*
	119	* These are all static, but make the ones we export globals so we do
	120	* not need to use compiler magic.
	121	*/
	122	int bufspace, maxbufspace,
	123	bufmallocspace, maxbufmallocspace, lobufspace, hibufspace;
	124	static int bufreusecnt, bufdefragcnt, buffreekvacnt;
	125	static int lorunningspace, hirunningspace, runningbufreq;
	126	int dirtybufspace, dirtybufspacehw, lodirtybufspace, hidirtybufspace;
	127	int dirtybufcount, dirtybufcounthw;
	128	int runningbufspace, runningbufcount;
	129	static int getnewbufcalls;
	130	static int getnewbufrestarts;
	131	static int recoverbufcalls;
	132	static int needsbuffer; /* locked by needsbuffer_spin */
	133	static int bd_request; /* locked by needsbuffer_spin */
	134	static int bd_request_hw; /* locked by needsbuffer_spin */
	135	static u_int bd_wake_ary[BD_WAKE_SIZE];
	136	static u_int bd_wake_index;
	137	static u_int vm_cycle_point = 40; /* 23-36 will migrate more act->inact */
	138	static struct spinlock needsbuffer_spin;
	139	static int debug_commit;
	140
	141	static struct thread *bufdaemon_td;
	142	static struct thread *bufdaemonhw_td;
	143
	144
	145	/*
	146	* Sysctls for operational control of the buffer cache.
	147	*/
	148	SYSCTL_INT(_vfs, OID_AUTO, lodirtybufspace, CTLFLAG_RW, &lodirtybufspace, 0,
	149	"Number of dirty buffers to flush before bufdaemon becomes inactive");
	150	SYSCTL_INT(_vfs, OID_AUTO, hidirtybufspace, CTLFLAG_RW, &hidirtybufspace, 0,
	151	"High watermark used to trigger explicit flushing of dirty buffers");
	152	SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
	153	"Minimum amount of buffer space required for active I/O");
	154	SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
	155	"Maximum amount of buffer space to usable for active I/O");
	156	SYSCTL_UINT(_vfs, OID_AUTO, vm_cycle_point, CTLFLAG_RW, &vm_cycle_point, 0,
	157	"Recycle pages to active or inactive queue transition pt 0-64");
	158	/*
	159	* Sysctls determining current state of the buffer cache.
	160	*/
	161	SYSCTL_INT(_vfs, OID_AUTO, nbuf, CTLFLAG_RD, &nbuf, 0,
	162	"Total number of buffers in buffer cache");
	163	SYSCTL_INT(_vfs, OID_AUTO, dirtybufspace, CTLFLAG_RD, &dirtybufspace, 0,
	164	"Pending bytes of dirty buffers (all)");
	165	SYSCTL_INT(_vfs, OID_AUTO, dirtybufspacehw, CTLFLAG_RD, &dirtybufspacehw, 0,
	166	"Pending bytes of dirty buffers (heavy weight)");
	167	SYSCTL_INT(_vfs, OID_AUTO, dirtybufcount, CTLFLAG_RD, &dirtybufcount, 0,
	168	"Pending number of dirty buffers");
	169	SYSCTL_INT(_vfs, OID_AUTO, dirtybufcounthw, CTLFLAG_RD, &dirtybufcounthw, 0,
	170	"Pending number of dirty buffers (heavy weight)");
	171	SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
	172	"I/O bytes currently in progress due to asynchronous writes");
	173	SYSCTL_INT(_vfs, OID_AUTO, runningbufcount, CTLFLAG_RD, &runningbufcount, 0,
	174	"I/O buffers currently in progress due to asynchronous writes");
	175	SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
	176	"Hard limit on maximum amount of memory usable for buffer space");
	177	SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
	178	"Soft limit on maximum amount of memory usable for buffer space");
	179	SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
	180	"Minimum amount of memory to reserve for system buffer space");
	181	SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
	182	"Amount of memory available for buffers");
	183	SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RD, &maxbufmallocspace,
	184	0, "Maximum amount of memory reserved for buffers using malloc");
	185	SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
	186	"Amount of memory left for buffers using malloc-scheme");
	187	SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, &getnewbufcalls, 0,
	188	"New buffer header acquisition requests");
	189	SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD, &getnewbufrestarts,
	190	0, "New buffer header acquisition restarts");
	191	SYSCTL_INT(_vfs, OID_AUTO, recoverbufcalls, CTLFLAG_RD, &recoverbufcalls, 0,
	192	"Recover VM space in an emergency");
	193	SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RD, &bufdefragcnt, 0,
	194	"Buffer acquisition restarts due to fragmented buffer map");
	195	SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RD, &buffreekvacnt, 0,
	196	"Amount of time KVA space was deallocated in an arbitrary buffer");
	197	SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RD, &bufreusecnt, 0,
	198	"Amount of time buffer re-use operations were successful");
	199	SYSCTL_INT(_vfs, OID_AUTO, debug_commit, CTLFLAG_RW, &debug_commit, 0, "");
	200	SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, 0, sizeof(struct buf),
	201	"sizeof(struct buf)");
	202
	203	char *buf_wmesg = BUF_WMESG;
	204
	205	#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
	206	#define VFS_BIO_NEED_UNUSED02 0x02
	207	#define VFS_BIO_NEED_UNUSED04 0x04
	208	#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
	209
	210	/*
	211	* bufspacewakeup:
	212	*
	213	* Called when buffer space is potentially available for recovery.
	214	* getnewbuf() will block on this flag when it is unable to free
	215	* sufficient buffer space. Buffer space becomes recoverable when
	216	* bp's get placed back in the queues.
	217	*/
	218
	219	static __inline void
	220	bufspacewakeup(void)
	221	{
	222	/*
	223	* If someone is waiting for BUF space, wake them up. Even
	224	* though we haven't freed the kva space yet, the waiting
	225	* process will be able to now.
	226	*/
	227	if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
	228	spin_lock_wr(&needsbuffer_spin);
	229	needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
	230	spin_unlock_wr(&needsbuffer_spin);
	231	wakeup(&needsbuffer);
	232	}
	233	}
	234
	235	/*
	236	* runningbufwakeup:
	237	*
	238	* Accounting for I/O in progress.
	239	*
	240	*/
	241	static __inline void
	242	runningbufwakeup(struct buf *bp)
	243	{
	244	int totalspace;
	245	int limit;
	246
	247	if ((totalspace = bp->b_runningbufspace) != 0) {
	248	atomic_subtract_int(&runningbufspace, totalspace);
	249	atomic_subtract_int(&runningbufcount, 1);
	250	bp->b_runningbufspace = 0;
	251
	252	/*
	253	* see waitrunningbufspace() for limit test.
	254	*/
	255	limit = hirunningspace * 2 / 3;
	256	if (runningbufreq && runningbufspace <= limit) {
	257	runningbufreq = 0;
	258	wakeup(&runningbufreq);
	259	}
	260	bd_signal(totalspace);
	261	}
	262	}
	263
	264	/*
	265	* bufcountwakeup:
	266	*
	267	* Called when a buffer has been added to one of the free queues to
	268	* account for the buffer and to wakeup anyone waiting for free buffers.
	269	* This typically occurs when large amounts of metadata are being handled
	270	* by the buffer cache ( else buffer space runs out first, usually ).
	271	*
	272	* MPSAFE
	273	*/
	274	static __inline void
	275	bufcountwakeup(void)
	276	{
	277	if (needsbuffer) {
	278	spin_lock_wr(&needsbuffer_spin);
	279	needsbuffer &= ~VFS_BIO_NEED_ANY;
	280	spin_unlock_wr(&needsbuffer_spin);
	281	wakeup(&needsbuffer);
	282	}
	283	}
	284
	285	/*
	286	* waitrunningbufspace()
	287	*
	288	* Wait for the amount of running I/O to drop to hirunningspace * 2 / 3.
	289	* This is the point where write bursting stops so we don't want to wait
	290	* for the running amount to drop below it (at least if we still want bioq
	291	* to burst writes).
	292	*
	293	* The caller may be using this function to block in a tight loop, we
	294	* must block while runningbufspace is greater then or equal to
	295	* hirunningspace * 2 / 3.
	296	*
	297	* And even with that it may not be enough, due to the presence of
	298	* B_LOCKED dirty buffers, so also wait for at least one running buffer
	299	* to complete.
	300	*/
	301	static __inline void
	302	waitrunningbufspace(void)
	303	{
	304	int limit = hirunningspace * 2 / 3;
	305
	306	crit_enter();
	307	if (runningbufspace > limit) {
	308	while (runningbufspace > limit) {
	309	++runningbufreq;
	310	tsleep(&runningbufreq, 0, "wdrn1", 0);
	311	}
	312	} else if (runningbufspace) {
	313	++runningbufreq;
	314	tsleep(&runningbufreq, 0, "wdrn2", 1);
	315	}
	316	crit_exit();
	317	}
	318
	319	/*
	320	* buf_dirty_count_severe:
	321	*
	322	* Return true if we have too many dirty buffers.
	323	*/
	324	int
	325	buf_dirty_count_severe(void)
	326	{
	327	return (runningbufspace + dirtybufspace >= hidirtybufspace \|\|
	328	dirtybufcount >= nbuf / 2);
	329	}
	330
	331	/*
	332	* Return true if the amount of running I/O is severe and BIOQ should
	333	* start bursting.
	334	*/
	335	int
	336	buf_runningbufspace_severe(void)
	337	{
	338	return (runningbufspace >= hirunningspace * 2 / 3);
	339	}
	340
	341	/*
	342	* vfs_buf_test_cache:
	343	*
	344	* Called when a buffer is extended. This function clears the B_CACHE
	345	* bit if the newly extended portion of the buffer does not contain
	346	* valid data.
	347	*
	348	* NOTE! Dirty VM pages are not processed into dirty (B_DELWRI) buffer
	349	* cache buffers. The VM pages remain dirty, as someone had mmap()'d
	350	* them while a clean buffer was present.
	351	*/
	352	static __inline__
	353	void
	354	vfs_buf_test_cache(struct buf *bp,
	355	vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
	356	vm_page_t m)
	357	{
	358	if (bp->b_flags & B_CACHE) {
	359	int base = (foff + off) & PAGE_MASK;
	360	if (vm_page_is_valid(m, base, size) == 0)
	361	bp->b_flags &= ~B_CACHE;
	362	}
	363	}
	364
	365	/*
	366	* bd_speedup()
	367	*
	368	* Spank the buf_daemon[_hw] if the total dirty buffer space exceeds the
	369	* low water mark.
	370	*
	371	* MPSAFE
	372	*/
	373	static __inline__
	374	void
	375	bd_speedup(void)
	376	{
	377	if (dirtybufspace < lodirtybufspace && dirtybufcount < nbuf / 2)
	378	return;
	379
	380	if (bd_request == 0 &&
	381	(dirtybufspace - dirtybufspacehw > lodirtybufspace / 2 \|\|
	382	dirtybufcount - dirtybufcounthw >= nbuf / 2)) {
	383	spin_lock_wr(&needsbuffer_spin);
	384	bd_request = 1;
	385	spin_unlock_wr(&needsbuffer_spin);
	386	wakeup(&bd_request);
	387	}
	388	if (bd_request_hw == 0 &&
	389	(dirtybufspacehw > lodirtybufspace / 2 \|\|
	390	dirtybufcounthw >= nbuf / 2)) {
	391	spin_lock_wr(&needsbuffer_spin);
	392	bd_request_hw = 1;
	393	spin_unlock_wr(&needsbuffer_spin);
	394	wakeup(&bd_request_hw);
	395	}
	396	}
	397
	398	/*
	399	* bd_heatup()
	400	*
	401	* Get the buf_daemon heated up when the number of running and dirty
	402	* buffers exceeds the mid-point.
	403	*
	404	* Return the total number of dirty bytes past the second mid point
	405	* as a measure of how much excess dirty data there is in the system.
	406	*
	407	* MPSAFE
	408	*/
	409	int
	410	bd_heatup(void)
	411	{
	412	int mid1;
	413	int mid2;
	414	int totalspace;
	415
	416	mid1 = lodirtybufspace + (hidirtybufspace - lodirtybufspace) / 2;
	417
	418	totalspace = runningbufspace + dirtybufspace;
	419	if (totalspace >= mid1 \|\| dirtybufcount >= nbuf / 2) {
	420	bd_speedup();
	421	mid2 = mid1 + (hidirtybufspace - mid1) / 2;
	422	if (totalspace >= mid2)
	423	return(totalspace - mid2);
	424	}
	425	return(0);
	426	}
	427
	428	/*
	429	* bd_wait()
	430	*
	431	* Wait for the buffer cache to flush (totalspace) bytes worth of
	432	* buffers, then return.
	433	*
	434	* Regardless this function blocks while the number of dirty buffers
	435	* exceeds hidirtybufspace.
	436	*
	437	* MPSAFE
	438	*/
	439	void
	440	bd_wait(int totalspace)
	441	{
	442	u_int i;
	443	int count;
	444
	445	if (curthread == bufdaemonhw_td \|\| curthread == bufdaemon_td)
	446	return;
	447
	448	while (totalspace > 0) {
	449	bd_heatup();
	450	if (totalspace > runningbufspace + dirtybufspace)
	451	totalspace = runningbufspace + dirtybufspace;
	452	count = totalspace / BKVASIZE;
	453	if (count >= BD_WAKE_SIZE)
	454	count = BD_WAKE_SIZE - 1;
	455
	456	spin_lock_wr(&needsbuffer_spin);
	457	i = (bd_wake_index + count) & BD_WAKE_MASK;
	458	++bd_wake_ary[i];
	459	tsleep_interlock(&bd_wake_ary[i], 0);
	460	spin_unlock_wr(&needsbuffer_spin);
	461	tsleep(&bd_wake_ary[i], PINTERLOCKED, "flstik", hz);
	462
	463	totalspace = runningbufspace + dirtybufspace - hidirtybufspace;
	464	}
	465	}
	466
	467	/*
	468	* bd_signal()
	469	*
	470	* This function is called whenever runningbufspace or dirtybufspace
	471	* is reduced. Track threads waiting for run+dirty buffer I/O
	472	* complete.
	473	*
	474	* MPSAFE
	475	*/
	476	static void
	477	bd_signal(int totalspace)
	478	{
	479	u_int i;
	480
	481	if (totalspace > 0) {
	482	if (totalspace > BKVASIZE * BD_WAKE_SIZE)
	483	totalspace = BKVASIZE * BD_WAKE_SIZE;
	484	spin_lock_wr(&needsbuffer_spin);
	485	while (totalspace > 0) {
	486	i = bd_wake_index++;
	487	i &= BD_WAKE_MASK;
	488	if (bd_wake_ary[i]) {
	489	bd_wake_ary[i] = 0;
	490	spin_unlock_wr(&needsbuffer_spin);
	491	wakeup(&bd_wake_ary[i]);
	492	spin_lock_wr(&needsbuffer_spin);
	493	}
	494	totalspace -= BKVASIZE;
	495	}
	496	spin_unlock_wr(&needsbuffer_spin);
	497	}
	498	}
	499
	500	/*
	501	* BIO tracking support routines.
	502	*
	503	* Release a ref on a bio_track. Wakeup requests are atomically released
	504	* along with the last reference so bk_active will never wind up set to
	505	* only 0x80000000.
	506	*
	507	* MPSAFE
	508	*/
	509	static
	510	void
	511	bio_track_rel(struct bio_track *track)
	512	{
	513	int active;
	514	int desired;
	515
	516	/*
	517	* Shortcut
	518	*/
	519	active = track->bk_active;
	520	if (active == 1 && atomic_cmpset_int(&track->bk_active, 1, 0))
	521	return;
	522
	523	/*
	524	* Full-on. Note that the wait flag is only atomically released on
	525	* the 1->0 count transition.
	526	*
	527	* We check for a negative count transition using bit 30 since bit 31
	528	* has a different meaning.
	529	*/
	530	for (;;) {
	531	desired = (active & 0x7FFFFFFF) - 1;
	532	if (desired)
	533	desired \|= active & 0x80000000;
	534	if (atomic_cmpset_int(&track->bk_active, active, desired)) {
	535	if (desired & 0x40000000)
	536	panic("bio_track_rel: bad count: %p\n", track);
	537	if (active & 0x80000000)
	538	wakeup(track);
	539	break;
	540	}
	541	active = track->bk_active;
	542	}
	543	}
	544
	545	/*
	546	* Wait for the tracking count to reach 0.
	547	*
	548	* Use atomic ops such that the wait flag is only set atomically when
	549	* bk_active is non-zero.
	550	*
	551	* MPSAFE
	552	*/
	553	int
	554	bio_track_wait(struct bio_track *track, int slp_flags, int slp_timo)
	555	{
	556	int active;
	557	int desired;
	558	int error;
	559
	560	/*
	561	* Shortcut
	562	*/
	563	if (track->bk_active == 0)
	564	return(0);
	565
	566	/*
	567	* Full-on. Note that the wait flag may only be atomically set if
	568	* the active count is non-zero.
	569	*/
	570	error = 0;
	571	while ((active = track->bk_active) != 0) {
	572	desired = active \| 0x80000000;
	573	tsleep_interlock(track, slp_flags);
	574	if (active == desired \|\|
	575	atomic_cmpset_int(&track->bk_active, active, desired)) {
	576	error = tsleep(track, slp_flags \| PINTERLOCKED,
	577	"iowait", slp_timo);
	578	if (error)
	579	break;
	580	}
	581	}
	582	return (error);
	583	}
	584
	585	/*
	586	* bufinit:
	587	*
	588	* Load time initialisation of the buffer cache, called from machine
	589	* dependant initialization code.
	590	*/
	591	void
	592	bufinit(void)
	593	{
	594	struct buf *bp;
	595	vm_offset_t bogus_offset;
	596	int i;
	597
	598	spin_init(&needsbuffer_spin);
	599
	600	/* next, make a null set of free lists */
	601	for (i = 0; i < BUFFER_QUEUES; i++)
	602	TAILQ_INIT(&bufqueues[i]);
	603
	604	/* finally, initialize each buffer header and stick on empty q */
	605	for (i = 0; i < nbuf; i++) {
	606	bp = &buf[i];
	607	bzero(bp, sizeof *bp);
	608	bp->b_flags = B_INVAL; /* we're just an empty header */
	609	bp->b_cmd = BUF_CMD_DONE;
	610	bp->b_qindex = BQUEUE_EMPTY;
	611	initbufbio(bp);
	612	xio_init(&bp->b_xio);
	613	buf_dep_init(bp);
	614	BUF_LOCKINIT(bp);
	615	TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_EMPTY], bp, b_freelist);
	616	}
	617
	618	/*
	619	* maxbufspace is the absolute maximum amount of buffer space we are
	620	* allowed to reserve in KVM and in real terms. The absolute maximum
	621	* is nominally used by buf_daemon. hibufspace is the nominal maximum
	622	* used by most other processes. The differential is required to
	623	* ensure that buf_daemon is able to run when other processes might
	624	* be blocked waiting for buffer space.
	625	*
	626	* maxbufspace is based on BKVASIZE. Allocating buffers larger then
	627	* this may result in KVM fragmentation which is not handled optimally
	628	* by the system.
	629	*/
	630	maxbufspace = nbuf * BKVASIZE;
	631	hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
	632	lobufspace = hibufspace - MAXBSIZE;
	633
	634	lorunningspace = 512 * 1024;
	635	/* hirunningspace -- see below */
	636
	637	/*
	638	* Limit the amount of malloc memory since it is wired permanently
	639	* into the kernel space. Even though this is accounted for in
	640	* the buffer allocation, we don't want the malloced region to grow
	641	* uncontrolled. The malloc scheme improves memory utilization
	642	* significantly on average (small) directories.
	643	*/
	644	maxbufmallocspace = hibufspace / 20;
	645
	646	/*
	647	* Reduce the chance of a deadlock occuring by limiting the number
	648	* of delayed-write dirty buffers we allow to stack up.
	649	*
	650	* We don't want too much actually queued to the device at once
	651	* (XXX this needs to be per-mount!), because the buffers will
	652	* wind up locked for a very long period of time while the I/O
	653	* drains.
	654	*/
	655	hidirtybufspace = hibufspace / 2; /* dirty + running */
	656	hirunningspace = hibufspace / 16; /* locked & queued to device */
	657	if (hirunningspace < 1024 * 1024)
	658	hirunningspace = 1024 * 1024;
	659
	660	dirtybufspace = 0;
	661	dirtybufspacehw = 0;
	662
	663	lodirtybufspace = hidirtybufspace / 2;
	664
	665	/*
	666	* Maximum number of async ops initiated per buf_daemon loop. This is
	667	* somewhat of a hack at the moment, we really need to limit ourselves
	668	* based on the number of bytes of I/O in-transit that were initiated
	669	* from buf_daemon.
	670	*/
	671
	672	bogus_offset = kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
	673	bogus_page = vm_page_alloc(&kernel_object,
	674	(bogus_offset >> PAGE_SHIFT),
	675	VM_ALLOC_NORMAL);
	676	vmstats.v_wire_count++;
	677
	678	}
	679
	680	/*
	681	* Initialize the embedded bio structures
	682	*/
	683	void
	684	initbufbio(struct buf *bp)
	685	{
	686	bp->b_bio1.bio_buf = bp;
	687	bp->b_bio1.bio_prev = NULL;
	688	bp->b_bio1.bio_offset = NOOFFSET;
	689	bp->b_bio1.bio_next = &bp->b_bio2;
	690	bp->b_bio1.bio_done = NULL;
	691	bp->b_bio1.bio_flags = 0;
	692
	693	bp->b_bio2.bio_buf = bp;
	694	bp->b_bio2.bio_prev = &bp->b_bio1;
	695	bp->b_bio2.bio_offset = NOOFFSET;
	696	bp->b_bio2.bio_next = NULL;
	697	bp->b_bio2.bio_done = NULL;
	698	bp->b_bio2.bio_flags = 0;
	699	}
	700
	701	/*
	702	* Reinitialize the embedded bio structures as well as any additional
	703	* translation cache layers.
	704	*/
	705	void
	706	reinitbufbio(struct buf *bp)
	707	{
	708	struct bio *bio;
	709
	710	for (bio = &bp->b_bio1; bio; bio = bio->bio_next) {
	711	bio->bio_done = NULL;
	712	bio->bio_offset = NOOFFSET;
	713	}
	714	}
	715
	716	/*
	717	* Push another BIO layer onto an existing BIO and return it. The new
	718	* BIO layer may already exist, holding cached translation data.
	719	*/
	720	struct bio *
	721	push_bio(struct bio *bio)
	722	{
	723	struct bio *nbio;
	724
	725	if ((nbio = bio->bio_next) == NULL) {
	726	int index = bio - &bio->bio_buf->b_bio_array[0];
	727	if (index >= NBUF_BIO - 1) {
	728	panic("push_bio: too many layers bp %p\n",
	729	bio->bio_buf);
	730	}
	731	nbio = &bio->bio_buf->b_bio_array[index + 1];
	732	bio->bio_next = nbio;
	733	nbio->bio_prev = bio;
	734	nbio->bio_buf = bio->bio_buf;
	735	nbio->bio_offset = NOOFFSET;
	736	nbio->bio_done = NULL;
	737	nbio->bio_next = NULL;
	738	}
	739	KKASSERT(nbio->bio_done == NULL);
	740	return(nbio);
	741	}
	742
	743	/*
	744	* Pop a BIO translation layer, returning the previous layer. The
	745	* must have been previously pushed.
	746	*/
	747	struct bio *
	748	pop_bio(struct bio *bio)
	749	{
	750	return(bio->bio_prev);
	751	}
	752
	753	void
	754	clearbiocache(struct bio *bio)
	755	{
	756	while (bio) {
	757	bio->bio_offset = NOOFFSET;
	758	bio = bio->bio_next;
	759	}
	760	}
	761
	762	/*
	763	* bfreekva:
	764	*
	765	* Free the KVA allocation for buffer 'bp'.
	766	*
	767	* Must be called from a critical section as this is the only locking for
	768	* buffer_map.
	769	*
	770	* Since this call frees up buffer space, we call bufspacewakeup().
	771	*
	772	* MPALMOSTSAFE
	773	*/
	774	static void
	775	bfreekva(struct buf *bp)
	776	{
	777	int count;
	778
	779	if (bp->b_kvasize) {
	780	get_mplock();
	781	++buffreekvacnt;
	782	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	783	vm_map_lock(&buffer_map);
	784	bufspace -= bp->b_kvasize;
	785	vm_map_delete(&buffer_map,
	786	(vm_offset_t) bp->b_kvabase,
	787	(vm_offset_t) bp->b_kvabase + bp->b_kvasize,
	788	&count
	789	);
	790	vm_map_unlock(&buffer_map);
	791	vm_map_entry_release(count);
	792	bp->b_kvasize = 0;
	793	bufspacewakeup();
	794	rel_mplock();
	795	}
	796	}
	797
	798	/*
	799	* bremfree:
	800	*
	801	* Remove the buffer from the appropriate free list.
	802	*/
	803	static __inline void
	804	_bremfree(struct buf *bp)
	805	{
	806	if (bp->b_qindex != BQUEUE_NONE) {
	807	KASSERT(BUF_REFCNTNB(bp) == 1,
	808	("bremfree: bp %p not locked",bp));
	809	TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
	810	bp->b_qindex = BQUEUE_NONE;
	811	} else {
	812	if (BUF_REFCNTNB(bp) <= 1)
	813	panic("bremfree: removing a buffer not on a queue");
	814	}
	815	}
	816
	817	void
	818	bremfree(struct buf *bp)
	819	{
	820	spin_lock_wr(&bufspin);
	821	_bremfree(bp);
	822	spin_unlock_wr(&bufspin);
	823	}
	824
	/*
	 * Variant of bremfree() that does not take bufspin itself.
	 * NOTE(review): presumably the caller already holds bufspin -- confirm
	 * against the call sites.
	 */
	static void
	bremfree_locked(struct buf *bp)
	{
		_bremfree(bp);		/* no locking done here */
	}
	830
	831	/*
	832	* bread:
	833	*
	834	* Get a buffer with the specified data. Look in the cache first. We
	835	* must clear B_ERROR and B_INVAL prior to initiating I/O. If B_CACHE
	836	* is set, the buffer is valid and we do not have to do anything ( see
	837	* getblk() ).
	838	*
	839	* MPALMOSTSAFE
	840	*/
	841	int
	842	bread(struct vnode vp, off_t loffset, int size, struct buf *bpp)
	843	{
	844	struct buf *bp;
	845
	846	bp = getblk(vp, loffset, size, 0, 0);
	847	*bpp = bp;
	848
	849	/* if not found in cache, do some I/O */
	850	if ((bp->b_flags & B_CACHE) == 0) {
	851	get_mplock();
	852	bp->b_flags &= ~(B_ERROR \| B_EINTR \| B_INVAL);
	853	bp->b_cmd = BUF_CMD_READ;
	854	bp->b_bio1.bio_done = biodone_sync;
	855	bp->b_bio1.bio_flags \|= BIO_SYNC;
	856	vfs_busy_pages(vp, bp);
	857	vn_strategy(vp, &bp->b_bio1);
	858	rel_mplock();
	859	return (biowait(&bp->b_bio1, "biord"));
	860	}
	861	return (0);
	862	}
	863
	864	/*
	865	* breadn:
	866	*
	867	* Operates like bread, but also starts asynchronous I/O on
	868	* read-ahead blocks. We must clear B_ERROR and B_INVAL prior
	869	* to initiating I/O . If B_CACHE is set, the buffer is valid
	870	* and we do not have to do anything.
	871	*
	872	* MPALMOSTSAFE
	873	*/
	874	int
	875	breadn(struct vnode vp, off_t loffset, int size, off_t raoffset,
	876	int rabsize, int cnt, struct buf *bpp)
	877	{
	878	struct buf bp, rabp;
	879	int i;
	880	int rv = 0, readwait = 0;
	881
	882	*bpp = bp = getblk(vp, loffset, size, 0, 0);
	883
	884	/* if not found in cache, do some I/O */
	885	if ((bp->b_flags & B_CACHE) == 0) {
	886	get_mplock();
	887	bp->b_flags &= ~(B_ERROR \| B_EINTR \| B_INVAL);
	888	bp->b_cmd = BUF_CMD_READ;
	889	bp->b_bio1.bio_done = biodone_sync;
	890	bp->b_bio1.bio_flags \|= BIO_SYNC;
	891	vfs_busy_pages(vp, bp);
	892	vn_strategy(vp, &bp->b_bio1);
	893	++readwait;
	894	rel_mplock();
	895	}
	896
	897	for (i = 0; i < cnt; i++, raoffset++, rabsize++) {
	898	if (inmem(vp, *raoffset))
	899	continue;
	900	rabp = getblk(vp, raoffset, rabsize, 0, 0);
	901
	902	if ((rabp->b_flags & B_CACHE) == 0) {
	903	get_mplock();
	904	rabp->b_flags &= ~(B_ERROR \| B_EINTR \| B_INVAL);
	905	rabp->b_cmd = BUF_CMD_READ;
	906	vfs_busy_pages(vp, rabp);
	907	BUF_KERNPROC(rabp);
	908	vn_strategy(vp, &rabp->b_bio1);
	909	rel_mplock();
	910	} else {
	911	brelse(rabp);
	912	}
	913	}
	914	if (readwait)
	915	rv = biowait(&bp->b_bio1, "biord");
	916	return (rv);
	917	}
	918
	919	/*
	920	* bwrite:
	921	*
	922	* Synchronous write, waits for completion.
	923	*
	924	* Write, release buffer on completion. (Done by iodone
	925	* if async). Do not bother writing anything if the buffer
	926	* is invalid.
	927	*
	928	* Note that we set B_CACHE here, indicating that buffer is
	929	* fully valid and thus cacheable. This is true even of NFS
	930	* now so we set it generally. This could be set either here
	931	* or in biodone() since the I/O is synchronous. We put it
	932	* here.
	933	*/
	934	int
	935	bwrite(struct buf *bp)
	936	{
	937	int error;
	938
	939	if (bp->b_flags & B_INVAL) {
	940	brelse(bp);
	941	return (0);
	942	}
	943	if (BUF_REFCNTNB(bp) == 0)
	944	panic("bwrite: buffer is not busy???");
	945
	946	/* Mark the buffer clean */
	947	bundirty(bp);
	948
	949	bp->b_flags &= ~(B_ERROR \| B_EINTR);
	950	bp->b_flags \|= B_CACHE;
	951	bp->b_cmd = BUF_CMD_WRITE;
	952	bp->b_bio1.bio_done = biodone_sync;
	953	bp->b_bio1.bio_flags \|= BIO_SYNC;
	954	vfs_busy_pages(bp->b_vp, bp);
	955
	956	/*
	957	* Normal bwrites pipeline writes. NOTE: b_bufsize is only
	958	* valid for vnode-backed buffers.
	959	*/
	960	bp->b_runningbufspace = bp->b_bufsize;
	961	if (bp->b_runningbufspace) {
	962	runningbufspace += bp->b_runningbufspace;
	963	++runningbufcount;
	964	}
	965
	966	vn_strategy(bp->b_vp, &bp->b_bio1);
	967	error = biowait(&bp->b_bio1, "biows");
	968	brelse(bp);
	969	return (error);
	970	}
	971
	972	/*
	973	* bawrite:
	974	*
	975	* Asynchronous write. Start output on a buffer, but do not wait for
	976	* it to complete. The buffer is released when the output completes.
	977	*
	978	* bwrite() ( or the VOP routine anyway ) is responsible for handling
	979	* B_INVAL buffers. Not us.
	980	*/
	981	void
	982	bawrite(struct buf *bp)
	983	{
	984	if (bp->b_flags & B_INVAL) {
	985	brelse(bp);
	986	return;
	987	}
	988	if (BUF_REFCNTNB(bp) == 0)
	989	panic("bwrite: buffer is not busy???");
	990
	991	/* Mark the buffer clean */
	992	bundirty(bp);
	993
	994	bp->b_flags &= ~(B_ERROR \| B_EINTR);
	995	bp->b_flags \|= B_CACHE;
	996	bp->b_cmd = BUF_CMD_WRITE;
	997	KKASSERT(bp->b_bio1.bio_done == NULL);
	998	vfs_busy_pages(bp->b_vp, bp);
	999
	1000	/*
	1001	* Normal bwrites pipeline writes. NOTE: b_bufsize is only
	1002	* valid for vnode-backed buffers.
	1003	*/
	1004	bp->b_runningbufspace = bp->b_bufsize;
	1005	if (bp->b_runningbufspace) {
	1006	runningbufspace += bp->b_runningbufspace;
	1007	++runningbufcount;
	1008	}
	1009
	1010	BUF_KERNPROC(bp);
	1011	vn_strategy(bp->b_vp, &bp->b_bio1);
	1012	}
	1013
	1014	/*
	1015	* bowrite:
	1016	*
	1017	* Ordered write. Start output on a buffer, and flag it so that the
	1018	* device will write it in the order it was queued. The buffer is
	1019	* released when the output completes. bwrite() ( or the VOP routine
	1020	* anyway ) is responsible for handling B_INVAL buffers.
	1021	*/
	1022	int
	1023	bowrite(struct buf *bp)
	1024	{
	1025	bp->b_flags \|= B_ORDERED;
	1026	bawrite(bp);
	1027	return (0);
	1028	}
	1029
	1030	/*
	1031	* bdwrite:
	1032	*
	1033	* Delayed write. (Buffer is marked dirty). Do not bother writing
	1034	* anything if the buffer is marked invalid.
	1035	*
	1036	* Note that since the buffer must be completely valid, we can safely
	1037	* set B_CACHE. In fact, we have to set B_CACHE here rather then in
	1038	* biodone() in order to prevent getblk from writing the buffer
	1039	* out synchronously.
	1040	*/
	1041	void
	1042	bdwrite(struct buf *bp)
	1043	{
	1044	if (BUF_REFCNTNB(bp) == 0)
	1045	panic("bdwrite: buffer is not busy");
	1046
	1047	if (bp->b_flags & B_INVAL) {
	1048	brelse(bp);
	1049	return;
	1050	}
	1051	bdirty(bp);
	1052
	1053	if (dsched_is_clear_buf_priv(bp))
	1054	dsched_new_buf(bp);
	1055
	1056	/*
	1057	* Set B_CACHE, indicating that the buffer is fully valid. This is
	1058	* true even of NFS now.
	1059	*/
	1060	bp->b_flags \|= B_CACHE;
	1061
	1062	/*
	1063	* This bmap keeps the system from needing to do the bmap later,
	1064	* perhaps when the system is attempting to do a sync. Since it
	1065	* is likely that the indirect block -- or whatever other datastructure
	1066	* that the filesystem needs is still in memory now, it is a good
	1067	* thing to do this. Note also, that if the pageout daemon is
	1068	* requesting a sync -- there might not be enough memory to do
	1069	* the bmap then... So, this is important to do.
	1070	*/
	1071	if (bp->b_bio2.bio_offset == NOOFFSET) {
	1072	VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
	1073	NULL, NULL, BUF_CMD_WRITE);
	1074	}
	1075
	1076	/*
	1077	* Because the underlying pages may still be mapped and
	1078	* writable trying to set the dirty buffer (b_dirtyoff/end)
	1079	* range here will be inaccurate.
	1080	*
	1081	* However, we must still clean the pages to satisfy the
	1082	* vnode_pager and pageout daemon, so theythink the pages
	1083	* have been "cleaned". What has really occured is that
	1084	* they've been earmarked for later writing by the buffer
	1085	* cache.
	1086	*
	1087	* So we get the b_dirtyoff/end update but will not actually
	1088	* depend on it (NFS that is) until the pages are busied for
	1089	* writing later on.
	1090	*/
	1091	vfs_clean_pages(bp);
	1092	bqrelse(bp);
	1093
	1094	/*
	1095	* note: we cannot initiate I/O from a bdwrite even if we wanted to,
	1096	* due to the softdep code.
	1097	*/
	1098	}
	1099
	1100	/*
	1101	* Fake write - return pages to VM system as dirty, leave the buffer clean.
	1102	* This is used by tmpfs.
	1103	*
	1104	* It is important for any VFS using this routine to NOT use it for
	1105	* IO_SYNC or IO_ASYNC operations which occur when the system really
	1106	* wants to flush VM pages to backing store.
	1107	*/
	1108	void
	1109	buwrite(struct buf *bp)
	1110	{
	1111	vm_page_t m;
	1112	int i;
	1113
	1114	/*
	1115	* Only works for VMIO buffers. If the buffer is already
	1116	* marked for delayed-write we can't avoid the bdwrite().
	1117	*/
	1118	if ((bp->b_flags & B_VMIO) == 0 \|\| (bp->b_flags & B_DELWRI)) {
	1119	bdwrite(bp);
	1120	return;
	1121	}
	1122
	1123	/*
	1124	* Set valid & dirty.
	1125	*/
	1126	for (i = 0; i < bp->b_xio.xio_npages; i++) {
	1127	m = bp->b_xio.xio_pages[i];
	1128	vfs_dirty_one_page(bp, i, m);
	1129	}
	1130	bqrelse(bp);
	1131	}
	1132
	1133	/*
	1134	* bdirty:
	1135	*
	1136	* Turn buffer into delayed write request by marking it B_DELWRI.
	1137	* B_RELBUF and B_NOCACHE must be cleared.
	1138	*
	1139	* We reassign the buffer to itself to properly update it in the
	1140	* dirty/clean lists.
	1141	*
	1142	* Must be called from a critical section.
	1143	* The buffer must be on BQUEUE_NONE.
	1144	*/
	1145	void
	1146	bdirty(struct buf *bp)
	1147	{
	1148	KASSERT(bp->b_qindex == BQUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
	1149	if (bp->b_flags & B_NOCACHE) {
	1150	kprintf("bdirty: clearing B_NOCACHE on buf %p\n", bp);
	1151	bp->b_flags &= ~B_NOCACHE;
	1152	}
	1153	if (bp->b_flags & B_INVAL) {
	1154	kprintf("bdirty: warning, dirtying invalid buffer %p\n", bp);
	1155	}
	1156	bp->b_flags &= ~B_RELBUF;
	1157
	1158	if ((bp->b_flags & B_DELWRI) == 0) {
	1159	bp->b_flags \|= B_DELWRI;
	1160	reassignbuf(bp);
	1161	atomic_add_int(&dirtybufcount, 1);
	1162	dirtybufspace += bp->b_bufsize;
	1163	if (bp->b_flags & B_HEAVY) {
	1164	atomic_add_int(&dirtybufcounthw, 1);
	1165	atomic_add_int(&dirtybufspacehw, bp->b_bufsize);
	1166	}
	1167	bd_heatup();
	1168	}
	1169	}
	1170
	1171	/*
	1172	* Set B_HEAVY, indicating that this is a heavy-weight buffer that
	1173	* needs to be flushed with a different buf_daemon thread to avoid
	1174	* deadlocks. B_HEAVY also imposes restrictions in getnewbuf().
	1175	*/
	1176	void
	1177	bheavy(struct buf *bp)
	1178	{
	1179	if ((bp->b_flags & B_HEAVY) == 0) {
	1180	bp->b_flags \|= B_HEAVY;
	1181	if (bp->b_flags & B_DELWRI) {
	1182	atomic_add_int(&dirtybufcounthw, 1);
	1183	atomic_add_int(&dirtybufspacehw, bp->b_bufsize);
	1184	}
	1185	}
	1186	}
	1187
	1188	/*
	1189	* bundirty:
	1190	*
	1191	* Clear B_DELWRI for buffer.
	1192	*
	1193	* Must be called from a critical section.
	1194	*
	1195	* The buffer is typically on BQUEUE_NONE but there is one case in
	1196	* brelse() that calls this function after placing the buffer on
	1197	* a different queue.
	1198	*
	1199	* MPSAFE
	1200	*/
	1201	void
	1202	bundirty(struct buf *bp)
	1203	{
	1204	if (bp->b_flags & B_DELWRI) {
	1205	bp->b_flags &= ~B_DELWRI;
	1206	reassignbuf(bp);
	1207	atomic_subtract_int(&dirtybufcount, 1);
	1208	atomic_subtract_int(&dirtybufspace, bp->b_bufsize);
	1209	if (bp->b_flags & B_HEAVY) {
	1210	atomic_subtract_int(&dirtybufcounthw, 1);
	1211	atomic_subtract_int(&dirtybufspacehw, bp->b_bufsize);
	1212	}
	1213	bd_signal(bp->b_bufsize);
	1214	}
	1215	/*
	1216	* Since it is now being written, we can clear its deferred write flag.
	1217	*/
	1218	bp->b_flags &= ~B_DEFERRED;
	1219	}
	1220
	1221	/*
	1222	* brelse:
	1223	*
	1224	* Release a busy buffer and, if requested, free its resources. The
	1225	* buffer will be stashed in the appropriate bufqueue[] allowing it
	1226	* to be accessed later as a cache entity or reused for other purposes.
	1227	*
	1228	* MPALMOSTSAFE
	1229	*/
	1230	void
	1231	brelse(struct buf *bp)
	1232	{
	1233	#ifdef INVARIANTS
	1234	int saved_flags = bp->b_flags;
	1235	#endif
	1236
	1237	KASSERT(!(bp->b_flags & (B_CLUSTER\|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
	1238
	1239	/*
	1240	* If B_NOCACHE is set we are being asked to destroy the buffer and
	1241	* its backing store. Clear B_DELWRI.
	1242	*
	1243	* B_NOCACHE is set in two cases: (1) when the caller really wants
	1244	* to destroy the buffer and backing store and (2) when the caller
	1245	* wants to destroy the buffer and backing store after a write
	1246	* completes.
	1247	*/
	1248	if ((bp->b_flags & (B_NOCACHE\|B_DELWRI)) == (B_NOCACHE\|B_DELWRI)) {
	1249	bundirty(bp);
	1250	}
	1251
	1252	if ((bp->b_flags & (B_INVAL \| B_DELWRI)) == B_DELWRI) {
	1253	/*
	1254	* A re-dirtied buffer is only subject to destruction
	1255	* by B_INVAL. B_ERROR and B_NOCACHE are ignored.
	1256	*/
	1257	/* leave buffer intact */
	1258	} else if ((bp->b_flags & (B_NOCACHE \| B_INVAL \| B_ERROR)) \|\|
	1259	(bp->b_bufsize <= 0)) {
	1260	/*
	1261	* Either a failed read or we were asked to free or not
	1262	* cache the buffer. This path is reached with B_DELWRI
	1263	* set only if B_INVAL is already set. B_NOCACHE governs
	1264	* backing store destruction.
	1265	*
	1266	* NOTE: HAMMER will set B_LOCKED in buf_deallocate if the
	1267	* buffer cannot be immediately freed.
	1268	*/
	1269	bp->b_flags \|= B_INVAL;
	1270	if (LIST_FIRST(&bp->b_dep) != NULL) {
	1271	get_mplock();
	1272	buf_deallocate(bp);
	1273	rel_mplock();
	1274	}
	1275	if (bp->b_flags & B_DELWRI) {
	1276	atomic_subtract_int(&dirtybufcount, 1);
	1277	atomic_subtract_int(&dirtybufspace, bp->b_bufsize);
	1278	if (bp->b_flags & B_HEAVY) {
	1279	atomic_subtract_int(&dirtybufcounthw, 1);
	1280	atomic_subtract_int(&dirtybufspacehw, bp->b_bufsize);
	1281	}
	1282	bd_signal(bp->b_bufsize);
	1283	}
	1284	bp->b_flags &= ~(B_DELWRI \| B_CACHE);
	1285	}
	1286
	1287	/*
	1288	* We must clear B_RELBUF if B_DELWRI or B_LOCKED is set.
	1289	* If vfs_vmio_release() is called with either bit set, the
	1290	* underlying pages may wind up getting freed causing a previous
	1291	* write (bdwrite()) to get 'lost' because pages associated with
	1292	* a B_DELWRI bp are marked clean. Pages associated with a
	1293	* B_LOCKED buffer may be mapped by the filesystem.
	1294	*
	1295	* If we want to release the buffer ourselves (rather then the
	1296	* originator asking us to release it), give the originator a
	1297	* chance to countermand the release by setting B_LOCKED.
	1298	*
	1299	* We still allow the B_INVAL case to call vfs_vmio_release(), even
	1300	* if B_DELWRI is set.
	1301	*
	1302	* If B_DELWRI is not set we may have to set B_RELBUF if we are low
	1303	* on pages to return pages to the VM page queues.
	1304	*/
	1305	if (bp->b_flags & (B_DELWRI \| B_LOCKED)) {
	1306	bp->b_flags &= ~B_RELBUF;
	1307	} else if (vm_page_count_severe()) {
	1308	if (LIST_FIRST(&bp->b_dep) != NULL) {
	1309	get_mplock();
	1310	buf_deallocate(bp); /* can set B_LOCKED */
	1311	rel_mplock();
	1312	}
	1313	if (bp->b_flags & (B_DELWRI \| B_LOCKED))
	1314	bp->b_flags &= ~B_RELBUF;
	1315	else
	1316	bp->b_flags \|= B_RELBUF;
	1317	}
	1318
	1319	/*
	1320	* Make sure b_cmd is clear. It may have already been cleared by
	1321	* biodone().
	1322	*
	1323	* At this point destroying the buffer is governed by the B_INVAL
	1324	* or B_RELBUF flags.
	1325	*/
	1326	bp->b_cmd = BUF_CMD_DONE;
	1327	dsched_exit_buf(bp);
	1328
	1329	/*
	1330	* VMIO buffer rundown. Make sure the VM page array is restored
	1331	* after an I/O may have replaces some of the pages with bogus pages
	1332	* in order to not destroy dirty pages in a fill-in read.
	1333	*
	1334	* Note that due to the code above, if a buffer is marked B_DELWRI
	1335	* then the B_RELBUF and B_NOCACHE bits will always be clear.
	1336	* B_INVAL may still be set, however.
	1337	*
	1338	* For clean buffers, B_INVAL or B_RELBUF will destroy the buffer
	1339	* but not the backing store. B_NOCACHE will destroy the backing
	1340	* store.
	1341	*
	1342	* Note that dirty NFS buffers contain byte-granular write ranges
	1343	* and should not be destroyed w/ B_INVAL even if the backing store
	1344	* is left intact.
	1345	*/
	1346	if (bp->b_flags & B_VMIO) {
	1347	/*
	1348	* Rundown for VMIO buffers which are not dirty NFS buffers.
	1349	*/
	1350	int i, j, resid;
	1351	vm_page_t m;
	1352	off_t foff;
	1353	vm_pindex_t poff;
	1354	vm_object_t obj;
	1355	struct vnode *vp;
	1356
	1357	vp = bp->b_vp;
	1358
	1359	/*
	1360	* Get the base offset and length of the buffer. Note that
	1361	* in the VMIO case if the buffer block size is not
	1362	* page-aligned then b_data pointer may not be page-aligned.
	1363	* But our b_xio.xio_pages array IS page aligned.
	1364	*
	1365	* block sizes less then DEV_BSIZE (usually 512) are not
	1366	* supported due to the page granularity bits (m->valid,
	1367	* m->dirty, etc...).
	1368	*
	1369	* See man buf(9) for more information
	1370	*/
	1371
	1372	resid = bp->b_bufsize;
	1373	foff = bp->b_loffset;
	1374
	1375	get_mplock();
	1376	for (i = 0; i < bp->b_xio.xio_npages; i++) {
	1377	m = bp->b_xio.xio_pages[i];
	1378	vm_page_flag_clear(m, PG_ZERO);
	1379	/*
	1380	* If we hit a bogus page, fixup all of them
	1381	* now. Note that we left these pages wired
	1382	* when we removed them so they had better exist,
	1383	* and they cannot be ripped out from under us so
	1384	* no critical section protection is necessary.
	1385	*/
	1386	if (m == bogus_page) {
	1387	obj = vp->v_object;
	1388	poff = OFF_TO_IDX(bp->b_loffset);
	1389
	1390	for (j = i; j < bp->b_xio.xio_npages; j++) {
	1391	vm_page_t mtmp;
	1392
	1393	mtmp = bp->b_xio.xio_pages[j];
	1394	if (mtmp == bogus_page) {
	1395	mtmp = vm_page_lookup(obj, poff + j);
	1396	if (!mtmp) {
	1397	panic("brelse: page missing");
	1398	}
	1399	bp->b_xio.xio_pages[j] = mtmp;
	1400	}
	1401	}
	1402
	1403	if ((bp->b_flags & B_INVAL) == 0) {
	1404	pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
	1405	bp->b_xio.xio_pages, bp->b_xio.xio_npages);
	1406	}
	1407	m = bp->b_xio.xio_pages[i];
	1408	}
	1409
	1410	/*
	1411	* Invalidate the backing store if B_NOCACHE is set
	1412	* (e.g. used with vinvalbuf()). If this is NFS
	1413	* we impose a requirement that the block size be
	1414	* a multiple of PAGE_SIZE and create a temporary
	1415	* hack to basically invalidate the whole page. The
	1416	* problem is that NFS uses really odd buffer sizes
	1417	* especially when tracking piecemeal writes and
	1418	* it also vinvalbuf()'s a lot, which would result
	1419	* in only partial page validation and invalidation
	1420	* here. If the file page is mmap()'d, however,
	1421	* all the valid bits get set so after we invalidate
	1422	* here we would end up with weird m->valid values
	1423	* like 0xfc. nfs_getpages() can't handle this so
	1424	* we clear all the valid bits for the NFS case
	1425	* instead of just some of them.
	1426	*
	1427	* The real bug is the VM system having to set m->valid
	1428	* to VM_PAGE_BITS_ALL for faulted-in pages, which
	1429	* itself is an artifact of the whole 512-byte
	1430	* granular mess that exists to support odd block
	1431	* sizes and UFS meta-data block sizes (e.g. 6144).
	1432	* A complete rewrite is required.
	1433	*
	1434	* XXX
	1435	*/
	1436	if (bp->b_flags & (B_NOCACHE\|B_ERROR)) {
	1437	int poffset = foff & PAGE_MASK;
	1438	int presid;
	1439
	1440	presid = PAGE_SIZE - poffset;
	1441	if (bp->b_vp->v_tag == VT_NFS &&
	1442	bp->b_vp->v_type == VREG) {
	1443	; /* entire page */
	1444	} else if (presid > resid) {
	1445	presid = resid;
	1446	}
	1447	KASSERT(presid >= 0, ("brelse: extra page"));
	1448	vm_page_set_invalid(m, poffset, presid);
	1449
	1450	/*
	1451	* Also make sure any swap cache is removed
	1452	* as it is now stale (HAMMER in particular
	1453	* uses B_NOCACHE to deal with buffer
	1454	* aliasing).
	1455	*/
	1456	swap_pager_unswapped(m);
	1457	}
	1458	resid -= PAGE_SIZE - (foff & PAGE_MASK);
	1459	foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
	1460	}
	1461	if (bp->b_flags & (B_INVAL \| B_RELBUF))
	1462	vfs_vmio_release(bp);
	1463	rel_mplock();
	1464	} else {
	1465	/*
	1466	* Rundown for non-VMIO buffers.
	1467	*/
	1468	if (bp->b_flags & (B_INVAL \| B_RELBUF)) {
	1469	get_mplock();
	1470	if (bp->b_bufsize)
	1471	allocbuf(bp, 0);
	1472	KKASSERT (LIST_FIRST(&bp->b_dep) == NULL);
	1473	if (bp->b_vp)
	1474	brelvp(bp);
	1475	rel_mplock();
	1476	}
	1477	}
	1478
	1479	if (bp->b_qindex != BQUEUE_NONE)
	1480	panic("brelse: free buffer onto another queue???");
	1481	if (BUF_REFCNTNB(bp) > 1) {
	1482	/* Temporary panic to verify exclusive locking */
	1483	/* This panic goes away when we allow shared refs */
	1484	panic("brelse: multiple refs");
	1485	/* NOT REACHED */
	1486	return;
	1487	}
	1488
	1489	/*
	1490	* Figure out the correct queue to place the cleaned up buffer on.
	1491	* Buffers placed in the EMPTY or EMPTYKVA had better already be
	1492	* disassociated from their vnode.
	1493	*/
	1494	spin_lock_wr(&bufspin);
	1495	if (bp->b_flags & B_LOCKED) {
	1496	/*
	1497	* Buffers that are locked are placed in the locked queue
	1498	* immediately, regardless of their state.
	1499	*/
	1500	bp->b_qindex = BQUEUE_LOCKED;
	1501	TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_LOCKED], bp, b_freelist);
	1502	} else if (bp->b_bufsize == 0) {
	1503	/*
	1504	* Buffers with no memory. Due to conditionals near the top
	1505	* of brelse() such buffers should probably already be
	1506	* marked B_INVAL and disassociated from their vnode.
	1507	*/
	1508	bp->b_flags \|= B_INVAL;
	1509	KASSERT(bp->b_vp == NULL, ("bp1 %p flags %08x/%08x vnode %p unexpectededly still associated!", bp, saved_flags, bp->b_flags, bp->b_vp));
	1510	KKASSERT((bp->b_flags & B_HASHED) == 0);
	1511	if (bp->b_kvasize) {
	1512	bp->b_qindex = BQUEUE_EMPTYKVA;
	1513	} else {
	1514	bp->b_qindex = BQUEUE_EMPTY;
	1515	}
	1516	TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
	1517	} else if (bp->b_flags & (B_INVAL \| B_NOCACHE \| B_RELBUF)) {
	1518	/*
	1519	* Buffers with junk contents. Again these buffers had better
	1520	* already be disassociated from their vnode.
	1521	*/
	1522	KASSERT(bp->b_vp == NULL, ("bp2 %p flags %08x/%08x vnode %p unexpectededly still associated!", bp, saved_flags, bp->b_flags, bp->b_vp));
	1523	KKASSERT((bp->b_flags & B_HASHED) == 0);
	1524	bp->b_flags \|= B_INVAL;
	1525	bp->b_qindex = BQUEUE_CLEAN;
	1526	TAILQ_INSERT_HEAD(&bufqueues[BQUEUE_CLEAN], bp, b_freelist);
	1527	} else {
	1528	/*
	1529	* Remaining buffers. These buffers are still associated with
	1530	* their vnode.
	1531	*/
	1532	switch(bp->b_flags & (B_DELWRI\|B_HEAVY)) {
	1533	case B_DELWRI:
	1534	bp->b_qindex = BQUEUE_DIRTY;
	1535	TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_DIRTY], bp, b_freelist);
	1536	break;
	1537	case B_DELWRI \| B_HEAVY:
	1538	bp->b_qindex = BQUEUE_DIRTY_HW;
	1539	TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_DIRTY_HW], bp,
	1540	b_freelist);
	1541	break;
	1542	default:
	1543	/*
	1544	* NOTE: Buffers are always placed at the end of the
	1545	* queue. If B_AGE is not set the buffer will cycle
	1546	* through the queue twice.
	1547	*/
	1548	bp->b_qindex = BQUEUE_CLEAN;
	1549	TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_CLEAN], bp, b_freelist);
	1550	break;
	1551	}
	1552	}
	1553	spin_unlock_wr(&bufspin);
	1554
	1555	/*
	1556	* If B_INVAL, clear B_DELWRI. We've already placed the buffer
	1557	* on the correct queue.
	1558	*/
	1559	if ((bp->b_flags & (B_INVAL\|B_DELWRI)) == (B_INVAL\|B_DELWRI))
	1560	bundirty(bp);
	1561
	1562	/*
	1563	* The bp is on an appropriate queue unless locked. If it is not
	1564	* locked or dirty we can wakeup threads waiting for buffer space.
	1565	*
	1566	* We've already handled the B_INVAL case ( B_DELWRI will be clear
	1567	* if B_INVAL is set ).
	1568	*/
	1569	if ((bp->b_flags & (B_LOCKED\|B_DELWRI)) == 0)
	1570	bufcountwakeup();
	1571
	1572	/*
	1573	* Something we can maybe free or reuse
	1574	*/
	1575	if (bp->b_bufsize \|\| bp->b_kvasize)
	1576	bufspacewakeup();
	1577
	1578	/*
	1579	* Clean up temporary flags and unlock the buffer.
	1580	*/
	1581	bp->b_flags &= ~(B_ORDERED \| B_NOCACHE \| B_RELBUF \| B_DIRECT);
	1582	BUF_UNLOCK(bp);
	1583	}
	1584
	1585	/*
	1586	* bqrelse:
	1587	*
	1588	* Release a buffer back to the appropriate queue but do not try to free
	1589	* it. The buffer is expected to be used again soon.
	1590	*
	1591	* bqrelse() is used by bdwrite() to requeue a delayed write, and used by
	1592	* biodone() to requeue an async I/O on completion. It is also used when
	1593	* known good buffers need to be requeued but we think we may need the data
	1594	* again soon.
	1595	*
	1596	* XXX we should be able to leave the B_RELBUF hint set on completion.
	1597	*
	1598	* MPSAFE
	1599	*/
	1600	void
	1601	bqrelse(struct buf *bp)
	1602	{
	1603	KASSERT(!(bp->b_flags & (B_CLUSTER\|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
	1604
	1605	if (bp->b_qindex != BQUEUE_NONE)
	1606	panic("bqrelse: free buffer onto another queue???");
	1607	if (BUF_REFCNTNB(bp) > 1) {
	1608	/* do not release to free list */
	1609	panic("bqrelse: multiple refs");
	1610	return;
	1611	}
	1612
	1613	buf_act_advance(bp);
	1614
	1615	spin_lock_wr(&bufspin);
	1616	if (bp->b_flags & B_LOCKED) {
	1617	/*
	1618	* Locked buffers are released to the locked queue. However,
	1619	* if the buffer is dirty it will first go into the dirty
	1620	* queue and later on after the I/O completes successfully it
	1621	* will be released to the locked queue.
	1622	*/
	1623	bp->b_qindex = BQUEUE_LOCKED;
	1624	TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_LOCKED], bp, b_freelist);
	1625	} else if (bp->b_flags & B_DELWRI) {
	1626	bp->b_qindex = (bp->b_flags & B_HEAVY) ?
	1627	BQUEUE_DIRTY_HW : BQUEUE_DIRTY;
	1628	TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
	1629	} else if (vm_page_count_severe()) {
	1630	/*
	1631	* We are too low on memory, we have to try to free the
	1632	* buffer (most importantly: the wired pages making up its
	1633	* backing store) now.
	1634	*/
	1635	spin_unlock_wr(&bufspin);
	1636	brelse(bp);
	1637	return;
	1638	} else {
	1639	bp->b_qindex = BQUEUE_CLEAN;
	1640	TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_CLEAN], bp, b_freelist);
	1641	}
	1642	spin_unlock_wr(&bufspin);
	1643
	1644	if ((bp->b_flags & B_LOCKED) == 0 &&
	1645	((bp->b_flags & B_INVAL) \|\| (bp->b_flags & B_DELWRI) == 0)) {
	1646	bufcountwakeup();
	1647	}
	1648
	1649	/*
	1650	* Something we can maybe free or reuse.
	1651	*/
	1652	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
	1653	bufspacewakeup();
	1654
	1655	/*
	1656	* Final cleanup and unlock. Clear bits that are only used while a
	1657	* buffer is actively locked.
	1658	*/
	1659	bp->b_flags &= ~(B_ORDERED \| B_NOCACHE \| B_RELBUF);
	1660	dsched_exit_buf(bp);
	1661	BUF_UNLOCK(bp);
	1662	}
	1663
	1664	/*
	1665	* vfs_vmio_release:
	1666	*
	1667	* Return backing pages held by the buffer 'bp' back to the VM system
	1668	* if possible. The pages are freed if they are no longer valid or
	1669	* attempt to free if it was used for direct I/O otherwise they are
	1670	* sent to the page cache.
	1671	*
	1672	* Pages that were marked busy are left alone and skipped.
	1673	*
	1674	* The KVA mapping (b_data) for the underlying pages is removed by
	1675	* this function.
	1676	*/
	1677	static void
	1678	vfs_vmio_release(struct buf *bp)
	1679	{
	1680	int i;
	1681	vm_page_t m;
	1682
	1683	lwkt_gettoken(&vm_token);
	1684	crit_enter();
	1685	for (i = 0; i < bp->b_xio.xio_npages; i++) {
	1686	m = bp->b_xio.xio_pages[i];
	1687	bp->b_xio.xio_pages[i] = NULL;
	1688
	1689	/*
	1690	* The VFS is telling us this is not a meta-data buffer
	1691	* even if it is backed by a block device.
	1692	*/
	1693	if (bp->b_flags & B_NOTMETA)
	1694	vm_page_flag_set(m, PG_NOTMETA);
	1695
	1696	/*
	1697	* This is a very important bit of code. We try to track
	1698	* VM page use whether the pages are wired into the buffer
	1699	* cache or not. While wired into the buffer cache the
	1700	* bp tracks the act_count.
	1701	*
	1702	* We can choose to place unwired pages on the inactive
	1703	* queue (0) or active queue (1). If we place too many
	1704	* on the active queue the queue will cycle the act_count
	1705	* on pages we'd like to keep, just from single-use pages
	1706	* (such as when doing a tar-up or file scan).
	1707	*/
	1708	if (bp->b_act_count < vm_cycle_point)
	1709	vm_page_unwire(m, 0);
	1710	else
	1711	vm_page_unwire(m, 1);
	1712
	1713	/*
	1714	* We don't mess with busy pages, it is
	1715	* the responsibility of the process that
	1716	* busied the pages to deal with them.
	1717	*/
	1718	if ((m->flags & PG_BUSY) \|\| (m->busy != 0))
	1719	continue;
	1720
	1721	if (m->wire_count == 0) {
	1722	vm_page_flag_clear(m, PG_ZERO);
	1723	/*
	1724	* Might as well free the page if we can and it has
	1725	* no valid data. We also free the page if the
	1726	* buffer was used for direct I/O.
	1727	*/
	1728	#if 0
	1729	if ((bp->b_flags & B_ASYNC) == 0 && !m->valid &&
	1730	m->hold_count == 0) {
	1731	vm_page_busy(m);
	1732	vm_page_protect(m, VM_PROT_NONE);
	1733	vm_page_free(m);
	1734	} else
	1735	#endif
	1736	if (bp->b_flags & B_DIRECT) {
	1737	vm_page_try_to_free(m);
	1738	} else if (vm_page_count_severe()) {
	1739	m->act_count = bp->b_act_count;
	1740	vm_page_try_to_cache(m);
	1741	} else {
	1742	m->act_count = bp->b_act_count;
	1743	}
	1744	}
	1745	}
	1746	crit_exit();
	1747	lwkt_reltoken(&vm_token);
	1748	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_xio.xio_npages);
	1749	if (bp->b_bufsize) {
	1750	bufspacewakeup();
	1751	bp->b_bufsize = 0;
	1752	}
	1753	bp->b_xio.xio_npages = 0;
	1754	bp->b_flags &= ~B_VMIO;
	1755	KKASSERT (LIST_FIRST(&bp->b_dep) == NULL);
	1756	if (bp->b_vp) {
	1757	get_mplock();
	1758	brelvp(bp);
	1759	rel_mplock();
	1760	}
	1761	}
	1762
	1763	/*
	1764	* vfs_bio_awrite:
	1765	*
	1766	* Implement clustered async writes for clearing out B_DELWRI buffers.
	1767	* This is much better then the old way of writing only one buffer at
	1768	* a time. Note that we may not be presented with the buffers in the
	1769	* correct order, so we search for the cluster in both directions.
	1770	*
	1771	* The buffer is locked on call.
	1772	*/
	1773	int
	1774	vfs_bio_awrite(struct buf *bp)
	1775	{
	1776	int i;
	1777	int j;
	1778	off_t loffset = bp->b_loffset;
	1779	struct vnode *vp = bp->b_vp;
	1780	int nbytes;
	1781	struct buf *bpa;
	1782	int nwritten;
	1783	int size;
	1784
	1785	/*
	1786	* right now we support clustered writing only to regular files. If
	1787	* we find a clusterable block we could be in the middle of a cluster
	1788	* rather then at the beginning.
	1789	*
	1790	* NOTE: b_bio1 contains the logical loffset and is aliased
	1791	* to b_loffset. b_bio2 contains the translated block number.
	1792	*/
	1793	if ((vp->v_type == VREG) &&
	1794	(vp->v_mount != 0) && /* Only on nodes that have the size info */
	1795	(bp->b_flags & (B_CLUSTEROK \| B_INVAL)) == B_CLUSTEROK) {
	1796
	1797	size = vp->v_mount->mnt_stat.f_iosize;
	1798
	1799	for (i = size; i < MAXPHYS; i += size) {
	1800	if ((bpa = findblk(vp, loffset + i, FINDBLK_TEST)) &&
	1801	BUF_REFCNT(bpa) == 0 &&
	1802	((bpa->b_flags & (B_DELWRI \| B_CLUSTEROK \| B_INVAL)) ==
	1803	(B_DELWRI \| B_CLUSTEROK)) &&
	1804	(bpa->b_bufsize == size)) {
	1805	if ((bpa->b_bio2.bio_offset == NOOFFSET) \|\|
	1806	(bpa->b_bio2.bio_offset !=
	1807	bp->b_bio2.bio_offset + i))
	1808	break;
	1809	} else {
	1810	break;
	1811	}
	1812	}
	1813	for (j = size; i + j <= MAXPHYS && j <= loffset; j += size) {
	1814	if ((bpa = findblk(vp, loffset - j, FINDBLK_TEST)) &&
	1815	BUF_REFCNT(bpa) == 0 &&
	1816	((bpa->b_flags & (B_DELWRI \| B_CLUSTEROK \| B_INVAL)) ==
	1817	(B_DELWRI \| B_CLUSTEROK)) &&
	1818	(bpa->b_bufsize == size)) {
	1819	if ((bpa->b_bio2.bio_offset == NOOFFSET) \|\|
	1820	(bpa->b_bio2.bio_offset !=
	1821	bp->b_bio2.bio_offset - j))
	1822	break;
	1823	} else {
	1824	break;
	1825	}
	1826	}
	1827	j -= size;
	1828	nbytes = (i + j);
	1829
	1830	/*
	1831	* this is a possible cluster write
	1832	*/
	1833	if (nbytes != size) {
	1834	BUF_UNLOCK(bp);
	1835	nwritten = cluster_wbuild(vp, size,
	1836	loffset - j, nbytes);
	1837	return nwritten;
	1838	}
	1839	}
	1840
	1841	/*
	1842	* default (old) behavior, writing out only one block
	1843	*
	1844	* XXX returns b_bufsize instead of b_bcount for nwritten?
	1845	*/
	1846	nwritten = bp->b_bufsize;
	1847	bremfree(bp);
	1848	bawrite(bp);
	1849
	1850	return nwritten;
	1851	}
	1852
	1853	/*
	1854	* getnewbuf:
	1855	*
	1856	* Find and initialize a new buffer header, freeing up existing buffers
	1857	* in the bufqueues as necessary. The new buffer is returned locked.
	1858	*
	1859	* Important: B_INVAL is not set. If the caller wishes to throw the
	1860	* buffer away, the caller must set B_INVAL prior to calling brelse().
	1861	*
	1862	* We block if:
	1863	* We have insufficient buffer headers
	1864	* We have insufficient buffer space
	1865	* buffer_map is too fragmented ( space reservation fails )
	1866	* If we have to flush dirty buffers ( but we try to avoid this )
	1867	*
	1868	* To avoid VFS layer recursion we do not flush dirty buffers ourselves.
	1869	* Instead we ask the buf daemon to do it for us. We attempt to
	1870	* avoid piecemeal wakeups of the pageout daemon.
	1871	*
	1872	* MPALMOSTSAFE
	1873	*/
	1874	static struct buf *
	1875	getnewbuf(int blkflags, int slptimeo, int size, int maxsize)
	1876	{
	1877	struct buf *bp;
	1878	struct buf *nbp;
	1879	int defrag = 0;
	1880	int nqindex;
	1881	int slpflags = (blkflags & GETBLK_PCATCH) ? PCATCH : 0;
	1882	static int flushingbufs;
	1883
	1884	/*
	1885	* We can't afford to block since we might be holding a vnode lock,
	1886	* which may prevent system daemons from running. We deal with
	1887	* low-memory situations by proactively returning memory and running
	1888	* async I/O rather then sync I/O.
	1889	*/
	1890
	1891	++getnewbufcalls;
	1892	--getnewbufrestarts;
	1893	restart:
	1894	++getnewbufrestarts;
	1895
	1896	/*
	1897	* Setup for scan. If we do not have enough free buffers,
	1898	* we setup a degenerate case that immediately fails. Note
	1899	* that if we are specially marked process, we are allowed to
	1900	* dip into our reserves.
	1901	*
	1902	* The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
	1903	*
	1904	* We start with EMPTYKVA. If the list is empty we backup to EMPTY.
	1905	* However, there are a number of cases (defragging, reusing, ...)
	1906	* where we cannot backup.
	1907	*/
	1908	nqindex = BQUEUE_EMPTYKVA;
	1909	spin_lock_wr(&bufspin);
	1910	nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTYKVA]);
	1911
	1912	if (nbp == NULL) {
	1913	/*
	1914	* If no EMPTYKVA buffers and we are either
	1915	* defragging or reusing, locate a CLEAN buffer
	1916	* to free or reuse. If bufspace useage is low
	1917	* skip this step so we can allocate a new buffer.
	1918	*/
	1919	if (defrag \|\| bufspace >= lobufspace) {
	1920	nqindex = BQUEUE_CLEAN;
	1921	nbp = TAILQ_FIRST(&bufqueues[BQUEUE_CLEAN]);
	1922	}
	1923
	1924	/*
	1925	* If we could not find or were not allowed to reuse a
	1926	* CLEAN buffer, check to see if it is ok to use an EMPTY
	1927	* buffer. We can only use an EMPTY buffer if allocating
	1928	* its KVA would not otherwise run us out of buffer space.
	1929	*/
	1930	if (nbp == NULL && defrag == 0 &&
	1931	bufspace + maxsize < hibufspace) {
	1932	nqindex = BQUEUE_EMPTY;
	1933	nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTY]);
	1934	}
	1935	}
	1936
	1937	/*
	1938	* Run scan, possibly freeing data and/or kva mappings on the fly
	1939	* depending.
	1940	*
	1941	* WARNING! bufspin is held!
	1942	*/
	1943	while ((bp = nbp) != NULL) {
	1944	int qindex = nqindex;
	1945
	1946	nbp = TAILQ_NEXT(bp, b_freelist);
	1947
	1948	/*
	1949	* BQUEUE_CLEAN - B_AGE special case. If not set the bp
	1950	* cycles through the queue twice before being selected.
	1951	*/
	1952	if (qindex == BQUEUE_CLEAN &&
	1953	(bp->b_flags & B_AGE) == 0 && nbp) {
	1954	bp->b_flags \|= B_AGE;
	1955	TAILQ_REMOVE(&bufqueues[qindex], bp, b_freelist);
	1956	TAILQ_INSERT_TAIL(&bufqueues[qindex], bp, b_freelist);
	1957	continue;
	1958	}
	1959
	1960	/*
	1961	* Calculate next bp ( we can only use it if we do not block
	1962	* or do other fancy things ).
	1963	*/
	1964	if (nbp == NULL) {
	1965	switch(qindex) {
	1966	case BQUEUE_EMPTY:
	1967	nqindex = BQUEUE_EMPTYKVA;
	1968	if ((nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTYKVA])))
	1969	break;
	1970	/* fall through */
	1971	case BQUEUE_EMPTYKVA:
	1972	nqindex = BQUEUE_CLEAN;
	1973	if ((nbp = TAILQ_FIRST(&bufqueues[BQUEUE_CLEAN])))
	1974	break;
	1975	/* fall through */
	1976	case BQUEUE_CLEAN:
	1977	/*
	1978	* nbp is NULL.
	1979	*/
	1980	break;
	1981	}
	1982	}
	1983
	1984	/*
	1985	* Sanity Checks
	1986	*/
	1987	KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
	1988
	1989	/*
	1990	* Note: we no longer distinguish between VMIO and non-VMIO
	1991	* buffers.
	1992	*/
	1993
	1994	KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
	1995
	1996	/*
	1997	* If we are defragging then we need a buffer with
	1998	* b_kvasize != 0. XXX this situation should no longer
	1999	* occur, if defrag is non-zero the buffer's b_kvasize
	2000	* should also be non-zero at this point. XXX
	2001	*/
	2002	if (defrag && bp->b_kvasize == 0) {
	2003	kprintf("Warning: defrag empty buffer %p\n", bp);
	2004	continue;
	2005	}
	2006
	2007	/*
	2008	* Start freeing the bp. This is somewhat involved. nbp
	2009	* remains valid only for BQUEUE_EMPTY[KVA] bp's. Buffers
	2010	* on the clean list must be disassociated from their
	2011	* current vnode. Buffers on the empty[kva] lists have
	2012	* already been disassociated.
	2013	*/
	2014
	2015	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT) != 0) {
	2016	spin_unlock_wr(&bufspin);
	2017	tsleep(&bd_request, 0, "gnbxxx", hz / 100);
	2018	goto restart;
	2019	}
	2020	if (bp->b_qindex != qindex) {
	2021	spin_unlock_wr(&bufspin);
	2022	kprintf("getnewbuf: warning, BUF_LOCK blocked unexpectedly on buf %p index %d->%d, race corrected\n", bp, qindex, bp->b_qindex);
	2023	BUF_UNLOCK(bp);
	2024	goto restart;
	2025	}
	2026	bremfree_locked(bp);
	2027	spin_unlock_wr(&bufspin);
	2028
	2029	/*
	2030	* Dependancies must be handled before we disassociate the
	2031	* vnode.
	2032	*
	2033	* NOTE: HAMMER will set B_LOCKED if the buffer cannot
	2034	* be immediately disassociated. HAMMER then becomes
	2035	* responsible for releasing the buffer.
	2036	*
	2037	* NOTE: bufspin is UNLOCKED now.
	2038	*/
	2039	if (LIST_FIRST(&bp->b_dep) != NULL) {
	2040	get_mplock();
	2041	buf_deallocate(bp);
	2042	rel_mplock();
	2043	if (bp->b_flags & B_LOCKED) {
	2044	bqrelse(bp);
	2045	goto restart;
	2046	}
	2047	KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
	2048	}
	2049
	2050	if (qindex == BQUEUE_CLEAN) {
	2051	get_mplock();
	2052	if (bp->b_flags & B_VMIO) {
	2053	get_mplock();
	2054	vfs_vmio_release(bp);
	2055	rel_mplock();
	2056	}
	2057	if (bp->b_vp)
	2058	brelvp(bp);
	2059	rel_mplock();
	2060	}
	2061
	2062	/*
	2063	* NOTE: nbp is now entirely invalid. We can only restart
	2064	* the scan from this point on.
	2065	*
	2066	* Get the rest of the buffer freed up. b_kva* is still
	2067	* valid after this operation.
	2068	*/
	2069
	2070	KASSERT(bp->b_vp == NULL, ("bp3 %p flags %08x vnode %p qindex %d unexpectededly still associated!", bp, bp->b_flags, bp->b_vp, qindex));
	2071	KKASSERT((bp->b_flags & B_HASHED) == 0);
	2072
	2073	/*
	2074	* critical section protection is not required when
	2075	* scrapping a buffer's contents because it is already
	2076	* wired.
	2077	*/
	2078	if (bp->b_bufsize) {
	2079	get_mplock();
	2080	allocbuf(bp, 0);
	2081	rel_mplock();
	2082	}
	2083
	2084	bp->b_flags = B_BNOCLIP;
	2085	bp->b_cmd = BUF_CMD_DONE;
	2086	bp->b_vp = NULL;
	2087	bp->b_error = 0;
	2088	bp->b_resid = 0;
	2089	bp->b_bcount = 0;
	2090	bp->b_xio.xio_npages = 0;
	2091	bp->b_dirtyoff = bp->b_dirtyend = 0;
	2092	bp->b_act_count = ACT_INIT;
	2093	reinitbufbio(bp);
	2094	KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
	2095	buf_dep_init(bp);
	2096	if (blkflags & GETBLK_BHEAVY)
	2097	bp->b_flags \|= B_HEAVY;
	2098
	2099	/*
	2100	* If we are defragging then free the buffer.
	2101	*/
	2102	if (defrag) {
	2103	bp->b_flags \|= B_INVAL;
	2104	bfreekva(bp);
	2105	brelse(bp);
	2106	defrag = 0;
	2107	goto restart;
	2108	}
	2109
	2110	/*
	2111	* If we are overcomitted then recover the buffer and its
	2112	* KVM space. This occurs in rare situations when multiple
	2113	* processes are blocked in getnewbuf() or allocbuf().
	2114	*/
	2115	if (bufspace >= hibufspace)
	2116	flushingbufs = 1;
	2117	if (flushingbufs && bp->b_kvasize != 0) {
	2118	bp->b_flags \|= B_INVAL;
	2119	bfreekva(bp);
	2120	brelse(bp);
	2121	goto restart;
	2122	}
	2123	if (bufspace < lobufspace)
	2124	flushingbufs = 0;
	2125	break;
	2126	/* NOT REACHED, bufspin not held */
	2127	}
	2128
	2129	/*
	2130	* If we exhausted our list, sleep as appropriate. We may have to
	2131	* wakeup various daemons and write out some dirty buffers.
	2132	*
	2133	* Generally we are sleeping due to insufficient buffer space.
	2134	*
	2135	* NOTE: bufspin is held if bp is NULL, else it is not held.
	2136	*/
	2137	if (bp == NULL) {
	2138	int flags;
	2139	char *waitmsg;
	2140
	2141	spin_unlock_wr(&bufspin);
	2142	if (defrag) {
	2143	flags = VFS_BIO_NEED_BUFSPACE;
	2144	waitmsg = "nbufkv";
	2145	} else if (bufspace >= hibufspace) {
	2146	waitmsg = "nbufbs";
	2147	flags = VFS_BIO_NEED_BUFSPACE;
	2148	} else {
	2149	waitmsg = "newbuf";
	2150	flags = VFS_BIO_NEED_ANY;
	2151	}
	2152
	2153	needsbuffer \|= flags;
	2154	bd_speedup(); /* heeeelp */
	2155	while (needsbuffer & flags) {
	2156	if (tsleep(&needsbuffer, slpflags, waitmsg, slptimeo))
	2157	return (NULL);
	2158	}
	2159	} else {
	2160	/*
	2161	* We finally have a valid bp. We aren't quite out of the
	2162	* woods, we still have to reserve kva space. In order
	2163	* to keep fragmentation sane we only allocate kva in
	2164	* BKVASIZE chunks.
	2165	*
	2166	* (bufspin is not held)
	2167	*/
	2168	maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
	2169
	2170	if (maxsize != bp->b_kvasize) {
	2171	vm_offset_t addr = 0;
	2172	int count;
	2173
	2174	bfreekva(bp);
	2175
	2176	get_mplock();
	2177	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	2178	vm_map_lock(&buffer_map);
	2179
	2180	if (vm_map_findspace(&buffer_map,
	2181	vm_map_min(&buffer_map), maxsize,
	2182	maxsize, 0, &addr)) {
	2183	/*
	2184	* Uh oh. Buffer map is too fragmented. We
	2185	* must defragment the map.
	2186	*/
	2187	vm_map_unlock(&buffer_map);
	2188	vm_map_entry_release(count);
	2189	++bufdefragcnt;
	2190	defrag = 1;
	2191	bp->b_flags \|= B_INVAL;
	2192	rel_mplock();
	2193	brelse(bp);
	2194	goto restart;
	2195	}
	2196	if (addr) {
	2197	vm_map_insert(&buffer_map, &count,
	2198	NULL, 0,
	2199	addr, addr + maxsize,
	2200	VM_MAPTYPE_NORMAL,
	2201	VM_PROT_ALL, VM_PROT_ALL,
	2202	MAP_NOFAULT);
	2203
	2204	bp->b_kvabase = (caddr_t) addr;
	2205	bp->b_kvasize = maxsize;
	2206	bufspace += bp->b_kvasize;
	2207	++bufreusecnt;
	2208	}
	2209	vm_map_unlock(&buffer_map);
	2210	vm_map_entry_release(count);
	2211	rel_mplock();
	2212	}
	2213	bp->b_data = bp->b_kvabase;
	2214	}
	2215	return(bp);
	2216	}
	2217
	2218	/*
	2219	* This routine is called in an emergency to recover VM pages from the
	2220	* buffer cache by cashing in clean buffers. The idea is to recover
	2221	* enough pages to be able to satisfy a stuck bio_page_alloc().
	2222	*/
	2223	static int
	2224	recoverbufpages(void)
	2225	{
	2226	struct buf *bp;
	2227	int bytes = 0;
	2228
	2229	++recoverbufcalls;
	2230
	2231	spin_lock_wr(&bufspin);
	2232	while (bytes < MAXBSIZE) {
	2233	bp = TAILQ_FIRST(&bufqueues[BQUEUE_CLEAN]);
	2234	if (bp == NULL)
	2235	break;
	2236
	2237	/*
	2238	* BQUEUE_CLEAN - B_AGE special case. If not set the bp
	2239	* cycles through the queue twice before being selected.
	2240	*/
	2241	if ((bp->b_flags & B_AGE) == 0 && TAILQ_NEXT(bp, b_freelist)) {
	2242	bp->b_flags \|= B_AGE;
	2243	TAILQ_REMOVE(&bufqueues[BQUEUE_CLEAN], bp, b_freelist);
	2244	TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_CLEAN],
	2245	bp, b_freelist);
	2246	continue;
	2247	}
	2248
	2249	/*
	2250	* Sanity Checks
	2251	*/
	2252	KKASSERT(bp->b_qindex == BQUEUE_CLEAN);
	2253	KKASSERT((bp->b_flags & B_DELWRI) == 0);
	2254
	2255	/*
	2256	* Start freeing the bp. This is somewhat involved.
	2257	*
	2258	* Buffers on the clean list must be disassociated from
	2259	* their current vnode
	2260	*/
	2261
	2262	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT) != 0) {
	2263	kprintf("recoverbufpages: warning, locked buf %p, race corrected\n", bp);
	2264	tsleep(&bd_request, 0, "gnbxxx", hz / 100);
	2265	continue;
	2266	}
	2267	if (bp->b_qindex != BQUEUE_CLEAN) {
	2268	kprintf("recoverbufpages: warning, BUF_LOCK blocked unexpectedly on buf %p index %d, race corrected\n", bp, bp->b_qindex);
	2269	BUF_UNLOCK(bp);
	2270	continue;
	2271	}
	2272	bremfree_locked(bp);
	2273	spin_unlock_wr(&bufspin);
	2274
	2275	/*
	2276	* Dependancies must be handled before we disassociate the
	2277	* vnode.
	2278	*
	2279	* NOTE: HAMMER will set B_LOCKED if the buffer cannot
	2280	* be immediately disassociated. HAMMER then becomes
	2281	* responsible for releasing the buffer.
	2282	*/
	2283	if (LIST_FIRST(&bp->b_dep) != NULL) {
	2284	buf_deallocate(bp);
	2285	if (bp->b_flags & B_LOCKED) {
	2286	bqrelse(bp);
	2287	spin_lock_wr(&bufspin);
	2288	continue;
	2289	}
	2290	KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
	2291	}
	2292
	2293	bytes += bp->b_bufsize;
	2294
	2295	get_mplock();
	2296	if (bp->b_flags & B_VMIO) {
	2297	bp->b_flags \|= B_DIRECT; /* try to free pages */
	2298	vfs_vmio_release(bp);
	2299	}
	2300	if (bp->b_vp)
	2301	brelvp(bp);
	2302
	2303	KKASSERT(bp->b_vp == NULL);
	2304	KKASSERT((bp->b_flags & B_HASHED) == 0);
	2305
	2306	/*
	2307	* critical section protection is not required when
	2308	* scrapping a buffer's contents because it is already
	2309	* wired.
	2310	*/
	2311	if (bp->b_bufsize)
	2312	allocbuf(bp, 0);
	2313	rel_mplock();
	2314
	2315	bp->b_flags = B_BNOCLIP;
	2316	bp->b_cmd = BUF_CMD_DONE;
	2317	bp->b_vp = NULL;
	2318	bp->b_error = 0;
	2319	bp->b_resid = 0;
	2320	bp->b_bcount = 0;
	2321	bp->b_xio.xio_npages = 0;
	2322	bp->b_dirtyoff = bp->b_dirtyend = 0;
	2323	reinitbufbio(bp);
	2324	KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
	2325	buf_dep_init(bp);
	2326	bp->b_flags \|= B_INVAL;
	2327	/* bfreekva(bp); */
	2328	brelse(bp);
	2329	spin_lock_wr(&bufspin);
	2330	}
	2331	spin_unlock_wr(&bufspin);
	2332	return(bytes);
	2333	}
	2334
	2335	/*
	2336	* buf_daemon:
	2337	*
	2338	* Buffer flushing daemon. Buffers are normally flushed by the
	2339	* update daemon but if it cannot keep up this process starts to
	2340	* take the load in an attempt to prevent getnewbuf() from blocking.
	2341	*
	2342	* Once a flush is initiated it does not stop until the number
	2343	* of buffers falls below lodirtybuffers, but we will wake up anyone
	2344	* waiting at the mid-point.
	2345	*/
	2346
	/*
	 * kproc descriptor for the "bufdaemon" thread: its name, its entry
	 * point (buf_daemon) and the bufdaemon_td thread pointer.  The SYSINIT
	 * below hands the descriptor to kproc_start at SI_SUB_KTHREAD_BUF,
	 * SI_ORDER_FIRST.
	 */
	2347	static struct kproc_desc buf_kp = {
	2348	"bufdaemon",
	2349	buf_daemon,
	2350	&bufdaemon_td
	2351	};
	2352	SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST,
	2353	kproc_start, &buf_kp)
	2354
	/*
	 * Same arrangement for the "bufdaemon_hw" thread, whose entry point is
	 * buf_daemon_hw and whose thread pointer is bufdaemonhw_td.
	 */
	2355	static struct kproc_desc bufhw_kp = {
	2356	"bufdaemon_hw",
	2357	buf_daemon_hw,
	2358	&bufdaemonhw_td
	2359	};
	2360	SYSINIT(bufdaemon_hw, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST,
	2361	kproc_start, &bufhw_kp)
	2362
	2363	static void
	2364	buf_daemon(void)
	2365	{
	2366	int limit;
	2367
	2368	/*
	2369	* This process needs to be suspended prior to shutdown sync.
	2370	*/
	2371	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
	2372	bufdaemon_td, SHUTDOWN_PRI_LAST);
	2373	curthread->td_flags \|= TDF_SYSTHREAD;
	2374
	2375	/*
	2376	* This process is allowed to take the buffer cache to the limit
	2377	*/
	2378	crit_enter();
	2379
	2380	for (;;) {
	2381	kproc_suspend_loop();
	2382
	2383	/*
	2384	* Do the flush as long as the number of dirty buffers
	2385	* (including those running) exceeds lodirtybufspace.
	2386	*
	2387	* When flushing limit running I/O to hirunningspace
	2388	* Do the flush. Limit the amount of in-transit I/O we
	2389	* allow to build up, otherwise we would completely saturate
	2390	* the I/O system. Wakeup any waiting processes before we
	2391	* normally would so they can run in parallel with our drain.
	2392	*
	2393	* Our aggregate normal+HW lo water mark is lodirtybufspace,
	2394	* but because we split the operation into two threads we
	2395	* have to cut it in half for each thread.
	2396	*/
	2397	waitrunningbufspace();
	2398	limit = lodirtybufspace / 2;
	2399	while (runningbufspace + dirtybufspace > limit \|\|
	2400	dirtybufcount - dirtybufcounthw >= nbuf / 2) {
	2401	if (flushbufqueues(BQUEUE_DIRTY) == 0)
	2402	break;
	2403	if (runningbufspace < hirunningspace)
	2404	continue;
	2405	waitrunningbufspace();
	2406	}
	2407
	2408	/*
	2409	* We reached our low water mark, reset the
	2410	* request and sleep until we are needed again.
	2411	* The sleep is just so the suspend code works.
	2412	*/
	2413	spin_lock_wr(&needsbuffer_spin);
	2414	if (bd_request == 0) {
	2415	ssleep(&bd_request, &needsbuffer_spin, 0,
	2416	"psleep", hz);
	2417	}
	2418	bd_request = 0;
	2419	spin_unlock_wr(&needsbuffer_spin);
	2420	}
	2421	}
	2422
	2423	static void
	2424	buf_daemon_hw(void)
	2425	{
	2426	int limit;
	2427
	2428	/*
	2429	* This process needs to be suspended prior to shutdown sync.
	2430	*/
	2431	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
	2432	bufdaemonhw_td, SHUTDOWN_PRI_LAST);
	2433	curthread->td_flags \|= TDF_SYSTHREAD;
	2434
	2435	/*
	2436	* This process is allowed to take the buffer cache to the limit
	2437	*/
	2438	crit_enter();
	2439
	2440	for (;;) {
	2441	kproc_suspend_loop();
	2442
	2443	/*
	2444	* Do the flush. Limit the amount of in-transit I/O we
	2445	* allow to build up, otherwise we would completely saturate
	2446	* the I/O system. Wakeup any waiting processes before we
	2447	* normally would so they can run in parallel with our drain.
	2448	*
	2449	* Once we decide to flush push the queued I/O up to
	2450	* hirunningspace in order to trigger bursting by the bioq
	2451	* subsystem.
	2452	*
	2453	* Our aggregate normal+HW lo water mark is lodirtybufspace,
	2454	* but because we split the operation into two threads we
	2455	* have to cut it in half for each thread.
	2456	*/
	2457	waitrunningbufspace();
	2458	limit = lodirtybufspace / 2;
	2459	while (runningbufspace + dirtybufspacehw > limit \|\|
	2460	dirtybufcounthw >= nbuf / 2) {
	2461	if (flushbufqueues(BQUEUE_DIRTY_HW) == 0)
	2462	break;
	2463	if (runningbufspace < hirunningspace)
	2464	continue;
	2465	waitrunningbufspace();
	2466	}
	2467
	2468	/*
	2469	* We reached our low water mark, reset the
	2470	* request and sleep until we are needed again.
	2471	* The sleep is just so the suspend code works.
	2472	*/
	2473	spin_lock_wr(&needsbuffer_spin);
	2474	if (bd_request_hw == 0) {
	2475	ssleep(&bd_request_hw, &needsbuffer_spin, 0,
	2476	"psleep", hz);
	2477	}
	2478	bd_request_hw = 0;
	2479	spin_unlock_wr(&needsbuffer_spin);
	2480	}
	2481	}
	2482
	2483	/*
	2484	* flushbufqueues:
	2485	*
	2486	* Try to flush a buffer in the dirty queue. We must be careful to
	2487	* free up B_INVAL buffers instead of write them, which NFS is
	2488	* particularly sensitive to.
	2489	*
	2490	* B_RELBUF may only be set by VFSs. We do set B_AGE to indicate
	2491	* that we really want to try to get the buffer out and reuse it
	2492	* due to the write load on the machine.
	2493	*/
	2494	static int
	2495	flushbufqueues(bufq_type_t q)
	2496	{
	2497	struct buf *bp;
	2498	int r = 0;
	2499	int spun;
	2500
	2501	spin_lock_wr(&bufspin);
	2502	spun = 1;
	2503
	2504	bp = TAILQ_FIRST(&bufqueues[q]);
	2505	while (bp) {
	2506	KASSERT((bp->b_flags & B_DELWRI),
	2507	("unexpected clean buffer %p", bp));
	2508
	2509	if (bp->b_flags & B_DELWRI) {
	2510	if (bp->b_flags & B_INVAL) {
	2511	spin_unlock_wr(&bufspin);
	2512	spun = 0;
	2513	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT) != 0)
	2514	panic("flushbufqueues: locked buf");
	2515	bremfree(bp);
	2516	brelse(bp);
	2517	++r;
	2518	break;
	2519	}
	2520	if (LIST_FIRST(&bp->b_dep) != NULL &&
	2521	(bp->b_flags & B_DEFERRED) == 0 &&
	2522	buf_countdeps(bp, 0)) {
	2523	TAILQ_REMOVE(&bufqueues[q], bp, b_freelist);
	2524	TAILQ_INSERT_TAIL(&bufqueues[q], bp,
	2525	b_freelist);
	2526	bp->b_flags \|= B_DEFERRED;
	2527	bp = TAILQ_FIRST(&bufqueues[q]);
	2528	continue;
	2529	}
	2530
	2531	/*
	2532	* Only write it out if we can successfully lock
	2533	* it. If the buffer has a dependancy,
	2534	* buf_checkwrite must also return 0 for us to
	2535	* be able to initate the write.
	2536	*
	2537	* If the buffer is flagged B_ERROR it may be
	2538	* requeued over and over again, we try to
	2539	* avoid a live lock.
	2540	*/
	2541	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT) == 0) {
	2542	spin_unlock_wr(&bufspin);
	2543	spun = 0;
	2544	if (LIST_FIRST(&bp->b_dep) != NULL &&
	2545	buf_checkwrite(bp)) {
	2546	bremfree(bp);
	2547	brelse(bp);
	2548	} else if (bp->b_flags & B_ERROR) {
	2549	tsleep(bp, 0, "bioer", 1);
	2550	bp->b_flags &= ~B_AGE;
	2551	vfs_bio_awrite(bp);
	2552	} else {
	2553	bp->b_flags \|= B_AGE;
	2554	vfs_bio_awrite(bp);
	2555	}
	2556	++r;
	2557	break;
	2558	}
	2559	}
	2560	bp = TAILQ_NEXT(bp, b_freelist);
	2561	}
	2562	if (spun)
	2563	spin_unlock_wr(&bufspin);
	2564	return (r);
	2565	}
	2566
	2567	/*
	2568	* inmem:
	2569	*
	2570	* Returns true if no I/O is needed to access the associated VM object.
	2571	* This is like findblk except it also hunts around in the VM system for
	2572	* the data.
	2573	*
	2574	* Note that we ignore vm_page_free() races from interrupts against our
	2575	* lookup, since if the caller is not protected our return value will not
	2576	* be any more valid then otherwise once we exit the critical section.
	2577	*/
	2578	int
	2579	inmem(struct vnode *vp, off_t loffset)
	2580	{
	2581	vm_object_t obj;
	2582	vm_offset_t toff, tinc, size;
	2583	vm_page_t m;
	2584
	2585	if (findblk(vp, loffset, FINDBLK_TEST))
	2586	return 1;
	2587	if (vp->v_mount == NULL)
	2588	return 0;
	2589	if ((obj = vp->v_object) == NULL)
	2590	return 0;
	2591
	2592	size = PAGE_SIZE;
	2593	if (size > vp->v_mount->mnt_stat.f_iosize)
	2594	size = vp->v_mount->mnt_stat.f_iosize;
	2595
	2596	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
	2597	m = vm_page_lookup(obj, OFF_TO_IDX(loffset + toff));
	2598	if (m == NULL)
	2599	return 0;
	2600	tinc = size;
	2601	if (tinc > PAGE_SIZE - ((toff + loffset) & PAGE_MASK))
	2602	tinc = PAGE_SIZE - ((toff + loffset) & PAGE_MASK);
	2603	if (vm_page_is_valid(m,
	2604	(vm_offset_t) ((toff + loffset) & PAGE_MASK), tinc) == 0)
	2605	return 0;
	2606	}
	2607	return 1;
	2608	}
	2609
	2610	/*
	2611	* findblk:
	2612	*
	2613	* Locate and return the specified buffer. Unless flagged otherwise,
	2614	* a locked buffer will be returned if it exists or NULL if it does not.
	2615	*
	2616	* findblk()'d buffers are still on the bufqueues and if you intend
	2617	* to use your (locked NON-TEST) buffer you need to bremfree(bp)
	2618	* and possibly do other stuff to it.
	2619	*
	2620	* FINDBLK_TEST - Do not lock the buffer. The caller is responsible
	2621	* for locking the buffer and ensuring that it remains
	2622	* the desired buffer after locking.
	2623	*
	2624	* FINDBLK_NBLOCK - Lock the buffer non-blocking. If we are unable
	2625	* to acquire the lock we return NULL, even if the
	2626	* buffer exists.
	2627	*
	2628	* (0) - Lock the buffer blocking.
	2629	*
	2630	* MPSAFE
	2631	*/
	2632	struct buf *
	2633	findblk(struct vnode *vp, off_t loffset, int flags)
	2634	{
	2635	struct buf *bp;
	2636	int lkflags;
	2637
	2638	lkflags = LK_EXCLUSIVE;
	2639	if (flags & FINDBLK_NBLOCK)
	2640	lkflags \|= LK_NOWAIT;
	2641
	2642	for (;;) {
	2643	lwkt_gettoken(&vp->v_token);
	2644	bp = buf_rb_hash_RB_LOOKUP(&vp->v_rbhash_tree, loffset);
	2645	lwkt_reltoken(&vp->v_token);
	2646	if (bp == NULL \|\| (flags & FINDBLK_TEST))
	2647	break;
	2648	if (BUF_LOCK(bp, lkflags)) {
	2649	bp = NULL;
	2650	break;
	2651	}
	2652	if (bp->b_vp == vp && bp->b_loffset == loffset)
	2653	break;
	2654	BUF_UNLOCK(bp);
	2655	}
	2656	return(bp);
	2657	}
	2658
	2659	/*
	2660	* getcacheblk:
	2661	*
	2662	* Similar to getblk() except only returns the buffer if it is
	2663	* B_CACHE and requires no other manipulation. Otherwise NULL
	2664	* is returned.
	2665	*
	2666	* If B_RAM is set the buffer might be just fine, but we return
	2667	* NULL anyway because we want the code to fall through to the
	2668	* cluster read. Otherwise read-ahead breaks.
	2669	*/
	2670	struct buf *
	2671	getcacheblk(struct vnode *vp, off_t loffset)
	2672	{
	2673	struct buf *bp;
	2674
	2675	bp = findblk(vp, loffset, 0);
	2676	if (bp) {
	2677	if ((bp->b_flags & (B_INVAL \| B_CACHE \| B_RAM)) == B_CACHE) {
	2678	bp->b_flags &= ~B_AGE;
	2679	bremfree(bp);
	2680	} else {
	2681	BUF_UNLOCK(bp);
	2682	bp = NULL;
	2683	}
	2684	}
	2685	return (bp);
	2686	}
	2687
	2688	/*
	2689	* getblk:
	2690	*
	2691	* Get a block given a specified block and offset into a file/device.
	2692	* B_INVAL may or may not be set on return. The caller should clear
	2693	* B_INVAL prior to initiating a READ.
	2694	*
	2695	* IT IS IMPORTANT TO UNDERSTAND THAT IF YOU CALL GETBLK() AND B_CACHE
	2696	* IS NOT SET, YOU MUST INITIALIZE THE RETURNED BUFFER, ISSUE A READ,
	2697	* OR SET B_INVAL BEFORE RETIRING IT. If you retire a getblk'd buffer
	2698	* without doing any of those things the system will likely believe
	2699	* the buffer to be valid (especially if it is not B_VMIO), and the
	2700	* next getblk() will return the buffer with B_CACHE set.
	2701	*
	2702	* For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
	2703	* an existing buffer.
	2704	*
	2705	* For a VMIO buffer, B_CACHE is modified according to the backing VM.
	2706	* If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
	2707	* and then cleared based on the backing VM. If the previous buffer is
	2708	* non-0-sized but invalid, B_CACHE will be cleared.
	2709	*
	2710	* If getblk() must create a new buffer, the new buffer is returned with
	2711	* both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
	2712	* case it is returned with B_INVAL clear and B_CACHE set based on the
	2713	* backing VM.
	2714	*
	2715	* getblk() also forces a bwrite() for any B_DELWRI buffer whose
	2716	* B_CACHE bit is clear.
	2717	*
	2718	* What this means, basically, is that the caller should use B_CACHE to
	2719	* determine whether the buffer is fully valid or not and should clear
	2720	* B_INVAL prior to issuing a read. If the caller intends to validate
	2721	* the buffer by loading its data area with something, the caller needs
	2722	* to clear B_INVAL. If the caller does this without issuing an I/O,
	2723	* the caller should set B_CACHE ( as an optimization ), else the caller
	2724	* should issue the I/O and biodone() will set B_CACHE if the I/O was
	2725	* a write attempt or if it was a successful read. If the caller
	2726	* intends to issue a READ, the caller must clear B_INVAL and B_ERROR
	2727	* prior to issuing the READ. biodone() will not clear B_INVAL.
	2728	*
	2729	* getblk flags:
	2730	*
	2731	* GETBLK_PCATCH - catch signal if blocked, can cause NULL return
	2732	* GETBLK_BHEAVY - heavy-weight buffer cache buffer
	2733	*
	2734	* MPALMOSTSAFE
	2735	*/
	2736	struct buf *
	2737	getblk(struct vnode *vp, off_t loffset, int size, int blkflags, int slptimeo)
	2738	{
	2739	struct buf *bp;
	/* slpflags is only consulted when getnewbuf() comes back with NULL below */
	2740	int slpflags = (blkflags & GETBLK_PCATCH) ? PCATCH : 0;
	2741	int error;
	2742	int lkflags;
	2743
	2744	if (size > MAXBSIZE)
	2745	panic("getblk: size(%d) > MAXBSIZE(%d)", size, MAXBSIZE);
	2746	if (vp->v_object == NULL)
	2747	panic("getblk: vnode %p has no object!", vp);
	2748
	/*
	 * Every path that cannot return a usable buffer either returns NULL
	 * or jumps back here to redo the lookup from scratch.
	 */
	2749	loop:
	2750	if ((bp = findblk(vp, loffset, FINDBLK_TEST)) != NULL) {
	2751	/*
	2752	* The buffer was found in the cache, but we need to lock it.
	2753	* Even with LK_NOWAIT the lockmgr may break our critical
	2754	* section, so double-check the validity of the buffer
	2755	* once the lock has been obtained.
	2756	*/
	2757	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	/* GETBLK_NOWAIT: a busy buffer yields NULL instead of a sleep */
	2758	if (blkflags & GETBLK_NOWAIT)
	2759	return(NULL);
	2760	lkflags = LK_EXCLUSIVE \| LK_SLEEPFAIL;
	2761	if (blkflags & GETBLK_PCATCH)
	2762	lkflags \|= LK_PCATCH;
	2763	error = BUF_TIMELOCK(bp, lkflags, "getblk", slptimeo);
	2764	if (error) {
	/*
	 * ENOLCK restarts the lookup (presumably the LK_SLEEPFAIL
	 * "had to sleep" result -- confirm against lockmgr); any
	 * other error is reported to the caller as NULL.
	 */
	2765	if (error == ENOLCK)
	2766	goto loop;
	2767	return (NULL);
	2768	}
	2769	/* buffer may have changed on us */
	2770	}
	2771
	2772	/*
	2773	* Once the buffer has been locked, make sure we didn't race
	2774	* a buffer recycling. Buffers that are no longer hashed
	2775	* will have b_vp == NULL, so this takes care of that check
	2776	* as well.
	2777	*/
	2778	if (bp->b_vp != vp \|\| bp->b_loffset != loffset) {
	2779	kprintf("Warning buffer %p (vp %p loffset %lld) "
	2780	"was recycled\n",
	2781	bp, vp, (long long)loffset);
	2782	BUF_UNLOCK(bp);
	2783	goto loop;
	2784	}
	2785
	2786	/*
	2787	* If SZMATCH any pre-existing buffer must be of the requested
	2788	* size or NULL is returned. The caller absolutely does not
	2789	* want getblk() to bwrite() the buffer on a size mismatch.
	2790	*/
	2791	if ((blkflags & GETBLK_SZMATCH) && size != bp->b_bcount) {
	2792	BUF_UNLOCK(bp);
	2793	return(NULL);
	2794	}
	2795
	2796	/*
	2797	* All vnode-based buffers must be backed by a VM object.
	2798	*/
	2799	KKASSERT(bp->b_flags & B_VMIO);
	2800	KKASSERT(bp->b_cmd == BUF_CMD_DONE);
	2801	bp->b_flags &= ~B_AGE;
	2802
	2803	/*
	2804	* Make sure that B_INVAL buffers do not have a cached
	2805	* block number translation.
	2806	*/
	2807	if ((bp->b_flags & B_INVAL) && (bp->b_bio2.bio_offset != NOOFFSET)) {
	2808	kprintf("Warning invalid buffer %p (vp %p loffset %lld)"
	2809	" did not have cleared bio_offset cache\n",
	2810	bp, vp, (long long)loffset);
	2811	clearbiocache(&bp->b_bio2);
	2812	}
	2813
	2814	/*
	2815	* The buffer is locked. B_CACHE is cleared if the buffer is
	2816	* invalid.
	2817	*/
	2818	if (bp->b_flags & B_INVAL)
	2819	bp->b_flags &= ~B_CACHE;
	2820	bremfree(bp);
	2821
	2822	/*
	2823	* Any size inconsistency with a dirty buffer or a buffer
	2824	* with a softupdates dependency must be resolved. Resizing
	2825	* the buffer in such circumstances can lead to problems.
	2826	*
	2827	* Dirty or dependent buffers are written synchronously.
	2828	* Other types of buffers are simply released and
	2829	* reconstituted as they may be backed by valid, dirty VM
	2830	* pages (but not marked B_DELWRI).
	2831	*
	2832	* NFS NOTE: NFS buffers which straddle EOF are oddly-sized
	2833	* and may be left over from a prior truncation (and thus
	2834	* no longer represent the actual EOF point), so we
	2835	* definitely do not want to B_NOCACHE the backing store.
	2836	*/
	2837	if (size != bp->b_bcount) {
	2838	get_mplock();
	2839	if (bp->b_flags & B_DELWRI) {
	2840	bp->b_flags \|= B_RELBUF;
	2841	bwrite(bp);
	2842	} else if (LIST_FIRST(&bp->b_dep)) {
	2843	bp->b_flags \|= B_RELBUF;
	2844	bwrite(bp);
	2845	} else {
	2846	bp->b_flags \|= B_RELBUF;
	2847	brelse(bp);
	2848	}
	2849	rel_mplock();
	2850	goto loop;
	2851	}
	2852	KKASSERT(size <= bp->b_kvasize);
	2853	KASSERT(bp->b_loffset != NOOFFSET,
	2854	("getblk: no buffer offset"));
	2855
	2856	/*
	2857	* A buffer with B_DELWRI set and B_CACHE clear must
	2858	* be committed before we can return the buffer in
	2859	* order to prevent the caller from issuing a read
	2860	* ( due to B_CACHE not being set ) and overwriting
	2861	* it.
	2862	*
	2863	* Most callers, including NFS and FFS, need this to
	2864	* operate properly either because they assume they
	2865	* can issue a read if B_CACHE is not set, or because
	2866	* ( for example ) an uncached B_DELWRI might loop due
	2867	* to softupdates re-dirtying the buffer. In the latter
	2868	* case, B_CACHE is set after the first write completes,
	2869	* preventing further loops.
	2870	*
	2871	* NOTE! b*write() sets B_CACHE. If we cleared B_CACHE
	2872	* above while extending the buffer, we cannot allow the
	2873	* buffer to remain with B_CACHE set after the write
	2874	* completes or it will represent a corrupt state. To
	2875	* deal with this we set B_NOCACHE to scrap the buffer
	2876	* after the write.
	2877	*
	2878	* XXX Should this be B_RELBUF instead of B_NOCACHE?
	2879	* I'm not even sure this state is still possible
	2880	* now that getblk() writes out any dirty buffers
	2881	* on size changes.
	2882	*
	2883	* We might be able to do something fancy, like setting
	2884	* B_CACHE in bwrite() except if B_DELWRI is already set,
	2885	* so the below call doesn't set B_CACHE, but that gets real
	2886	* confusing. This is much easier.
	2887	*/
	2888
	2889	if ((bp->b_flags & (B_CACHE\|B_DELWRI)) == B_DELWRI) {
	2890	get_mplock();
	2891	kprintf("getblk: Warning, bp %p loff=%jx DELWRI set "
	2892	"and CACHE clear, b_flags %08x\n",
	2893	bp, (intmax_t)bp->b_loffset, bp->b_flags);
	2894	bp->b_flags \|= B_NOCACHE;
	2895	bwrite(bp);
	2896	rel_mplock();
	2897	goto loop;
	2898	}
	2899	} else {
	2900	/*
	2901	* Buffer is not in-core, create new buffer. The buffer
	2902	* returned by getnewbuf() is locked. Note that the returned
	2903	* buffer is also considered valid (not marked B_INVAL).
	2904	*
	2905	* Calculating the offset for the I/O requires figuring out
	2906	* the block size. We use DEV_BSIZE for VBLK or VCHR and
	2907	* the mount's f_iosize otherwise. If the vnode does not
	2908	* have an associated mount we assume that the passed size is
	2909	* the block size.
	2910	*
	2911	* Note that vn_isdisk() cannot be used here since it may
	2912	* return a failure for numerous reasons. Note that the
	2913	* buffer size may be larger than the block size (the caller
	2914	* will use block numbers with the proper multiple). Beware
	2915	* of using any v_* fields which are part of unions. In
	2916	* particular, in DragonFly the mount point overloading
	2917	* mechanism uses the namecache only and the underlying
	2918	* directory vnode is not a special case.
	2919	*/
	2920	int bsize, maxsize;
	2921
	2922	if (vp->v_type == VBLK \|\| vp->v_type == VCHR)
	2923	bsize = DEV_BSIZE;
	2924	else if (vp->v_mount)
	2925	bsize = vp->v_mount->mnt_stat.f_iosize;
	2926	else
	2927	bsize = size;
	2928
	/*
	 * KVA to reserve: the requested size plus the offset's position
	 * within its page, but never less than one block.
	 */
	2929	maxsize = size + (loffset & PAGE_MASK);
	2930	maxsize = imax(maxsize, bsize);
	2931
	2932	bp = getnewbuf(blkflags, slptimeo, size, maxsize);
	2933	if (bp == NULL) {
	/*
	 * A NULL is only passed on to the caller when signals
	 * (GETBLK_PCATCH) or a timeout were requested; otherwise
	 * start over.
	 */
	2934	if (slpflags \|\| slptimeo)
	2935	return NULL;
	2936	goto loop;
	2937	}
	2938
	2939	/*
	2940	* Atomically insert the buffer into the hash, so that it can
	2941	* be found by findblk().
	2942	*
	2943	* If bgetvp() returns non-zero a collision occurred, and the
	2944	* bp will not be associated with the vnode.
	2945	*
	2946	* Make sure the translation layer has been cleared.
	2947	*/
	2948	bp->b_loffset = loffset;
	2949	bp->b_bio2.bio_offset = NOOFFSET;
	2950	/* bp->b_bio2.bio_next = NULL; */
	2951
	2952	if (bgetvp(vp, bp)) {
	2953	bp->b_flags \|= B_INVAL;
	2954	brelse(bp);
	2955	goto loop;
	2956	}
	2957
	2958	/*
	2959	* All vnode-based buffers must be backed by a VM object.
	2960	*/
	2961	KKASSERT(vp->v_object != NULL);
	2962	bp->b_flags \|= B_VMIO;
	2963	KKASSERT(bp->b_cmd == BUF_CMD_DONE);
	2964
	2965	get_mplock();
	2966	allocbuf(bp, size);
	2967	rel_mplock();
	2968	}
	2969	KKASSERT(dsched_is_clear_buf_priv(bp));
	2970	return (bp);
	2971	}
	2972
	2973	/*
	2974	* regetblk(bp)
	2975	*
	2976	* Reacquire a buffer that was previously released to the locked queue,
	2977	* or reacquire a buffer which is interlocked by having bioops->io_deallocate
	2978	* set B_LOCKED (which handles the acquisition race).
	2979	*
	2980	* To this end, either B_LOCKED must be set or the dependency list must be
	2981	* non-empty; this precondition is asserted on entry.
	2982	*
	* The buffer is locked exclusively with LK_RETRY and then taken off its
	* queue with bremfree().  Nothing is returned.
	*
	2983	* MPSAFE
	2984	*/
	2985	void
	2986	regetblk(struct buf *bp)
	2987	{
	2988	KKASSERT((bp->b_flags & B_LOCKED) \|\| LIST_FIRST(&bp->b_dep) != NULL);
	/* the BUF_LOCK() return value is not checked here */
	2989	BUF_LOCK(bp, LK_EXCLUSIVE \| LK_RETRY);
	2990	bremfree(bp);
	2991	}
	2992
	2993	/*
	2994	* geteblk:
	2995	*
	2996	* Get an empty, disassociated buffer of given size. The buffer is
	2997	* initially set to B_INVAL.
	2998	*
	2999	* critical section protection is not required for the allocbuf()
	3000	* call because races are impossible here.
	3001	*
	3002	* MPALMOSTSAFE
	3003	*/
	3004	struct buf *
	3005	geteblk(int size)
	3006	{
	3007	struct buf *bp;
	3008	int maxsize;
	3009
	3010	maxsize = (size + BKVAMASK) & ~BKVAMASK;
	3011
	3012	while ((bp = getnewbuf(0, 0, size, maxsize)) == 0)
	3013	;
	3014	get_mplock();
	3015	allocbuf(bp, size);
	3016	rel_mplock();
	3017	bp->b_flags \|= B_INVAL; /* b_dep cleared by getnewbuf() */
	3018	KKASSERT(dsched_is_clear_buf_priv(bp));
	3019	return (bp);
	3020	}
	3021
	3022
	3023	/*
	3024	* allocbuf:
	3025	*
	3026	* This code constitutes the buffer memory from either anonymous system
	3027	* memory (in the case of non-VMIO operations) or from an associated
	3028	* VM object (in the case of VMIO operations). This code is able to
	3029	* resize a buffer up or down.
	3030	*
	3031	* Note that this code is tricky, and has many complications to resolve
	3032	* deadlock or inconsistent data situations. Tread lightly!!!
	3033	* There are B_CACHE and B_DELWRI interactions that must be dealt with by
	3034	* the caller. Calling this code willy nilly can result in the loss of data.
	3035	*
	3036	* allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with
	3037	* B_CACHE for the non-VMIO case.
	3038	*
	3039	* This routine does not need to be called from a critical section but you
	3040	* must own the buffer.
	3041	*
	3042	* NOTMPSAFE
	3043	*/
	3044	int
	3045	allocbuf(struct buf *bp, int size)
	3046	{
	3047	int newbsize, mbsize;
	3048	int i;
	3049
	3050	if (BUF_REFCNT(bp) == 0)
	3051	panic("allocbuf: buffer not busy");
	3052
	3053	if (bp->b_kvasize < size)
	3054	panic("allocbuf: buffer too small");
	3055
	3056	if ((bp->b_flags & B_VMIO) == 0) {
	3057	caddr_t origbuf;
	3058	int origbufsize;
	3059	/*
	3060	* Just get anonymous memory from the kernel. Don't
	3061	* mess with B_CACHE.
	3062	*/
	3063	mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
	3064	if (bp->b_flags & B_MALLOC)
	3065	newbsize = mbsize;
	3066	else
	3067	newbsize = round_page(size);
	3068
	3069	if (newbsize < bp->b_bufsize) {
	3070	/*
	3071	* Malloced buffers are not shrunk
	3072	*/
	3073	if (bp->b_flags & B_MALLOC) {
	3074	if (newbsize) {
	3075	bp->b_bcount = size;
	3076	} else {
	3077	kfree(bp->b_data, M_BIOBUF);
	3078	if (bp->b_bufsize) {
	3079	bufmallocspace -= bp->b_bufsize;
	3080	bufspacewakeup();
	3081	bp->b_bufsize = 0;
	3082	}
	3083	bp->b_data = bp->b_kvabase;
	3084	bp->b_bcount = 0;
	3085	bp->b_flags &= ~B_MALLOC;
	3086	}
	3087	return 1;
	3088	}
	3089	vm_hold_free_pages(
	3090	bp,
	3091	(vm_offset_t) bp->b_data + newbsize,
	3092	(vm_offset_t) bp->b_data + bp->b_bufsize);
	3093	} else if (newbsize > bp->b_bufsize) {
	3094	/*
	3095	* We only use malloced memory on the first allocation.
	3096	* and revert to page-allocated memory when the buffer
	3097	* grows.
	3098	*/
	3099	if ((bufmallocspace < maxbufmallocspace) &&
	3100	(bp->b_bufsize == 0) &&
	3101	(mbsize <= PAGE_SIZE/2)) {
	3102
	3103	bp->b_data = kmalloc(mbsize, M_BIOBUF, M_WAITOK);
	3104	bp->b_bufsize = mbsize;
	3105	bp->b_bcount = size;
	3106	bp->b_flags \|= B_MALLOC;
	3107	bufmallocspace += mbsize;
	3108	return 1;
	3109	}
	3110	origbuf = NULL;
	3111	origbufsize = 0;
	3112	/*
	3113	* If the buffer is growing on its other-than-first
	3114	* allocation, then we revert to the page-allocation
	3115	* scheme.
	3116	*/
	3117	if (bp->b_flags & B_MALLOC) {
	3118	origbuf = bp->b_data;
	3119	origbufsize = bp->b_bufsize;
	3120	bp->b_data = bp->b_kvabase;
	3121	if (bp->b_bufsize) {
	3122	bufmallocspace -= bp->b_bufsize;
	3123	bufspacewakeup();
	3124	bp->b_bufsize = 0;
	3125	}
	3126	bp->b_flags &= ~B_MALLOC;
	3127	newbsize = round_page(newbsize);
	3128	}
	3129	vm_hold_load_pages(
	3130	bp,
	3131	(vm_offset_t) bp->b_data + bp->b_bufsize,
	3132	(vm_offset_t) bp->b_data + newbsize);
	3133	if (origbuf) {
	3134	bcopy(origbuf, bp->b_data, origbufsize);
	3135	kfree(origbuf, M_BIOBUF);
	3136	}
	3137	}
	3138	} else {
	3139	vm_page_t m;
	3140	int desiredpages;
	3141
	3142	newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
	3143	desiredpages = ((int)(bp->b_loffset & PAGE_MASK) +
	3144	newbsize + PAGE_MASK) >> PAGE_SHIFT;
	3145	KKASSERT(desiredpages <= XIO_INTERNAL_PAGES);
	3146
	3147	if (bp->b_flags & B_MALLOC)
	3148	panic("allocbuf: VMIO buffer can't be malloced");
	3149	/*
	3150	* Set B_CACHE initially if buffer is 0 length or will become
	3151	* 0-length.
	3152	*/
	3153	if (size == 0 \|\| bp->b_bufsize == 0)
	3154	bp->b_flags \|= B_CACHE;
	3155
	3156	if (newbsize < bp->b_bufsize) {
	3157	/*
	3158	* DEV_BSIZE aligned new buffer size is less then the
	3159	* DEV_BSIZE aligned existing buffer size. Figure out
	3160	* if we have to remove any pages.
	3161	*/
	3162	if (desiredpages < bp->b_xio.xio_npages) {
	3163	for (i = desiredpages; i < bp->b_xio.xio_npages; i++) {
	3164	/*
	3165	* the page is not freed here -- it
	3166	* is the responsibility of
	3167	* vnode_pager_setsize
	3168	*/
	3169	m = bp->b_xio.xio_pages[i];
	3170	KASSERT(m != bogus_page,
	3171	("allocbuf: bogus page found"));
	3172	while (vm_page_sleep_busy(m, TRUE, "biodep"))
	3173	;
	3174
	3175	bp->b_xio.xio_pages[i] = NULL;
	3176	vm_page_unwire(m, 0);
	3177	}
	3178	pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
	3179	(desiredpages << PAGE_SHIFT), (bp->b_xio.xio_npages - desiredpages));
	3180	bp->b_xio.xio_npages = desiredpages;
	3181	}
	3182	} else if (size > bp->b_bcount) {
	3183	/*
	3184	* We are growing the buffer, possibly in a
	3185	* byte-granular fashion.
	3186	*/
	3187	struct vnode *vp;
	3188	vm_object_t obj;
	3189	vm_offset_t toff;
	3190	vm_offset_t tinc;
	3191
	3192	/*
	3193	* Step 1, bring in the VM pages from the object,
	3194	* allocating them if necessary. We must clear
	3195	* B_CACHE if these pages are not valid for the
	3196	* range covered by the buffer.
	3197	*
	3198	* critical section protection is required to protect
	3199	* against interrupts unbusying and freeing pages
	3200	* between our vm_page_lookup() and our
	3201	* busycheck/wiring call.
	3202	*/
	3203	vp = bp->b_vp;
	3204	obj = vp->v_object;
	3205
	3206	crit_enter();
	3207	while (bp->b_xio.xio_npages < desiredpages) {
	3208	vm_page_t m;
	3209	vm_pindex_t pi;
	3210
	3211	pi = OFF_TO_IDX(bp->b_loffset) + bp->b_xio.xio_npages;
	3212	if ((m = vm_page_lookup(obj, pi)) == NULL) {
	3213	/*
	3214	* note: must allocate system pages
	3215	* since blocking here could intefere
	3216	* with paging I/O, no matter which
	3217	* process we are.
	3218	*/
	3219	m = bio_page_alloc(obj, pi, desiredpages - bp->b_xio.xio_npages);
	3220	if (m) {
	3221	vm_page_wire(m);
	3222	vm_page_wakeup(m);
	3223	vm_page_flag_clear(m, PG_ZERO);
	3224	bp->b_flags &= ~B_CACHE;
	3225	bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
	3226	++bp->b_xio.xio_npages;
	3227	}
	3228	continue;
	3229	}
	3230
	3231	/*
	3232	* We found a page. If we have to sleep on it,
	3233	* retry because it might have gotten freed out
	3234	* from under us.
	3235	*
	3236	* We can only test PG_BUSY here. Blocking on
	3237	* m->busy might lead to a deadlock:
	3238	*
	3239	* vm_fault->getpages->cluster_read->allocbuf
	3240	*
	3241	*/
	3242
	3243	if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
	3244	continue;
	3245	vm_page_flag_clear(m, PG_ZERO);
	3246	vm_page_wire(m);
	3247	bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
	3248	++bp->b_xio.xio_npages;
	3249	if (bp->b_act_count < m->act_count)
	3250	bp->b_act_count = m->act_count;
	3251	}
	3252	crit_exit();
	3253
	3254	/*
	3255	* Step 2. We've loaded the pages into the buffer,
	3256	* we have to figure out if we can still have B_CACHE
	3257	* set. Note that B_CACHE is set according to the
	3258	* byte-granular range ( bcount and size ), not the
	3259	* aligned range ( newbsize ).
	3260	*
	3261	* The VM test is against m->valid, which is DEV_BSIZE
	3262	* aligned. Needless to say, the validity of the data
	3263	* needs to also be DEV_BSIZE aligned. Note that this
	3264	* fails with NFS if the server or some other client
	3265	* extends the file's EOF. If our buffer is resized,
	3266	* B_CACHE may remain set! XXX
	3267	*/
	3268
	3269	toff = bp->b_bcount;
	3270	tinc = PAGE_SIZE - ((bp->b_loffset + toff) & PAGE_MASK);
	3271
	3272	while ((bp->b_flags & B_CACHE) && toff < size) {
	3273	vm_pindex_t pi;
	3274
	3275	if (tinc > (size - toff))
	3276	tinc = size - toff;
	3277
	3278	pi = ((bp->b_loffset & PAGE_MASK) + toff) >>
	3279	PAGE_SHIFT;
	3280
	3281	vfs_buf_test_cache(
	3282	bp,
	3283	bp->b_loffset,
	3284	toff,
	3285	tinc,
	3286	bp->b_xio.xio_pages[pi]
	3287	);
	3288	toff += tinc;
	3289	tinc = PAGE_SIZE;
	3290	}
	3291
	3292	/*
	3293	* Step 3, fixup the KVM pmap. Remember that
	3294	* bp->b_data is relative to bp->b_loffset, but
	3295	* bp->b_loffset may be offset into the first page.
	3296	*/
	3297
	3298	bp->b_data = (caddr_t)
	3299	trunc_page((vm_offset_t)bp->b_data);
	3300	pmap_qenter(
	3301	(vm_offset_t)bp->b_data,
	3302	bp->b_xio.xio_pages,
	3303	bp->b_xio.xio_npages
	3304	);
	3305	bp->b_data = (caddr_t)((vm_offset_t)bp->b_data \|
	3306	(vm_offset_t)(bp->b_loffset & PAGE_MASK));
	3307	}
	3308	}
	3309
	3310	/* adjust space use on already-dirty buffer */
	3311	if (bp->b_flags & B_DELWRI) {
	3312	dirtybufspace += newbsize - bp->b_bufsize;
	3313	if (bp->b_flags & B_HEAVY)
	3314	dirtybufspacehw += newbsize - bp->b_bufsize;
	3315	}
	3316	if (newbsize < bp->b_bufsize)
	3317	bufspacewakeup();
	3318	bp->b_bufsize = newbsize; /* actual buffer allocation */
	3319	bp->b_bcount = size; /* requested buffer size */
	3320	return 1;
	3321	}
	3322
	3323	/*
	3324	* biowait:
	3325	*
	3326	* Wait for buffer I/O completion, returning error status. B_EINTR
	3327	* is converted into an EINTR error but not cleared (since a chain
	3328	* of biowait() calls may occur).
	3329	*
	3330	* On return bpdone() will have been called but the buffer will remain
	3331	* locked and will not have been brelse()'d.
	3332	*
	3333	* NOTE! If a timeout is specified and ETIMEDOUT occurs the I/O is
	3334	* likely still in progress on return.
	3335	*
	3336	* NOTE! This operation is on a BIO, not a BUF.
	3337	*
	3338	* NOTE! BIO_DONE is cleared by vn_strategy()
	3339	*
	3340	* MPSAFE
	3341	*/
	3342	static __inline int
	3343	_biowait(struct bio bio, const char wmesg, int to)
	3344	{
	3345	struct buf *bp = bio->bio_buf;
	3346	u_int32_t flags;
	3347	u_int32_t nflags;
	3348	int error;
	3349
	3350	KKASSERT(bio == &bp->b_bio1);
	3351	for (;;) {
	3352	flags = bio->bio_flags;
	3353	if (flags & BIO_DONE)
	3354	break;
	3355	tsleep_interlock(bio, 0);
	3356	nflags = flags \| BIO_WANT;
	3357	tsleep_interlock(bio, 0);
	3358	if (atomic_cmpset_int(&bio->bio_flags, flags, nflags)) {
	3359	if (wmesg)
	3360	error = tsleep(bio, PINTERLOCKED, wmesg, to);
	3361	else if (bp->b_cmd == BUF_CMD_READ)
	3362	error = tsleep(bio, PINTERLOCKED, "biord", to);
	3363	else
	3364	error = tsleep(bio, PINTERLOCKED, "biowr", to);
	3365	if (error) {
	3366	kprintf("tsleep error biowait %d\n", error);
	3367	return (error);
	3368	}
	3369	}
	3370	}
	3371
	3372	/*
	3373	* Finish up.
	3374	*/
	3375	KKASSERT(bp->b_cmd == BUF_CMD_DONE);
	3376	bio->bio_flags &= ~(BIO_DONE \| BIO_SYNC);
	3377	if (bp->b_flags & B_EINTR)
	3378	return (EINTR);
	3379	if (bp->b_flags & B_ERROR)
	3380	return (bp->b_error ? bp->b_error : EIO);
	3381	return (0);
	3382	}
	3383
	3384	int
	3385	biowait(struct bio bio, const char wmesg)
	3386	{
	3387	return(_biowait(bio, wmesg, 0));
	3388	}
	3389
	3390	int
	3391	biowait_timeout(struct bio bio, const char wmesg, int to)
	3392	{
	3393	return(_biowait(bio, wmesg, to));
	3394	}
	3395
	3396	/*
	3397	* This associates a tracking count with an I/O. vn_strategy() and
	3398	* dev_dstrategy() do this automatically but there are a few cases
	3399	* where a vnode or device layer is bypassed when a block translation
	3400	* is cached. In such cases bio_start_transaction() may be called on
	3401	* the bypassed layers so the system gets an I/O in progress indication
	3402	* for those higher layers.
	3403	*/
	3404	void
	3405	bio_start_transaction(struct bio bio, struct bio_track track)
	3406	{
	3407	bio->bio_track = track;
	3408	if (dsched_is_clear_buf_priv(bio->bio_buf))
	3409	dsched_new_buf(bio->bio_buf);
	3410	bio_track_ref(track);
	3411	}
	3412
	3413	/*
	3414	* Initiate I/O on a vnode.
	3415	*
	3416	* SWAPCACHE OPERATION:
	3417	*
	3418	* Real buffer cache buffers have a non-NULL bp->b_vp. Unfortunately
	3419	* devfs also uses b_vp for fake buffers so we also have to check
	3420	* that B_PAGING is 0. In this case the passed 'vp' is probably the
	3421	* underlying block device. The swap assignments are related to the
	3422	* buffer cache buffer's b_vp, not the passed vp.
	3423	*
	3424	* The passed vp == bp->b_vp only in the case where the strategy call
	3425	* is made on the vp itself for its own buffers (a regular file or
	3426	* block device vp). The filesystem usually then re-calls vn_strategy()
	3427	* after translating the request to an underlying device.
	3428	*
	3429	* Cluster buffers set B_CLUSTER and the passed vp is the vp of the
	3430	* underlying buffer cache buffers.
	3431	*
	3432	* We can only deal with page-aligned buffers at the moment, because
	3433	* we can't tell what the real dirty state for pages straddling a buffer
	3434	* are.
	3435	*
	3436	* In order to call swap_pager_strategy() we must provide the VM object
	3437	* and base offset for the underlying buffer cache pages so it can find
	3438	* the swap blocks.
	3439	*/
	3440	void
	3441	vn_strategy(struct vnode vp, struct bio bio)
	3442	{
	3443	struct bio_track *track;
	3444	struct buf *bp = bio->bio_buf;
	3445
	3446	KKASSERT(bp->b_cmd != BUF_CMD_DONE);
	3447
	3448	/*
	3449	* Handle the swap cache intercept.
	3450	*/
	3451	if (vn_cache_strategy(vp, bio))
	3452	return;
	3453
	3454	/*
	3455	* Otherwise do the operation through the filesystem
	3456	*/
	3457	if (bp->b_cmd == BUF_CMD_READ)
	3458	track = &vp->v_track_read;
	3459	else
	3460	track = &vp->v_track_write;
	3461	KKASSERT((bio->bio_flags & BIO_DONE) == 0);
	3462	bio->bio_track = track;
	3463	if (dsched_is_clear_buf_priv(bio->bio_buf))
	3464	dsched_new_buf(bio->bio_buf);
	3465	bio_track_ref(track);
	3466	vop_strategy(*vp->v_ops, vp, bio);
	3467	}
	3468
	3469	int
	3470	vn_cache_strategy(struct vnode vp, struct bio bio)
	3471	{
	3472	struct buf *bp = bio->bio_buf;
	3473	struct bio *nbio;
	3474	vm_object_t object;
	3475	vm_page_t m;
	3476	int i;
	3477
	3478	/*
	3479	* Is this buffer cache buffer suitable for reading from
	3480	* the swap cache?
	3481	*/
	3482	if (vm_swapcache_read_enable == 0 \|\|
	3483	bp->b_cmd != BUF_CMD_READ \|\|
	3484	((bp->b_flags & B_CLUSTER) == 0 &&
	3485	(bp->b_vp == NULL \|\| (bp->b_flags & B_PAGING))) \|\|
	3486	((int)bp->b_loffset & PAGE_MASK) != 0 \|\|
	3487	(bp->b_bcount & PAGE_MASK) != 0) {
	3488	return(0);
	3489	}
	3490
	3491	/*
	3492	* Figure out the original VM object (it will match the underlying
	3493	* VM pages). Note that swap cached data uses page indices relative
	3494	* to that object, not relative to bio->bio_offset.
	3495	*/
	3496	if (bp->b_flags & B_CLUSTER)
	3497	object = vp->v_object;
	3498	else
	3499	object = bp->b_vp->v_object;
	3500
	3501	/*
	3502	* In order to be able to use the swap cache all underlying VM
	3503	* pages must be marked as such, and we can't have any bogus pages.
	3504	*/
	3505	for (i = 0; i < bp->b_xio.xio_npages; ++i) {
	3506	m = bp->b_xio.xio_pages[i];
	3507	if ((m->flags & PG_SWAPPED) == 0)
	3508	break;
	3509	if (m == bogus_page)
	3510	break;
	3511	}
	3512
	3513	/*
	3514	* If we are good then issue the I/O using swap_pager_strategy()
	3515	*/
	3516	if (i == bp->b_xio.xio_npages) {
	3517	m = bp->b_xio.xio_pages[0];
	3518	nbio = push_bio(bio);
	3519	nbio->bio_offset = ptoa(m->pindex);
	3520	KKASSERT(m->object == object);
	3521	swap_pager_strategy(object, nbio);
	3522	return(1);
	3523	}
	3524	return(0);
	3525	}
	3526
	3527	/*
	3528	* bpdone:
	3529	*
	3530	* Finish I/O on a buffer after all BIOs have been processed.
	3531	* Called when the bio chain is exhausted or by biowait. If called
	3532	* by biowait, elseit is typically 0.
	3533	*
	3534	* bpdone is also responsible for setting B_CACHE in a B_VMIO bp.
	3535	* In a non-VMIO bp, B_CACHE will be set on the next getblk()
	3536	* assuming B_INVAL is clear.
	3537	*
	3538	* For the VMIO case, we set B_CACHE if the op was a read and no
	3539	* read error occurred, or if the op was a write. B_CACHE is never
	3540	* set if the buffer is invalid or otherwise uncacheable.
	3541	*
	3542	* bpdone does not mess with B_INVAL, allowing the I/O routine or the
	3543	* initiator to leave B_INVAL set to brelse the buffer out of existence
	3544	* in the biodone routine.
	3545	*/
	3546	void
	3547	bpdone(struct buf *bp, int elseit)
	3548	{
	3549	buf_cmd_t cmd;
	3550
	3551	KASSERT(BUF_REFCNTNB(bp) > 0,
	3552	("biodone: bp %p not busy %d", bp, BUF_REFCNTNB(bp)));
	3553	KASSERT(bp->b_cmd != BUF_CMD_DONE,
	3554	("biodone: bp %p already done!", bp));
	3555
	3556	/*
	3557	* No more BIOs are left. All completion functions have been dealt
	3558	* with, now we clean up the buffer.
	3559	*/
	3560	cmd = bp->b_cmd;
	3561	bp->b_cmd = BUF_CMD_DONE;
	3562
	3563	/*
	3564	* Only reads and writes are processed past this point.
	3565	*/
	3566	if (cmd != BUF_CMD_READ && cmd != BUF_CMD_WRITE) {
	3567	if (cmd == BUF_CMD_FREEBLKS)
	3568	bp->b_flags \|= B_NOCACHE;
	3569	if (elseit)
	3570	brelse(bp);
	3571	return;
	3572	}
	3573
	3574	/*
	3575	* Warning: softupdates may re-dirty the buffer, and HAMMER can do
	3576	* a lot worse. XXX - move this above the clearing of b_cmd
	3577	*/
	3578	if (LIST_FIRST(&bp->b_dep) != NULL)
	3579	buf_complete(bp);
	3580
	3581	/*
	3582	* A failed write must re-dirty the buffer unless B_INVAL
	3583	* was set. Only applicable to normal buffers (with VPs).
	3584	* vinum buffers may not have a vp.
	3585	*/
	3586	if (cmd == BUF_CMD_WRITE &&
	3587	(bp->b_flags & (B_ERROR \| B_INVAL)) == B_ERROR) {
	3588	bp->b_flags &= ~B_NOCACHE;
	3589	if (bp->b_vp)
	3590	bdirty(bp);
	3591	}
	3592
	3593	if (bp->b_flags & B_VMIO) {
	3594	int i;
	3595	vm_ooffset_t foff;
	3596	vm_page_t m;
	3597	vm_object_t obj;
	3598	int iosize;
	3599	struct vnode *vp = bp->b_vp;
	3600
	3601	obj = vp->v_object;
	3602
	3603	#if defined(VFS_BIO_DEBUG)
	3604	if (vp->v_auxrefs == 0)
	3605	panic("biodone: zero vnode hold count");
	3606	if ((vp->v_flag & VOBJBUF) == 0)
	3607	panic("biodone: vnode is not setup for merged cache");
	3608	#endif
	3609
	3610	foff = bp->b_loffset;
	3611	KASSERT(foff != NOOFFSET, ("biodone: no buffer offset"));
	3612	KASSERT(obj != NULL, ("biodone: missing VM object"));
	3613
	3614	#if defined(VFS_BIO_DEBUG)
	3615	if (obj->paging_in_progress < bp->b_xio.xio_npages) {
	3616	kprintf("biodone: paging in progress(%d) < bp->b_xio.xio_npages(%d)\n",
	3617	obj->paging_in_progress, bp->b_xio.xio_npages);
	3618	}
	3619	#endif
	3620
	3621	/*
	3622	* Set B_CACHE if the op was a normal read and no error
	3623	* occured. B_CACHE is set for writes in the b*write()
	3624	* routines.
	3625	*/
	3626	iosize = bp->b_bcount - bp->b_resid;
	3627	if (cmd == BUF_CMD_READ &&
	3628	(bp->b_flags & (B_INVAL\|B_NOCACHE\|B_ERROR)) == 0) {
	3629	bp->b_flags \|= B_CACHE;
	3630	}
	3631
	3632	crit_enter();
	3633	get_mplock();
	3634	for (i = 0; i < bp->b_xio.xio_npages; i++) {
	3635	int bogusflag = 0;
	3636	int resid;
	3637
	3638	resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
	3639	if (resid > iosize)
	3640	resid = iosize;
	3641
	3642	/*
	3643	* cleanup bogus pages, restoring the originals. Since
	3644	* the originals should still be wired, we don't have
	3645	* to worry about interrupt/freeing races destroying
	3646	* the VM object association.
	3647	*/
	3648	m = bp->b_xio.xio_pages[i];
	3649	if (m == bogus_page) {
	3650	bogusflag = 1;
	3651	m = vm_page_lookup(obj, OFF_TO_IDX(foff));
	3652	if (m == NULL)
	3653	panic("biodone: page disappeared");
	3654	bp->b_xio.xio_pages[i] = m;
	3655	pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
	3656	bp->b_xio.xio_pages, bp->b_xio.xio_npages);
	3657	}
	3658	#if defined(VFS_BIO_DEBUG)
	3659	if (OFF_TO_IDX(foff) != m->pindex) {
	3660	kprintf("biodone: foff(%lu)/m->pindex(%ld) "
	3661	"mismatch\n",
	3662	(unsigned long)foff, (long)m->pindex);
	3663	}
	3664	#endif
	3665
	3666	/*
	3667	* In the write case, the valid and clean bits are
	3668	* already changed correctly (see bdwrite()), so we
	3669	* only need to do this here in the read case.
	3670	*/
	3671	if (cmd == BUF_CMD_READ && !bogusflag && resid > 0) {
	3672	vfs_clean_one_page(bp, i, m);
	3673	}
	3674	vm_page_flag_clear(m, PG_ZERO);
	3675
	3676	/*
	3677	* when debugging new filesystems or buffer I/O
	3678	* methods, this is the most common error that pops
	3679	* up. if you see this, you have not set the page
	3680	* busy flag correctly!!!
	3681	*/
	3682	if (m->busy == 0) {
	3683	kprintf("biodone: page busy < 0, "
	3684	"pindex: %d, foff: 0x(%x,%x), "
	3685	"resid: %d, index: %d\n",
	3686	(int) m->pindex, (int)(foff >> 32),
	3687	(int) foff & 0xffffffff, resid, i);
	3688	if (!vn_isdisk(vp, NULL))
	3689	kprintf(" iosize: %ld, loffset: %lld, "
	3690	"flags: 0x%08x, npages: %d\n",
	3691	bp->b_vp->v_mount->mnt_stat.f_iosize,
	3692	(long long)bp->b_loffset,
	3693	bp->b_flags, bp->b_xio.xio_npages);
	3694	else
	3695	kprintf(" VDEV, loffset: %lld, flags: 0x%08x, npages: %d\n",
	3696	(long long)bp->b_loffset,
	3697	bp->b_flags, bp->b_xio.xio_npages);
	3698	kprintf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
	3699	m->valid, m->dirty, m->wire_count);
	3700	panic("biodone: page busy < 0");
	3701	}
	3702	vm_page_io_finish(m);
	3703	vm_object_pip_subtract(obj, 1);
	3704	foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
	3705	iosize -= resid;
	3706	}
	3707	if (obj)
	3708	vm_object_pip_wakeupn(obj, 0);
	3709	rel_mplock();
	3710	crit_exit();
	3711	}
	3712
	3713	/*
	3714	* Finish up by releasing the buffer. There are no more synchronous
	3715	* or asynchronous completions, those were handled by bio_done
	3716	* callbacks.
	3717	*/
	3718	if (elseit) {
	3719	if (bp->b_flags & (B_NOCACHE\|B_INVAL\|B_ERROR\|B_RELBUF))
	3720	brelse(bp);
	3721	else
	3722	bqrelse(bp);
	3723	}
	3724	}
	3725
	3726	/*
	3727	* Normal biodone.
	3728	*/
	3729	void
	3730	biodone(struct bio *bio)
	3731	{
	3732	struct buf *bp = bio->bio_buf;
	3733
	3734	runningbufwakeup(bp);
	3735
	3736	/*
	3737	* Run up the chain of BIO's. Leave b_cmd intact for the duration.
	3738	*/
	3739	while (bio) {
	3740	biodone_t *done_func;
	3741	struct bio_track *track;
	3742
	3743	/*
	3744	* BIO tracking. Most but not all BIOs are tracked.
	3745	*/
	3746	if ((track = bio->bio_track) != NULL) {
	3747	bio_track_rel(track);
	3748	bio->bio_track = NULL;
	3749	}
	3750
	3751	/*
	3752	* A bio_done function terminates the loop. The function
	3753	* will be responsible for any further chaining and/or
	3754	* buffer management.
	3755	*
	3756	* WARNING! The done function can deallocate the buffer!
	3757	*/
	3758	if ((done_func = bio->bio_done) != NULL) {
	3759	bio->bio_done = NULL;
	3760	done_func(bio);
	3761	return;
	3762	}
	3763	bio = bio->bio_prev;
	3764	}
	3765
	3766	/*
	3767	* If we've run out of bio's do normal [a]synchronous completion.
	3768	*/
	3769	bpdone(bp, 1);
	3770	}
	3771
	3772	/*
	3773	* Synchronous biodone - this terminates a synchronous BIO.
	3774	*
	3775	* bpdone() is called with elseit=FALSE, leaving the buffer completed
	3776	* but still locked. The caller must brelse() the buffer after waiting
	3777	* for completion.
	3778	*/
	3779	void
	3780	biodone_sync(struct bio *bio)
	3781	{
	3782	struct buf *bp = bio->bio_buf;
	3783	int flags;
	3784	int nflags;
	3785
	3786	KKASSERT(bio == &bp->b_bio1);
	3787	bpdone(bp, 0);
	3788
	3789	for (;;) {
	3790	flags = bio->bio_flags;
	3791	nflags = (flags \| BIO_DONE) & ~BIO_WANT;
	3792
	3793	if (atomic_cmpset_int(&bio->bio_flags, flags, nflags)) {
	3794	if (flags & BIO_WANT)
	3795	wakeup(bio);
	3796	break;
	3797	}
	3798	}
	3799	}
	3800
	3801	/*
	3802	* vfs_unbusy_pages:
	3803	*
	3804	* This routine is called in lieu of iodone in the case of
	3805	* incomplete I/O. This keeps the busy status for pages
	3806	* consistent.
	3807	*/
	3808	void
	3809	vfs_unbusy_pages(struct buf *bp)
	3810	{
	3811	int i;
	3812
	3813	runningbufwakeup(bp);
	3814	if (bp->b_flags & B_VMIO) {
	3815	struct vnode *vp = bp->b_vp;
	3816	vm_object_t obj;
	3817
	3818	obj = vp->v_object;
	3819
	3820	for (i = 0; i < bp->b_xio.xio_npages; i++) {
	3821	vm_page_t m = bp->b_xio.xio_pages[i];
	3822
	3823	/*
	3824	* When restoring bogus changes the original pages
	3825	* should still be wired, so we are in no danger of
	3826	* losing the object association and do not need
	3827	* critical section protection particularly.
	3828	*/
	3829	if (m == bogus_page) {
	3830	m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_loffset) + i);
	3831	if (!m) {
	3832	panic("vfs_unbusy_pages: page missing");
	3833	}
	3834	bp->b_xio.xio_pages[i] = m;
	3835	pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
	3836	bp->b_xio.xio_pages, bp->b_xio.xio_npages);
	3837	}
	3838	vm_object_pip_subtract(obj, 1);
	3839	vm_page_flag_clear(m, PG_ZERO);
	3840	vm_page_io_finish(m);
	3841	}
	3842	vm_object_pip_wakeupn(obj, 0);
	3843	}
	3844	}
	3845
	3846	/*
	3847	* vfs_busy_pages:
	3848	*
	3849	* This routine is called before a device strategy routine.
	3850	* It is used to tell the VM system that paging I/O is in
	3851	* progress, and treat the pages associated with the buffer
	3852	* almost as being PG_BUSY. Also the object 'paging_in_progress'
	3853	* flag is handled to make sure that the object doesn't become
	3854	* inconsistent.
	3855	*
	3856	* Since I/O has not been initiated yet, certain buffer flags
	3857	* such as B_ERROR or B_INVAL may be in an inconsistent state
	3858	* and should be ignored.
	3859	*/
	3860	void
	3861	vfs_busy_pages(struct vnode vp, struct buf bp)
	3862	{
	3863	int i, bogus;
	3864	struct lwp *lp = curthread->td_lwp;
	3865
	3866	/*
	3867	* The buffer's I/O command must already be set. If reading,
	3868	* B_CACHE must be 0 (double check against callers only doing
	3869	* I/O when B_CACHE is 0).
	3870	*/
	3871	KKASSERT(bp->b_cmd != BUF_CMD_DONE);
	3872	KKASSERT(bp->b_cmd == BUF_CMD_WRITE \|\| (bp->b_flags & B_CACHE) == 0);
	3873
	3874	if (bp->b_flags & B_VMIO) {
	3875	vm_object_t obj;
	3876
	3877	obj = vp->v_object;
	3878	KASSERT(bp->b_loffset != NOOFFSET,
	3879	("vfs_busy_pages: no buffer offset"));
	3880
	3881	/*
	3882	* Loop until none of the pages are busy.
	3883	*/
	3884	retry:
	3885	for (i = 0; i < bp->b_xio.xio_npages; i++) {
	3886	vm_page_t m = bp->b_xio.xio_pages[i];
	3887
	3888	if (vm_page_sleep_busy(m, FALSE, "vbpage"))
	3889	goto retry;
	3890	}
	3891
	3892	/*
	3893	* Setup for I/O, soft-busy the page right now because
	3894	* the next loop may block.
	3895	*/
	3896	for (i = 0; i < bp->b_xio.xio_npages; i++) {
	3897	vm_page_t m = bp->b_xio.xio_pages[i];
	3898
	3899	vm_page_flag_clear(m, PG_ZERO);
	3900	if ((bp->b_flags & B_CLUSTER) == 0) {
	3901	vm_object_pip_add(obj, 1);
	3902	vm_page_io_start(m);
	3903	}
	3904	}
	3905
	3906	/*
	3907	* Adjust protections for I/O and do bogus-page mapping.
	3908	* Assume that vm_page_protect() can block (it can block
	3909	* if VM_PROT_NONE, don't take any chances regardless).
	3910	*
	3911	* In particular note that for writes we must incorporate
	3912	* page dirtyness from the VM system into the buffer's
	3913	* dirty range.
	3914	*
	3915	* For reads we theoretically must incorporate page dirtyness
	3916	* from the VM system to determine if the page needs bogus
	3917	* replacement, but we shortcut the test by simply checking
	3918	* that all m->valid bits are set, indicating that the page
	3919	* is fully valid and does not need to be re-read. For any
	3920	* VM system dirtyness the page will also be fully valid
	3921	* since it was mapped at one point.
	3922	*/
	3923	bogus = 0;
	3924	for (i = 0; i < bp->b_xio.xio_npages; i++) {
	3925	vm_page_t m = bp->b_xio.xio_pages[i];
	3926
	3927	vm_page_flag_clear(m, PG_ZERO); /* XXX */
	3928	if (bp->b_cmd == BUF_CMD_WRITE) {
	3929	/*
	3930	* When readying a vnode-backed buffer for
	3931	* a write we must zero-fill any invalid
	3932	* portions of the backing VM pages, mark
	3933	* it valid and clear related dirty bits.
	3934	*
	3935	* vfs_clean_one_page() incorporates any
	3936	* VM dirtyness and updates the b_dirtyoff
	3937	* range (after we've made the page RO).
	3938	*
	3939	* It is also expected that the pmap modified
	3940	* bit has already been cleared by the
	3941	* vm_page_protect(). We may not be able
	3942	* to clear all dirty bits for a page if it
	3943	* was also memory mapped (NFS).
	3944	*
	3945	* Finally be sure to unassign any swap-cache
	3946	* backing store as it is now stale.
	3947	*/
	3948	vm_page_protect(m, VM_PROT_READ);
	3949	vfs_clean_one_page(bp, i, m);
	3950	swap_pager_unswapped(m);
	3951	} else if (m->valid == VM_PAGE_BITS_ALL) {
	3952	/*
	3953	* When readying a vnode-backed buffer for
	3954	* read we must replace any dirty pages with
	3955	* a bogus page so dirty data is not destroyed
	3956	* when filling gaps.
	3957	*
	3958	* To avoid testing whether the page is
	3959	* dirty we instead test that the page was
	3960	* at some point mapped (m->valid fully
	3961	* valid) with the understanding that
	3962	* this also covers the dirty case.
	3963	*/
	3964	bp->b_xio.xio_pages[i] = bogus_page;
	3965	bogus++;
	3966	} else if (m->valid & m->dirty) {
	3967	/*
	3968	* This case should not occur as partial
	3969	* dirtyment can only happen if the buffer
	3970	* is B_CACHE, and this code is not entered
	3971	* if the buffer is B_CACHE.
	3972	*/
	3973	kprintf("Warning: vfs_busy_pages - page not "
	3974	"fully valid! loff=%jx bpf=%08x "
	3975	"idx=%d val=%02x dir=%02x\n",
	3976	(intmax_t)bp->b_loffset, bp->b_flags,
	3977	i, m->valid, m->dirty);
	3978	vm_page_protect(m, VM_PROT_NONE);
	3979	} else {
	3980	/*
	3981	* The page is not valid and can be made
	3982	* part of the read.
	3983	*/
	3984	vm_page_protect(m, VM_PROT_NONE);
	3985	}
	3986	}
	3987	if (bogus) {
	3988	pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
	3989	bp->b_xio.xio_pages, bp->b_xio.xio_npages);
	3990	}
	3991	}
	3992
	3993	/*
	3994	* This is the easiest place to put the process accounting for the I/O
	3995	* for now.
	3996	*/
	3997	if (lp != NULL) {
	3998	if (bp->b_cmd == BUF_CMD_READ)
	3999	lp->lwp_ru.ru_inblock++;
	4000	else
	4001	lp->lwp_ru.ru_oublock++;
	4002	}
	4003	}
	4004
	4005	/*
	4006	* vfs_clean_pages:
	4007	*
	4008	* Tell the VM system that the pages associated with this buffer
	4009	* are clean. This is used for delayed writes where the data is
	4010	* going to go to disk eventually without additional VM intervention.
	4011	*
	4012	* Note that while we only really need to clean through to b_bcount, we
	4013	* just go ahead and clean through to b_bufsize.
	4014	*/
	4015	static void
	4016	vfs_clean_pages(struct buf *bp)
	4017	{
	4018	vm_page_t m;
	4019	int i;
	4020
	4021	if ((bp->b_flags & B_VMIO) == 0)
	4022	return;
	4023
	4024	KASSERT(bp->b_loffset != NOOFFSET,
	4025	("vfs_clean_pages: no buffer offset"));
	4026
	4027	for (i = 0; i < bp->b_xio.xio_npages; i++) {
	4028	m = bp->b_xio.xio_pages[i];
	4029	vfs_clean_one_page(bp, i, m);
	4030	}
	4031	}
	4032
	4033	/*
	4034	* vfs_clean_one_page:
	4035	*
	4036	* Set the valid bits and clear the dirty bits in a page within a
	4037	* buffer. The range is restricted to the buffer's size and the
	4038	* buffer's logical offset might index into the first page.
	4039	*
	4040	* The caller has busied or soft-busied the page and it is not mapped,
	4041	* test and incorporate the dirty bits into b_dirtyoff/end before
	4042	* clearing them. Note that we need to clear the pmap modified bits
	4043	* after determining that the page was dirty, vm_page_set_validclean()
	4044	* does not do it for us.
	4045	*
	4046	* This routine is typically called after a read completes (dirty should
	4047	* be zero in that case as we are not called on bogus-replace pages),
	4048	* or before a write is initiated.
	4049	*/
	4050	static void
	4051	vfs_clean_one_page(struct buf *bp, int pageno, vm_page_t m)
	4052	{
	4053	int bcount;
	4054	int xoff;
	4055	int soff;
	4056	int eoff;
	4057
	4058	/*
	4059	* Calculate offset range within the page but relative to buffer's
	4060	* loffset. loffset might be offset into the first page.
	4061	*/
	4062	xoff = (int)bp->b_loffset & PAGE_MASK; /* loffset offset into pg 0 */
	4063	bcount = bp->b_bcount + xoff; /* offset adjusted */
	4064
	4065	if (pageno == 0) {
	4066	soff = xoff;
	4067	eoff = PAGE_SIZE;
	4068	} else {
	4069	soff = (pageno << PAGE_SHIFT);
	4070	eoff = soff + PAGE_SIZE;
	4071	}
	4072	if (eoff > bcount)
	4073	eoff = bcount;
	4074	if (soff >= eoff)
	4075	return;
	4076
	4077	/*
	4078	* Test dirty bits and adjust b_dirtyoff/end.
	4079	*
	4080	* If dirty pages are incorporated into the bp any prior
	4081	* B_NEEDCOMMIT state (NFS) must be cleared because the
	4082	* caller has not taken into account the new dirty data.
	4083	*
	4084	* If the page was memory mapped the dirty bits might go beyond the
	4085	* end of the buffer, but we can't really make the assumption that
	4086	* a file EOF straddles the buffer (even though this is the case for
	4087	* NFS if B_NEEDCOMMIT is also set). So for the purposes of clearing
	4088	* B_NEEDCOMMIT we only test the dirty bits covered by the buffer.
	4089	* This also saves some console spam.
	4090	*
	4091	* When clearing B_NEEDCOMMIT we must also clear B_CLUSTEROK,
	4092	* NFS can handle huge commits but not huge writes.
	4093	*/
	4094	vm_page_test_dirty(m);
	4095	if (m->dirty) {
	4096	if ((bp->b_flags & B_NEEDCOMMIT) &&
	4097	(m->dirty & vm_page_bits(soff & PAGE_MASK, eoff - soff))) {
	4098	if (debug_commit)
	4099	kprintf("Warning: vfs_clean_one_page: bp %p "
	4100	"loff=%jx,%d flgs=%08x clr B_NEEDCOMMIT"
	4101	" cmd %d vd %02x/%02x x/s/e %d %d %d "
	4102	"doff/end %d %d\n",
	4103	bp, (intmax_t)bp->b_loffset, bp->b_bcount,
	4104	bp->b_flags, bp->b_cmd,
	4105	m->valid, m->dirty, xoff, soff, eoff,
	4106	bp->b_dirtyoff, bp->b_dirtyend);
	4107	bp->b_flags &= ~(B_NEEDCOMMIT \| B_CLUSTEROK);
	4108	if (debug_commit)
	4109	print_backtrace(-1);
	4110	}
	4111	/*
	4112	* Only clear the pmap modified bits if ALL the dirty bits
	4113	* are set, otherwise the system might mis-clear portions
	4114	* of a page.
	4115	*/
	4116	if (m->dirty == VM_PAGE_BITS_ALL &&
	4117	(bp->b_flags & B_NEEDCOMMIT) == 0) {
	4118	pmap_clear_modify(m);
	4119	}
	4120	if (bp->b_dirtyoff > soff - xoff)
	4121	bp->b_dirtyoff = soff - xoff;
	4122	if (bp->b_dirtyend < eoff - xoff)
	4123	bp->b_dirtyend = eoff - xoff;
	4124	}
	4125
	4126	/*
	4127	* Set related valid bits, clear related dirty bits.
	4128	* Does not mess with the pmap modified bit.
	4129	*
	4130	* WARNING! We cannot just clear all of m->dirty here as the
	4131	* buffer cache buffers may use a DEV_BSIZE'd aligned
	4132	* block size, or have an odd size (e.g. NFS at file EOF).
	4133	* The putpages code can clear m->dirty to 0.
	4134	*
	4135	* If a VOP_WRITE generates a buffer cache buffer which
	4136	* covers the same space as mapped writable pages the
	4137	* buffer flush might not be able to clear all the dirty
	4138	* bits and still require a putpages from the VM system
	4139	* to finish it off.
	4140	*/
	4141	vm_page_set_validclean(m, soff & PAGE_MASK, eoff - soff);
	4142	}
	4143
	4144	/*
	4145	* Similar to vfs_clean_one_page() but sets the bits to valid and dirty.
	4146	* The page data is assumed to be valid (there is no zeroing here).
	4147	*/
	4148	static void
	4149	vfs_dirty_one_page(struct buf *bp, int pageno, vm_page_t m)
	4150	{
	4151	int bcount;
	4152	int xoff;
	4153	int soff;
	4154	int eoff;
	4155
	4156	/*
	4157	* Calculate offset range within the page but relative to buffer's
	4158	* loffset. loffset might be offset into the first page.
	4159	*/
	4160	xoff = (int)bp->b_loffset & PAGE_MASK; /* loffset offset into pg 0 */
	4161	bcount = bp->b_bcount + xoff; /* offset adjusted */
	4162
	4163	if (pageno == 0) {
	4164	soff = xoff;
	4165	eoff = PAGE_SIZE;
	4166	} else {
	4167	soff = (pageno << PAGE_SHIFT);
	4168	eoff = soff + PAGE_SIZE;
	4169	}
	4170	if (eoff > bcount)
	4171	eoff = bcount;
	4172	if (soff >= eoff)
	4173	return;
	4174	vm_page_set_validdirty(m, soff & PAGE_MASK, eoff - soff);
	4175	}
	4176
	4177	/*
	4178	* vfs_bio_clrbuf:
	4179	*
	4180	* Clear a buffer. This routine essentially fakes an I/O, so we need
	4181	* to clear B_ERROR and B_INVAL.
	4182	*
	4183	* Note that while we only theoretically need to clear through b_bcount,
	4184	* we go ahead and clear through b_bufsize.
	4185	*/
	4186
	4187	void
	4188	vfs_bio_clrbuf(struct buf *bp)
	4189	{
	4190	int i, mask = 0;
	4191	caddr_t sa, ea;
	4192	if ((bp->b_flags & (B_VMIO \| B_MALLOC)) == B_VMIO) {
	4193	bp->b_flags &= ~(B_INVAL \| B_EINTR \| B_ERROR);
	4194	if ((bp->b_xio.xio_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
	4195	(bp->b_loffset & PAGE_MASK) == 0) {
	4196	mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
	4197	if ((bp->b_xio.xio_pages[0]->valid & mask) == mask) {
	4198	bp->b_resid = 0;
	4199	return;
	4200	}
	4201	if (((bp->b_xio.xio_pages[0]->flags & PG_ZERO) == 0) &&
	4202	((bp->b_xio.xio_pages[0]->valid & mask) == 0)) {
	4203	bzero(bp->b_data, bp->b_bufsize);
	4204	bp->b_xio.xio_pages[0]->valid \|= mask;
	4205	bp->b_resid = 0;
	4206	return;
	4207	}
	4208	}
	4209	sa = bp->b_data;
	4210	for(i=0;i<bp->b_xio.xio_npages;i++,sa=ea) {
	4211	int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;
	4212	ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
	4213	ea = (caddr_t)(vm_offset_t)ulmin(
	4214	(u_long)(vm_offset_t)ea,
	4215	(u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
	4216	mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
	4217	if ((bp->b_xio.xio_pages[i]->valid & mask) == mask)
	4218	continue;
	4219	if ((bp->b_xio.xio_pages[i]->valid & mask) == 0) {
	4220	if ((bp->b_xio.xio_pages[i]->flags & PG_ZERO) == 0) {
	4221	bzero(sa, ea - sa);
	4222	}
	4223	} else {
	4224	for (; sa < ea; sa += DEV_BSIZE, j++) {
	4225	if (((bp->b_xio.xio_pages[i]->flags & PG_ZERO) == 0) &&
	4226	(bp->b_xio.xio_pages[i]->valid & (1<<j)) == 0)
	4227	bzero(sa, DEV_BSIZE);
	4228	}
	4229	}
	4230	bp->b_xio.xio_pages[i]->valid \|= mask;
	4231	vm_page_flag_clear(bp->b_xio.xio_pages[i], PG_ZERO);
	4232	}
	4233	bp->b_resid = 0;
	4234	} else {
	4235	clrbuf(bp);
	4236	}
	4237	}
	4238
	4239	/*
	4240	* vm_hold_load_pages:
	4241	*
	4242	* Load pages into the buffer's address space. The pages are
	4243	* allocated from the kernel object in order to reduce interference
	4244	* with the any VM paging I/O activity. The range of loaded
	4245	* pages will be wired.
	4246	*
	4247	* If a page cannot be allocated, the 'pagedaemon' is woken up to
	4248	* retrieve the full range (to - from) of pages.
	4249	*
	4250	*/
	4251	void
	4252	vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
	4253	{
	4254	vm_offset_t pg;
	4255	vm_page_t p;
	4256	int index;
	4257
	4258	to = round_page(to);
	4259	from = round_page(from);
	4260	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
	4261
	4262	pg = from;
	4263	while (pg < to) {
	4264	/*
	4265	* Note: must allocate system pages since blocking here
	4266	* could intefere with paging I/O, no matter which
	4267	* process we are.
	4268	*/
	4269	p = bio_page_alloc(&kernel_object, pg >> PAGE_SHIFT,
	4270	(vm_pindex_t)((to - pg) >> PAGE_SHIFT));
	4271	if (p) {
	4272	vm_page_wire(p);
	4273	p->valid = VM_PAGE_BITS_ALL;
	4274	vm_page_flag_clear(p, PG_ZERO);
	4275	pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
	4276	bp->b_xio.xio_pages[index] = p;
	4277	vm_page_wakeup(p);
	4278
	4279	pg += PAGE_SIZE;
	4280	++index;
	4281	}
	4282	}
	4283	bp->b_xio.xio_npages = index;
	4284	}
	4285
	4286	/*
	4287	* Allocate pages for a buffer cache buffer.
	4288	*
	4289	* Under extremely severe memory conditions even allocating out of the
	4290	* system reserve can fail. If this occurs we must allocate out of the
	4291	* interrupt reserve to avoid a deadlock with the pageout daemon.
	4292	*
	4293	* The pageout daemon can run (putpages -> VOP_WRITE -> getblk -> allocbuf).
	4294	* If the buffer cache's vm_page_alloc() fails a vm_wait() can deadlock
	4295	* against the pageout daemon if pages are not freed from other sources.
	4296	*/
	4297	static
	4298	vm_page_t
	4299	bio_page_alloc(vm_object_t obj, vm_pindex_t pg, int deficit)
	4300	{
	4301	vm_page_t p;
	4302
	4303	/*
	4304	* Try a normal allocation, allow use of system reserve.
	4305	*/
	4306	p = vm_page_alloc(obj, pg, VM_ALLOC_NORMAL \| VM_ALLOC_SYSTEM);
	4307	if (p)
	4308	return(p);
	4309
	4310	/*
	4311	* The normal allocation failed and we clearly have a page
	4312	* deficit. Try to reclaim some clean VM pages directly
	4313	* from the buffer cache.
	4314	*/
	4315	vm_pageout_deficit += deficit;
	4316	recoverbufpages();
	4317
	4318	/*
	4319	* We may have blocked, the caller will know what to do if the
	4320	* page now exists.
	4321	*/
	4322	if (vm_page_lookup(obj, pg))
	4323	return(NULL);
	4324
	4325	/*
	4326	* Allocate and allow use of the interrupt reserve.
	4327	*
	4328	* If after all that we still can't allocate a VM page we are
	4329	* in real trouble, but we slog on anyway hoping that the system
	4330	* won't deadlock.
	4331	*/
	4332	p = vm_page_alloc(obj, pg, VM_ALLOC_NORMAL \| VM_ALLOC_SYSTEM \|
	4333	VM_ALLOC_INTERRUPT);
	4334	if (p) {
	4335	if (vm_page_count_severe()) {
	4336	kprintf("bio_page_alloc: WARNING emergency page "
	4337	"allocation\n");
	4338	vm_wait(hz / 20);
	4339	}
	4340	} else {
	4341	kprintf("bio_page_alloc: WARNING emergency page "
	4342	"allocation failed\n");
	4343	vm_wait(hz * 5);
	4344	}
	4345	return(p);
	4346	}
	4347
	4348	/*
	4349	* vm_hold_free_pages:
	4350	*
	4351	* Return pages associated with the buffer back to the VM system.
	4352	*
	4353	* The range of pages underlying the buffer's address space will
	4354	* be unmapped and un-wired.
	4355	*/
	4356	void
	4357	vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
	4358	{
	4359	vm_offset_t pg;
	4360	vm_page_t p;
	4361	int index, newnpages;
	4362
	4363	from = round_page(from);
	4364	to = round_page(to);
	4365	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
	4366	newnpages = index;
	4367
	4368	lwkt_gettoken(&vm_token);
	4369	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
	4370	p = bp->b_xio.xio_pages[index];
	4371	if (p && (index < bp->b_xio.xio_npages)) {
	4372	if (p->busy) {
	4373	kprintf("vm_hold_free_pages: doffset: %lld, "
	4374	"loffset: %lld\n",
	4375	(long long)bp->b_bio2.bio_offset,
	4376	(long long)bp->b_loffset);
	4377	}
	4378	bp->b_xio.xio_pages[index] = NULL;
	4379	pmap_kremove(pg);
	4380	vm_page_busy(p);
	4381	vm_page_unwire(p, 0);
	4382	vm_page_free(p);
	4383	}
	4384	}
	4385	bp->b_xio.xio_npages = newnpages;
	4386	lwkt_reltoken(&vm_token);
	4387	}
	4388
	4389	/*
	4390	* vmapbuf:
	4391	*
	4392	* Map a user buffer into KVM via a pbuf. On return the buffer's
	4393	* b_data, b_bufsize, and b_bcount will be set, and its XIO page array
	4394	* initialized.
	4395	*/
	4396	int
	4397	vmapbuf(struct buf *bp, caddr_t udata, int bytes)
	4398	{
	4399	caddr_t addr;
	4400	vm_offset_t va;
	4401	vm_page_t m;
	4402	int vmprot;
	4403	int error;
	4404	int pidx;
	4405	int i;
	4406
	4407	/*
	4408	* bp had better have a command and it better be a pbuf.
	4409	*/
	4410	KKASSERT(bp->b_cmd != BUF_CMD_DONE);
	4411	KKASSERT(bp->b_flags & B_PAGING);
	4412
	4413	if (bytes < 0)
	4414	return (-1);
	4415
	4416	/*
	4417	* Map the user data into KVM. Mappings have to be page-aligned.
	4418	*/
	4419	addr = (caddr_t)trunc_page((vm_offset_t)udata);
	4420	pidx = 0;
	4421
	4422	vmprot = VM_PROT_READ;
	4423	if (bp->b_cmd == BUF_CMD_READ)
	4424	vmprot \|= VM_PROT_WRITE;
	4425
	4426	while (addr < udata + bytes) {
	4427	/*
	4428	* Do the vm_fault if needed; do the copy-on-write thing
	4429	* when reading stuff off device into memory.
	4430	*
	4431	* vm_fault_page*() returns a held VM page.
	4432	*/
	4433	va = (addr >= udata) ? (vm_offset_t)addr : (vm_offset_t)udata;
	4434	va = trunc_page(va);
	4435
	4436	m = vm_fault_page_quick(va, vmprot, &error);
	4437	if (m == NULL) {
	4438	for (i = 0; i < pidx; ++i) {
	4439	vm_page_unhold(bp->b_xio.xio_pages[i]);
	4440	bp->b_xio.xio_pages[i] = NULL;
	4441	}
	4442	return(-1);
	4443	}
	4444	bp->b_xio.xio_pages[pidx] = m;
	4445	addr += PAGE_SIZE;
	4446	++pidx;
	4447	}
	4448
	4449	/*
	4450	* Map the page array and set the buffer fields to point to
	4451	* the mapped data buffer.
	4452	*/
	4453	if (pidx > btoc(MAXPHYS))
	4454	panic("vmapbuf: mapped more than MAXPHYS");
	4455	pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_xio.xio_pages, pidx);
	4456
	4457	bp->b_xio.xio_npages = pidx;
	4458	bp->b_data = bp->b_kvabase + ((int)(intptr_t)udata & PAGE_MASK);
	4459	bp->b_bcount = bytes;
	4460	bp->b_bufsize = bytes;
	4461	return(0);
	4462	}
	4463
	4464	/*
	4465	* vunmapbuf:
	4466	*
	4467	* Free the io map PTEs associated with this IO operation.
	4468	* We also invalidate the TLB entries and restore the original b_addr.
	4469	*/
	4470	void
	4471	vunmapbuf(struct buf *bp)
	4472	{
	4473	int pidx;
	4474	int npages;
	4475
	4476	KKASSERT(bp->b_flags & B_PAGING);
	4477
	4478	npages = bp->b_xio.xio_npages;
	4479	pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
	4480	for (pidx = 0; pidx < npages; ++pidx) {
	4481	vm_page_unhold(bp->b_xio.xio_pages[pidx]);
	4482	bp->b_xio.xio_pages[pidx] = NULL;
	4483	}
	4484	bp->b_xio.xio_npages = 0;
	4485	bp->b_data = bp->b_kvabase;
	4486	}
	4487
	4488	/*
	4489	* Scan all buffers in the system and issue the callback.
	4490	*/
	4491	int
	4492	scan_all_buffers(int (callback)(struct buf , void ), void info)
	4493	{
	4494	int count = 0;
	4495	int error;
	4496	int n;
	4497
	4498	for (n = 0; n < nbuf; ++n) {
	4499	if ((error = callback(&buf[n], info)) < 0) {
	4500	count = error;
	4501	break;
	4502	}
	4503	count += error;
	4504	}
	4505	return (count);
	4506	}
	4507
	4508	/*
	4509	* nestiobuf_iodone: biodone callback for nested buffers and propagate
	4510	* completion to the master buffer.
	4511	*/
	4512	static void
	4513	nestiobuf_iodone(struct bio *bio)
	4514	{
	4515	struct bio *mbio;
	4516	struct buf mbp, bp;
	4517	int error;
	4518	int donebytes;
	4519
	4520	bp = bio->bio_buf;
	4521	mbio = bio->bio_caller_info1.ptr;
	4522	mbp = mbio->bio_buf;
	4523
	4524	KKASSERT(bp->b_bcount <= bp->b_bufsize);
	4525	KKASSERT(mbp != bp);
	4526
	4527	error = bp->b_error;
	4528	if (bp->b_error == 0 &&
	4529	(bp->b_bcount < bp->b_bufsize \|\| bp->b_resid > 0)) {
	4530	/*
	4531	* Not all got transfered, raise an error. We have no way to
	4532	* propagate these conditions to mbp.
	4533	*/
	4534	error = EIO;
	4535	}
	4536
	4537	donebytes = bp->b_bufsize;
	4538
	4539	relpbuf(bp, NULL);
	4540	nestiobuf_done(mbio, donebytes, error);
	4541	}
	4542
	4543	void
	4544	nestiobuf_done(struct bio *mbio, int donebytes, int error)
	4545	{
	4546	struct buf *mbp;
	4547
	4548	mbp = mbio->bio_buf;
	4549
	4550	/* If this buf didn't do anything, we are done. */
	4551	if (donebytes == 0)
	4552	return;
	4553
	4554	KKASSERT(mbp->b_resid >= donebytes);
	4555
	4556	/* If an error occured, propagate it to the master buffer */
	4557	if (error)
	4558	mbp->b_error = error;
	4559
	4560	/*
	4561	* Decrement the master buf b_resid according to our donebytes, and
	4562	* also check if this is the last missing bit for the whole nestio
	4563	* mess to complete. If so, call biodone() on the master buf mbp.
	4564	*/
	4565	if (atomic_fetchadd_int(&mbp->b_resid, -donebytes) == donebytes) {
	4566	biodone(mbio);
	4567	}
	4568	}
	4569
	4570	/*
	4571	* nestiobuf_setup: setup a "nested" buffer.
	4572	*
	4573	* => 'mbp' is a "master" buffer which is being divided into sub pieces.
	4574	* => 'bp' should be a buffer allocated by getiobuf.
	4575	* => 'offset' is a byte offset in the master buffer.
	4576	* => 'size' is a size in bytes of this nested buffer.
	4577	*/
	4578	void
	4579	nestiobuf_setup(struct bio bio, struct buf bp, int offset, size_t size)
	4580	{
	4581	struct buf *mbp = bio->bio_buf;
	4582	struct vnode *vp = mbp->b_vp;
	4583
	4584	KKASSERT(mbp->b_bcount >= offset + size);
	4585
	4586	/* kernel needs to own the lock for it to be released in biodone */
	4587	BUF_KERNPROC(bp);
	4588	bp->b_vp = vp;
	4589	bp->b_cmd = mbp->b_cmd;
	4590	bp->b_bio1.bio_done = nestiobuf_iodone;
	4591	bp->b_data = (char *)mbp->b_data + offset;
	4592	bp->b_resid = bp->b_bcount = size;
	4593	bp->b_bufsize = bp->b_bcount;
	4594
	4595	bp->b_bio1.bio_track = NULL;
	4596	bp->b_bio1.bio_caller_info1.ptr = bio;
	4597	}
	4598
	4599	/*
	4600	* print out statistics from the current status of the buffer pool
	4601	* this can be toggeled by the system control option debug.syncprt
	4602	*/
	4603	#ifdef DEBUG
	4604	void
	4605	vfs_bufstats(void)
	4606	{
	4607	int i, j, count;
	4608	struct buf *bp;
	4609	struct bqueues *dp;
	4610	int counts[(MAXBSIZE / PAGE_SIZE) + 1];
	4611	static char *bname[3] = { "LOCKED", "LRU", "AGE" };
	4612
	4613	for (dp = bufqueues, i = 0; dp < &bufqueues[3]; dp++, i++) {
	4614	count = 0;
	4615	for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
	4616	counts[j] = 0;
	4617	crit_enter();
	4618	TAILQ_FOREACH(bp, dp, b_freelist) {
	4619	counts[bp->b_bufsize/PAGE_SIZE]++;
	4620	count++;
	4621	}
	4622	crit_exit();
	4623	kprintf("%s: total-%d", bname[i], count);
	4624	for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
	4625	if (counts[j] != 0)
	4626	kprintf(", %d-%d", j * PAGE_SIZE, counts[j]);
	4627	kprintf("\n");
	4628	}
	4629	}
	4630	#endif
	4631
	4632	#ifdef DDB
	4633
	4634	DB_SHOW_COMMAND(buffer, db_show_buffer)
	4635	{
	4636	/* get args */
	4637	struct buf bp = (struct buf )addr;
	4638
	4639	if (!have_addr) {
	4640	db_printf("usage: show buffer <addr>\n");
	4641	return;
	4642	}
	4643
	4644	db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
	4645	db_printf("b_cmd = %d\n", bp->b_cmd);
	4646	db_printf("b_error = %d, b_bufsize = %d, b_bcount = %d, "
	4647	"b_resid = %d\n, b_data = %p, "
	4648	"bio_offset(disk) = %lld, bio_offset(phys) = %lld\n",
	4649	bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
	4650	bp->b_data,
	4651	(long long)bp->b_bio2.bio_offset,
	4652	(long long)(bp->b_bio2.bio_next ?
	4653	bp->b_bio2.bio_next->bio_offset : (off_t)-1));
	4654	if (bp->b_xio.xio_npages) {
	4655	int i;
	4656	db_printf("b_xio.xio_npages = %d, pages(OBJ, IDX, PA): ",
	4657	bp->b_xio.xio_npages);
	4658	for (i = 0; i < bp->b_xio.xio_npages; i++) {
	4659	vm_page_t m;
	4660	m = bp->b_xio.xio_pages[i];
	4661	db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
	4662	(u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
	4663	if ((i + 1) < bp->b_xio.xio_npages)
	4664	db_printf(",");
	4665	}
	4666	db_printf("\n");
	4667	}
	4668	}
	4669	#endif /* DDB */