Fix buffer cache deadlocks by splitting dirty buffers into two categories:
[dragonfly.git] / sys / kern / vfs_bio.c
1/*
2 * Copyright (c) 1994,1997 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice immediately at the beginning of the file, without modification,
10 * this list of conditions, and the following disclaimer.
11 * 2. Absolutely no warranty of function or purpose is made by the author
12 * John S. Dyson.
13 *
14 * $FreeBSD: src/sys/kern/vfs_bio.c,v 1.242.2.20 2003/05/28 18:38:10 alc Exp $
15 * $DragonFly: src/sys/kern/vfs_bio.c,v 1.96 2008/01/10 07:34:01 dillon Exp $
16 */
17
18/*
19 * this file contains a new buffer I/O scheme implementing a coherent
20 * VM object and buffer cache scheme. Pains have been taken to make
21 * sure that the performance degradation associated with schemes such
22 * as this is not realized.
23 *
24 * Author: John S. Dyson
25 * Significant help during the development and debugging phases
26 * had been provided by David Greenman, also of the FreeBSD core team.
27 *
28 * see man buf(9) for more info.
29 */
30
31#include <sys/param.h>
32#include <sys/systm.h>
33#include <sys/buf.h>
34#include <sys/conf.h>
35#include <sys/eventhandler.h>
36#include <sys/lock.h>
37#include <sys/malloc.h>
38#include <sys/mount.h>
39#include <sys/kernel.h>
40#include <sys/kthread.h>
41#include <sys/proc.h>
42#include <sys/reboot.h>
43#include <sys/resourcevar.h>
44#include <sys/sysctl.h>
45#include <sys/vmmeter.h>
46#include <sys/vnode.h>
47#include <sys/proc.h>
48#include <vm/vm.h>
49#include <vm/vm_param.h>
50#include <vm/vm_kern.h>
51#include <vm/vm_pageout.h>
52#include <vm/vm_page.h>
53#include <vm/vm_object.h>
54#include <vm/vm_extern.h>
55#include <vm/vm_map.h>
56
57#include <sys/buf2.h>
58#include <sys/thread2.h>
59#include <sys/spinlock2.h>
60#include <vm/vm_page2.h>
61
62#include "opt_ddb.h"
63#ifdef DDB
64#include <ddb/ddb.h>
65#endif
66
67/*
68 * Buffer queues.
69 */
70enum bufq_type {
71 BQUEUE_NONE, /* not on any queue */
72 BQUEUE_LOCKED, /* locked buffers */
73 BQUEUE_CLEAN, /* non-B_DELWRI buffers */
74 BQUEUE_DIRTY, /* B_DELWRI buffers */
75 BQUEUE_DIRTY_HW, /* B_DELWRI buffers - heavy weight */
76 BQUEUE_EMPTYKVA, /* empty buffer headers with KVA assignment */
77 BQUEUE_EMPTY, /* empty buffer headers */
78
79 BUFFER_QUEUES /* number of buffer queues */
80};
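
/*
 * Illustrative sketch: the deadlock fix described in the commit message
 * splits delayed-write buffers across two queues.  Queue selection
 * mirrors the logic used in brelse() and bqrelse() further below:
 *
 *      if (bp->b_flags & B_DELWRI) {
 *              qindex = (bp->b_flags & B_HEAVY) ?
 *                       BQUEUE_DIRTY_HW : BQUEUE_DIRTY;
 *      } else {
 *              qindex = BQUEUE_CLEAN;
 *      }
 *
 * B_HEAVY buffers are drained by the separate buf_daemon_hw thread so
 * that flushing them cannot deadlock the normal flushing path.
 */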
81
82typedef enum bufq_type bufq_type_t;
83
84TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES];
85
86static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
87
88struct buf *buf; /* buffer header pool */
89
90static void vm_hold_free_pages(struct buf *bp, vm_offset_t from,
91 vm_offset_t to);
92static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
93 vm_offset_t to);
94static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
95 int pageno, vm_page_t m);
96static void vfs_clean_pages(struct buf *bp);
97static void vfs_setdirty(struct buf *bp);
98static void vfs_vmio_release(struct buf *bp);
99static int flushbufqueues(bufq_type_t q);
100
101static void buf_daemon(void);
102static void buf_daemon_hw(void);
103/*
104 * bogus page -- for I/O to/from partially complete buffers
105 * this is a temporary solution to the problem, but it is not
106 * really that bad. it would be better to split the buffer
107 * for input in the case of buffers partially already in memory,
108 * but the code is intricate enough already.
109 */
110vm_page_t bogus_page;
111int runningbufspace;
112
113/*
114 * These are all static, but make the ones we export globals so we do
115 * not need to use compiler magic.
116 */
117int bufspace, maxbufspace,
118 bufmallocspace, maxbufmallocspace, lobufspace, hibufspace;
119static int bufreusecnt, bufdefragcnt, buffreekvacnt;
120static int lorunningspace, hirunningspace, runningbufreq;
121int numdirtybuffers, numdirtybuffershw, lodirtybuffers, hidirtybuffers;
122static int numfreebuffers, lofreebuffers, hifreebuffers;
123static int getnewbufcalls;
124static int getnewbufrestarts;
125
126static int needsbuffer; /* locked by needsbuffer_spin */
127static int bd_request; /* locked by needsbuffer_spin */
128static int bd_request_hw; /* locked by needsbuffer_spin */
129static struct spinlock needsbuffer_spin;
130
131/*
132 * Sysctls for operational control of the buffer cache.
133 */
134SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
135 "Number of dirty buffers to flush before bufdaemon becomes inactive");
136SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
137 "High watermark used to trigger explicit flushing of dirty buffers");
138SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
139 "Low watermark for special reserve in low-memory situations");
140SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
141 "High watermark for special reserve in low-memory situations");
142SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
143 "Minimum amount of buffer space required for active I/O");
144SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
145 "Maximum amount of buffer space to usable for active I/O");
146/*
147 * Sysctls determining current state of the buffer cache.
148 */
149SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
150 "Pending number of dirty buffers (all)");
151SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffershw, CTLFLAG_RD, &numdirtybuffershw, 0,
152 "Pending number of dirty buffers (heavy weight)");
153SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
154 "Number of free buffers on the buffer cache free list");
155SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
156 "I/O bytes currently in progress due to asynchronous writes");
157SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
158 "Hard limit on maximum amount of memory usable for buffer space");
159SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
160 "Soft limit on maximum amount of memory usable for buffer space");
161SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
162 "Minimum amount of memory to reserve for system buffer space");
163SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
164 "Amount of memory available for buffers");
165SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RD, &maxbufmallocspace,
166 0, "Maximum amount of memory reserved for buffers using malloc");
167SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
168 "Amount of memory left for buffers using malloc-scheme");
169SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, &getnewbufcalls, 0,
170 "New buffer header acquisition requests");
171SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD, &getnewbufrestarts,
172 0, "New buffer header acquisition restarts");
173SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RD, &bufdefragcnt, 0,
174 "Buffer acquisition restarts due to fragmented buffer map");
175SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RD, &buffreekvacnt, 0,
176 "Amount of time KVA space was deallocated in an arbitrary buffer");
177SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RD, &bufreusecnt, 0,
178 "Amount of time buffer re-use operations were successful");
179SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, 0, sizeof(struct buf),
180 "sizeof(struct buf)");
181
182char *buf_wmesg = BUF_WMESG;
183
184extern int vm_swap_size;
185
186#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
187#define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */
188#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
189#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
190
191/*
192 * numdirtywakeup:
193 *
194 * If someone is blocked due to there being too many dirty buffers,
195 * and numdirtybuffers is now reasonable, wake them up.
196 */
197static __inline void
198numdirtywakeup(void)
199{
200 if (numdirtybuffers <= (lodirtybuffers + hidirtybuffers) / 2) {
201 if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
202 spin_lock_wr(&needsbuffer_spin);
203 needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
204 spin_unlock_wr(&needsbuffer_spin);
205 wakeup(&needsbuffer);
206 }
207 }
208}
209
210/*
211 * bufspacewakeup:
212 *
213 * Called when buffer space is potentially available for recovery.
214 * getnewbuf() will block on this flag when it is unable to free
215 * sufficient buffer space. Buffer space becomes recoverable when
216 * bp's get placed back in the queues.
217 */
218
219static __inline void
220bufspacewakeup(void)
221{
222 /*
223 * If someone is waiting for BUF space, wake them up. Even
224 * though we haven't freed the kva space yet, the waiting
225 * process will be able to now.
226 */
227 if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
228 spin_lock_wr(&needsbuffer_spin);
229 needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
230 spin_unlock_wr(&needsbuffer_spin);
231 wakeup(&needsbuffer);
232 }
233}
234
235/*
236 * runningbufwakeup:
237 *
238 * Accounting for I/O in progress.
239 *
240 */
241static __inline void
242runningbufwakeup(struct buf *bp)
243{
244 if (bp->b_runningbufspace) {
245 runningbufspace -= bp->b_runningbufspace;
246 bp->b_runningbufspace = 0;
247 if (runningbufreq && runningbufspace <= lorunningspace) {
248 runningbufreq = 0;
249 wakeup(&runningbufreq);
250 }
251 }
252}
253
254/*
255 * bufcountwakeup:
256 *
257 * Called when a buffer has been added to one of the free queues to
258 * account for the buffer and to wakeup anyone waiting for free buffers.
259 * This typically occurs when large amounts of metadata are being handled
260 * by the buffer cache ( else buffer space runs out first, usually ).
261 */
262
263static __inline void
264bufcountwakeup(void)
265{
266 ++numfreebuffers;
267 if (needsbuffer) {
268 spin_lock_wr(&needsbuffer_spin);
269 needsbuffer &= ~VFS_BIO_NEED_ANY;
270 if (numfreebuffers >= hifreebuffers)
271 needsbuffer &= ~VFS_BIO_NEED_FREE;
272 spin_unlock_wr(&needsbuffer_spin);
273 wakeup(&needsbuffer);
274 }
275}
276
277/*
278 * waitrunningbufspace()
279 *
280 * runningbufspace is a measure of the amount of I/O currently
281 * running. This routine is used in async-write situations to
282 * prevent creating huge backups of pending writes to a device.
283 * Only asynchronous writes are governed by this function.
284 *
285 * Reads will adjust runningbufspace, but will not block based on it.
286 * The read load has a side effect of reducing the allowed write load.
287 *
288 * This does NOT turn an async write into a sync write. It waits
289 * for earlier writes to complete and generally returns before the
290 * caller's write has reached the device.
291 */
292static __inline void
293waitrunningbufspace(void)
294{
295 if (runningbufspace > hirunningspace) {
296 crit_enter();
297 while (runningbufspace > hirunningspace) {
298 ++runningbufreq;
299 tsleep(&runningbufreq, 0, "wdrain", 0);
300 }
301 crit_exit();
302 }
303}
304
305/*
306 * vfs_buf_test_cache:
307 *
308 * Called when a buffer is extended. This function clears the B_CACHE
309 * bit if the newly extended portion of the buffer does not contain
310 * valid data.
311 */
312static __inline__
313void
314vfs_buf_test_cache(struct buf *bp,
315 vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
316 vm_page_t m)
317{
318 if (bp->b_flags & B_CACHE) {
319 int base = (foff + off) & PAGE_MASK;
320 if (vm_page_is_valid(m, base, size) == 0)
321 bp->b_flags &= ~B_CACHE;
322 }
323}
324
325/*
326 * bd_wakeup:
327 *
328 * Wake up the buffer daemon if the number of outstanding dirty buffers
329 * is above specified threshold 'dirtybuflevel'.
330 *
331 * The buffer daemons are explicitly woken up when (a) the pending number
332 * of dirty buffers exceeds the recovery and stall mid-point value,
333 * (b) during bwillwrite() or (c) buf freelist was exhausted.
334 *
335 * The buffer daemons will generally not stop flushing until the dirty
336 * buffer count goes below lodirtybuffers.
337 */
338static __inline__
339void
340bd_wakeup(int dirtybuflevel)
341{
342 if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
343 spin_lock_wr(&needsbuffer_spin);
344 bd_request = 1;
345 spin_unlock_wr(&needsbuffer_spin);
346 wakeup(&bd_request);
347 }
348 if (bd_request_hw == 0 && numdirtybuffershw >= dirtybuflevel) {
349 spin_lock_wr(&needsbuffer_spin);
350 bd_request_hw = 1;
351 spin_unlock_wr(&needsbuffer_spin);
352 wakeup(&bd_request_hw);
353 }
354}
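
/*
 * Illustrative sketch: callers such as bdirty() and bdwrite() below wake
 * the daemons at the mid-point between the recovery point (lodirtybuffers)
 * and the stall point (hidirtybuffers):
 *
 *      bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
 */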
355
356/*
357 * bd_speedup:
358 *
359 * Speed up the buffer cache flushing process.
360 */
361
362static __inline__
363void
364bd_speedup(void)
365{
366 bd_wakeup(1);
367}
368
369/*
370 * bufinit:
371 *
372 * Load time initialisation of the buffer cache, called from machine
373 * dependent initialization code.
374 */
375void
376bufinit(void)
377{
378 struct buf *bp;
379 vm_offset_t bogus_offset;
380 int i;
381
382 spin_init(&needsbuffer_spin);
383
384 /* next, make a null set of free lists */
385 for (i = 0; i < BUFFER_QUEUES; i++)
386 TAILQ_INIT(&bufqueues[i]);
387
388 /* finally, initialize each buffer header and stick on empty q */
389 for (i = 0; i < nbuf; i++) {
390 bp = &buf[i];
391 bzero(bp, sizeof *bp);
392 bp->b_flags = B_INVAL; /* we're just an empty header */
393 bp->b_cmd = BUF_CMD_DONE;
394 bp->b_qindex = BQUEUE_EMPTY;
395 initbufbio(bp);
396 xio_init(&bp->b_xio);
397 buf_dep_init(bp);
398 BUF_LOCKINIT(bp);
399 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_EMPTY], bp, b_freelist);
400 }
401
402 /*
403 * maxbufspace is the absolute maximum amount of buffer space we are
404 * allowed to reserve in KVM and in real terms. The absolute maximum
405 * is nominally used by buf_daemon. hibufspace is the nominal maximum
406 * used by most other processes. The differential is required to
407 * ensure that buf_daemon is able to run when other processes might
408 * be blocked waiting for buffer space.
409 *
410 * maxbufspace is based on BKVASIZE. Allocating buffers larger than
411 * this may result in KVM fragmentation which is not handled optimally
412 * by the system.
413 */
414 maxbufspace = nbuf * BKVASIZE;
415 hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
416 lobufspace = hibufspace - MAXBSIZE;
417
418 lorunningspace = 512 * 1024;
419 hirunningspace = 1024 * 1024;
420
421/*
422 * Limit the amount of malloc memory since it is wired permanently into
423 * the kernel space. Even though this is accounted for in the buffer
424 * allocation, we don't want the malloced region to grow uncontrolled.
425 * The malloc scheme improves memory utilization significantly for average
426 * (small) directories.
427 */
428 maxbufmallocspace = hibufspace / 20;
429
430/*
431 * Reduce the chance of a deadlock occurring by limiting the number
432 * of delayed-write dirty buffers we allow to stack up.
433 */
434 hidirtybuffers = nbuf / 4 + 20;
435 numdirtybuffers = 0;
436 numdirtybuffershw = 0;
437/*
438 * To support extreme low-memory systems, make sure hidirtybuffers cannot
439 * eat up all available buffer space. This occurs when our minimum cannot
440 * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming
441 * BKVASIZE'd (8K) buffers.
442 */
443 while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
444 hidirtybuffers >>= 1;
445 }
446 lodirtybuffers = hidirtybuffers / 2;
447
448/*
449 * Try to keep the number of free buffers in the specified range,
450 * and give special processes (e.g. like buf_daemon) access to an
451 * emergency reserve.
452 */
453 lofreebuffers = nbuf / 18 + 5;
454 hifreebuffers = 2 * lofreebuffers;
455 numfreebuffers = nbuf;
456
457/*
458 * Maximum number of async ops initiated per buf_daemon loop. This is
459 * somewhat of a hack at the moment; we really need to limit ourselves
460 * based on the number of bytes of I/O in-transit that were initiated
461 * from buf_daemon.
462 */
463
464 bogus_offset = kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
465 bogus_page = vm_page_alloc(&kernel_object,
466 (bogus_offset >> PAGE_SHIFT),
467 VM_ALLOC_NORMAL);
468 vmstats.v_wire_count++;
469
470}
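
/*
 * Illustrative arithmetic for the sizing above, using assumed values
 * nbuf = 1024, BKVASIZE = 8K and MAXBSIZE = 64K (the real values are
 * machine and configuration dependent):
 *
 *      maxbufspace    = 1024 * 8K                    = 8MB
 *      hibufspace     = imax(3*8MB/4, 8MB - 10*64K)  = ~7.4MB
 *      lobufspace     = hibufspace - 64K             = ~7.3MB
 *      hidirtybuffers = 1024/4 + 20                  = 276
 *
 * 276 * 8K is well under 3/4 of hibufspace, so hidirtybuffers is not
 * halved and lodirtybuffers becomes 138.
 */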
471
472/*
473 * Initialize the embedded bio structures
474 */
475void
476initbufbio(struct buf *bp)
477{
478 bp->b_bio1.bio_buf = bp;
479 bp->b_bio1.bio_prev = NULL;
480 bp->b_bio1.bio_offset = NOOFFSET;
481 bp->b_bio1.bio_next = &bp->b_bio2;
482 bp->b_bio1.bio_done = NULL;
483
484 bp->b_bio2.bio_buf = bp;
485 bp->b_bio2.bio_prev = &bp->b_bio1;
486 bp->b_bio2.bio_offset = NOOFFSET;
487 bp->b_bio2.bio_next = NULL;
488 bp->b_bio2.bio_done = NULL;
489}
490
491/*
492 * Reinitialize the embedded bio structures as well as any additional
493 * translation cache layers.
494 */
495void
496reinitbufbio(struct buf *bp)
497{
498 struct bio *bio;
499
500 for (bio = &bp->b_bio1; bio; bio = bio->bio_next) {
501 bio->bio_done = NULL;
502 bio->bio_offset = NOOFFSET;
503 }
504}
505
506/*
507 * Push another BIO layer onto an existing BIO and return it. The new
508 * BIO layer may already exist, holding cached translation data.
509 */
510struct bio *
511push_bio(struct bio *bio)
512{
513 struct bio *nbio;
514
515 if ((nbio = bio->bio_next) == NULL) {
516 int index = bio - &bio->bio_buf->b_bio_array[0];
517 if (index >= NBUF_BIO - 1) {
518 panic("push_bio: too many layers bp %p\n",
519 bio->bio_buf);
520 }
521 nbio = &bio->bio_buf->b_bio_array[index + 1];
522 bio->bio_next = nbio;
523 nbio->bio_prev = bio;
524 nbio->bio_buf = bio->bio_buf;
525 nbio->bio_offset = NOOFFSET;
526 nbio->bio_done = NULL;
527 nbio->bio_next = NULL;
528 }
529 KKASSERT(nbio->bio_done == NULL);
530 return(nbio);
531}
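
/*
 * Illustrative sketch (the translated offset, the completion callback and
 * the 'devvp' vnode are hypothetical): a layered strategy routine can push
 * a BIO level to hold its own translated offset while leaving the caller's
 * BIO untouched:
 *
 *      struct bio *nbio = push_bio(bio);
 *
 *      nbio->bio_offset = translated_offset;
 *      nbio->bio_done = my_layer_done;
 *      vn_strategy(devvp, nbio);
 */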
532
533void
534pop_bio(struct bio *bio)
535{
536 /* NOP */
537}
538
539void
540clearbiocache(struct bio *bio)
541{
542 while (bio) {
543 bio->bio_offset = NOOFFSET;
544 bio = bio->bio_next;
545 }
546}
547
548/*
549 * bfreekva:
550 *
551 * Free the KVA allocation for buffer 'bp'.
552 *
553 * Must be called from a critical section as this is the only locking for
554 * buffer_map.
555 *
556 * Since this call frees up buffer space, we call bufspacewakeup().
557 */
558static void
559bfreekva(struct buf *bp)
560{
561 int count;
562
563 if (bp->b_kvasize) {
564 ++buffreekvacnt;
565 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
566 vm_map_lock(&buffer_map);
567 bufspace -= bp->b_kvasize;
568 vm_map_delete(&buffer_map,
569 (vm_offset_t) bp->b_kvabase,
570 (vm_offset_t) bp->b_kvabase + bp->b_kvasize,
571 &count
572 );
573 vm_map_unlock(&buffer_map);
574 vm_map_entry_release(count);
575 bp->b_kvasize = 0;
576 bufspacewakeup();
577 }
578}
579
580/*
581 * bremfree:
582 *
583 * Remove the buffer from the appropriate free list.
584 */
585void
586bremfree(struct buf *bp)
587{
588 int old_qindex;
589
590 crit_enter();
591 old_qindex = bp->b_qindex;
592
593 if (bp->b_qindex != BQUEUE_NONE) {
594 KASSERT(BUF_REFCNTNB(bp) == 1,
595 ("bremfree: bp %p not locked",bp));
596 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
597 bp->b_qindex = BQUEUE_NONE;
598 } else {
599 if (BUF_REFCNTNB(bp) <= 1)
600 panic("bremfree: removing a buffer not on a queue");
601 }
602
603 /*
604 * Fixup numfreebuffers count. If the buffer is invalid or not
605 * delayed-write, and it was on one of the free queues handled below,
606 * the buffer was free and we must decrement numfreebuffers.
607 */
608 if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
609 switch(old_qindex) {
610 case BQUEUE_DIRTY:
611 case BQUEUE_DIRTY_HW:
612 case BQUEUE_CLEAN:
613 case BQUEUE_EMPTY:
614 case BQUEUE_EMPTYKVA:
615 --numfreebuffers;
616 break;
617 default:
618 break;
619 }
620 }
621 crit_exit();
622}
623
624
625/*
626 * bread:
627 *
628 * Get a buffer with the specified data. Look in the cache first. We
629 * must clear B_ERROR and B_INVAL prior to initiating I/O. If B_CACHE
630 * is set, the buffer is valid and we do not have to do anything ( see
631 * getblk() ).
632 */
633int
634bread(struct vnode *vp, off_t loffset, int size, struct buf **bpp)
635{
636 struct buf *bp;
637
638 bp = getblk(vp, loffset, size, 0, 0);
639 *bpp = bp;
640
641 /* if not found in cache, do some I/O */
642 if ((bp->b_flags & B_CACHE) == 0) {
643 KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp));
644 bp->b_flags &= ~(B_ERROR | B_INVAL);
645 bp->b_cmd = BUF_CMD_READ;
646 vfs_busy_pages(vp, bp);
647 vn_strategy(vp, &bp->b_bio1);
648 return (biowait(bp));
649 }
650 return (0);
651}
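
/*
 * Illustrative usage sketch ('vp', 'blkoff' and 'blksize' stand in for a
 * filesystem's own variables): a typical synchronous, cached read of a
 * block looks like:
 *
 *      struct buf *bp;
 *      int error;
 *
 *      error = bread(vp, blkoff, blksize, &bp);
 *      if (error) {
 *              brelse(bp);             -- *bpp is returned even on error
 *              return (error);
 *      }
 *      ... consume bp->b_data ...
 *      bqrelse(bp);                    -- keep the data cached for reuse
 */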
652
653/*
654 * breadn:
655 *
656 * Operates like bread, but also starts asynchronous I/O on
657 * read-ahead blocks. We must clear B_ERROR and B_INVAL prior
658 * to initiating I/O . If B_CACHE is set, the buffer is valid
659 * and we do not have to do anything.
660 */
661int
662breadn(struct vnode *vp, off_t loffset, int size, off_t *raoffset,
663 int *rabsize, int cnt, struct buf **bpp)
664{
665 struct buf *bp, *rabp;
666 int i;
667 int rv = 0, readwait = 0;
668
669 *bpp = bp = getblk(vp, loffset, size, 0, 0);
670
671 /* if not found in cache, do some I/O */
672 if ((bp->b_flags & B_CACHE) == 0) {
673 bp->b_flags &= ~(B_ERROR | B_INVAL);
674 bp->b_cmd = BUF_CMD_READ;
675 vfs_busy_pages(vp, bp);
676 vn_strategy(vp, &bp->b_bio1);
677 ++readwait;
678 }
679
680 for (i = 0; i < cnt; i++, raoffset++, rabsize++) {
681 if (inmem(vp, *raoffset))
682 continue;
683 rabp = getblk(vp, *raoffset, *rabsize, 0, 0);
684
685 if ((rabp->b_flags & B_CACHE) == 0) {
686 rabp->b_flags |= B_ASYNC;
687 rabp->b_flags &= ~(B_ERROR | B_INVAL);
688 rabp->b_cmd = BUF_CMD_READ;
689 vfs_busy_pages(vp, rabp);
690 BUF_KERNPROC(rabp);
691 vn_strategy(vp, &rabp->b_bio1);
692 } else {
693 brelse(rabp);
694 }
695 }
696
697 if (readwait) {
698 rv = biowait(bp);
699 }
700 return (rv);
701}
702
703/*
704 * bwrite:
705 *
706 * Write, release buffer on completion. (Done by iodone
707 * if async). Do not bother writing anything if the buffer
708 * is invalid.
709 *
710 * Note that we set B_CACHE here, indicating that buffer is
711 * fully valid and thus cacheable. This is true even of NFS
712 * now so we set it generally. This could be set either here
713 * or in biodone() since the I/O is synchronous. We put it
714 * here.
715 */
716int
717bwrite(struct buf *bp)
718{
719 int oldflags;
720
721 if (bp->b_flags & B_INVAL) {
722 brelse(bp);
723 return (0);
724 }
725
726 oldflags = bp->b_flags;
727
728 if (BUF_REFCNTNB(bp) == 0)
729 panic("bwrite: buffer is not busy???");
730 crit_enter();
731
732 /* Mark the buffer clean */
733 bundirty(bp);
734
735 bp->b_flags &= ~B_ERROR;
736 bp->b_flags |= B_CACHE;
737 bp->b_cmd = BUF_CMD_WRITE;
738 vfs_busy_pages(bp->b_vp, bp);
739
740 /*
741 * Normal bwrites pipeline writes. NOTE: b_bufsize is only
742 * valid for vnode-backed buffers.
743 */
744 bp->b_runningbufspace = bp->b_bufsize;
745 runningbufspace += bp->b_runningbufspace;
746
747 crit_exit();
748 if (oldflags & B_ASYNC)
749 BUF_KERNPROC(bp);
750 vn_strategy(bp->b_vp, &bp->b_bio1);
751
752 if ((oldflags & B_ASYNC) == 0) {
753 int rtval = biowait(bp);
754 brelse(bp);
755 return (rtval);
756 } else if ((oldflags & B_NOWDRAIN) == 0) {
757 /*
758 * don't allow the async write to saturate the I/O
759 * system. Deadlocks can occur only if a device strategy
760 * routine (like in VN) turns around and issues another
761 * high-level write, in which case B_NOWDRAIN is expected
762 * to be set. Otherwise we will not deadlock here because
763 * we are blocking waiting for I/O that is already in-progress
764 * to complete.
765 */
766 waitrunningbufspace();
767 }
768
769 return (0);
770}
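
/*
 * Illustrative sketch: a caller that has modified bp->b_data can push the
 * buffer out synchronously with bwrite(), which waits for completion and
 * releases the buffer, or asynchronously via bawrite()/bowrite(), which
 * simply set B_ASYNC (and B_ORDERED) before calling bwrite():
 *
 *      error = bwrite(bp);     -- synchronous, bp released for us
 *      bawrite(bp);            -- asynchronous, released on completion
 */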
771
772/*
773 * bdwrite:
774 *
775 * Delayed write. (Buffer is marked dirty). Do not bother writing
776 * anything if the buffer is marked invalid.
777 *
778 * Note that since the buffer must be completely valid, we can safely
779 * set B_CACHE. In fact, we have to set B_CACHE here rather than in
780 * biodone() in order to prevent getblk from writing the buffer
781 * out synchronously.
782 */
783void
784bdwrite(struct buf *bp)
785{
786 if (BUF_REFCNTNB(bp) == 0)
787 panic("bdwrite: buffer is not busy");
788
789 if (bp->b_flags & B_INVAL) {
790 brelse(bp);
791 return;
792 }
793 bdirty(bp);
794
795 /*
796 * Set B_CACHE, indicating that the buffer is fully valid. This is
797 * true even of NFS now.
798 */
799 bp->b_flags |= B_CACHE;
800
801 /*
802 * This bmap keeps the system from needing to do the bmap later,
803 * perhaps when the system is attempting to do a sync. Since it
804 * is likely that the indirect block -- or whatever other datastructure
805 * that the filesystem needs is still in memory now, it is a good
806 * thing to do this. Note also, that if the pageout daemon is
807 * requesting a sync -- there might not be enough memory to do
808 * the bmap then... So, this is important to do.
809 */
810 if (bp->b_bio2.bio_offset == NOOFFSET) {
811 VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
812 NULL, NULL);
813 }
814
815 /*
816 * Set the *dirty* buffer range based upon the VM system dirty pages.
817 */
818 vfs_setdirty(bp);
819
820 /*
821 * We need to do this here to satisfy the vnode_pager and the
822 * pageout daemon, so that it thinks that the pages have been
823 * "cleaned". Note that since the pages are in a delayed write
824 * buffer -- the VFS layer "will" see that the pages get written
825 * out on the next sync, or perhaps the cluster will be completed.
826 */
827 vfs_clean_pages(bp);
828 bqrelse(bp);
829
830 /*
831 * Wakeup the buffer flushing daemon if we have a lot of dirty
832 * buffers (midpoint between our recovery point and our stall
833 * point).
834 */
835 bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
836
837 /*
838 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
839 * due to the softdep code.
840 */
841}
842
843/*
844 * bdirty:
845 *
846 * Turn buffer into delayed write request by marking it B_DELWRI.
847 * B_RELBUF and B_NOCACHE must be cleared.
848 *
849 * We reassign the buffer to itself to properly update it in the
850 * dirty/clean lists.
851 *
852 * Since the buffer is not on a queue, we do not update the
853 * numfreebuffers count.
854 *
855 * Must be called from a critical section.
856 * The buffer must be on BQUEUE_NONE.
857 */
858void
859bdirty(struct buf *bp)
860{
861 KASSERT(bp->b_qindex == BQUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
862 if (bp->b_flags & B_NOCACHE) {
863 kprintf("bdirty: clearing B_NOCACHE on buf %p\n", bp);
864 bp->b_flags &= ~B_NOCACHE;
865 }
866 if (bp->b_flags & B_INVAL) {
867 kprintf("bdirty: warning, dirtying invalid buffer %p\n", bp);
868 }
869 bp->b_flags &= ~B_RELBUF;
870
871 if ((bp->b_flags & B_DELWRI) == 0) {
872 bp->b_flags |= B_DELWRI;
873 reassignbuf(bp);
874 ++numdirtybuffers;
875 if (bp->b_flags & B_HEAVY)
876 ++numdirtybuffershw;
877 bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
878 }
879}
880
881/*
882 * Set B_HEAVY, indicating that this is a heavy-weight buffer that
883 * needs to be flushed with a different buf_daemon thread to avoid
884 * deadlocks. B_HEAVY also imposes restrictions in getnewbuf().
885 */
886void
887bheavy(struct buf *bp)
888{
889 if ((bp->b_flags & B_HEAVY) == 0) {
890 bp->b_flags |= B_HEAVY;
891 if (bp->b_flags & B_DELWRI)
892 ++numdirtybuffershw;
893 }
894}
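
/*
 * Illustrative sketch: a filesystem whose flush path may itself need to
 * allocate further buffers (the deadlock case this commit addresses)
 * marks such buffers heavy before the delayed write, routing them to
 * BQUEUE_DIRTY_HW and the dedicated buf_daemon_hw thread:
 *
 *      bheavy(bp);
 *      bdwrite(bp);
 */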
895
896/*
897 * bundirty:
898 *
899 * Clear B_DELWRI for buffer.
900 *
901 * Since the buffer is not on a queue, we do not update the numfreebuffers
902 * count.
903 *
904 * Must be called from a critical section.
905 *
906 * The buffer is typically on BQUEUE_NONE but there is one case in
907 * brelse() that calls this function after placing the buffer on
908 * a different queue.
909 */
910
911void
912bundirty(struct buf *bp)
913{
914 if (bp->b_flags & B_DELWRI) {
915 bp->b_flags &= ~B_DELWRI;
916 reassignbuf(bp);
917 --numdirtybuffers;
918 if (bp->b_flags & B_HEAVY)
919 --numdirtybuffershw;
920 numdirtywakeup();
921 }
922 /*
923 * Since it is now being written, we can clear its deferred write flag.
924 */
925 bp->b_flags &= ~B_DEFERRED;
926}
927
928/*
929 * bawrite:
930 *
931 * Asynchronous write. Start output on a buffer, but do not wait for
932 * it to complete. The buffer is released when the output completes.
933 *
934 * bwrite() ( or the VOP routine anyway ) is responsible for handling
935 * B_INVAL buffers. Not us.
936 */
937void
938bawrite(struct buf *bp)
939{
940 bp->b_flags |= B_ASYNC;
941 bwrite(bp);
942}
943
944/*
945 * bowrite:
946 *
947 * Ordered write. Start output on a buffer, and flag it so that the
948 * device will write it in the order it was queued. The buffer is
949 * released when the output completes. bwrite() ( or the VOP routine
950 * anyway ) is responsible for handling B_INVAL buffers.
951 */
952int
953bowrite(struct buf *bp)
954{
955 bp->b_flags |= B_ORDERED | B_ASYNC;
956 return (bwrite(bp));
957}
958
959/*
960 * bwillwrite:
961 *
962 * Called prior to the locking of any vnodes when we are expecting to
963 * write. We do not want to starve the buffer cache with too many
964 * dirty buffers so we block here. By blocking prior to the locking
965 * of any vnodes we attempt to avoid the situation where a locked vnode
966 * prevents the various system daemons from flushing related buffers.
967 */
968void
969bwillwrite(void)
970{
971 if (numdirtybuffers >= hidirtybuffers) {
972 while (numdirtybuffers >= hidirtybuffers) {
973 bd_wakeup(1);
974 spin_lock_wr(&needsbuffer_spin);
975 if (numdirtybuffers >= hidirtybuffers) {
976 needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
977 msleep(&needsbuffer, &needsbuffer_spin, 0,
978 "flswai", 0);
979 }
980 spin_unlock_wr(&needsbuffer_spin);
981 }
982 }
983}
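
/*
 * Illustrative sketch (the surrounding write path is an assumption): the
 * throttle is applied before any vnode locks are taken so the flushing
 * daemons are never blocked behind the caller:
 *
 *      bwillwrite();           -- may sleep until the dirty count drops
 *      ... lock the vnode and perform the write ...
 */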
984
985/*
986 * buf_dirty_count_severe:
987 *
988 * Return true if we have too many dirty buffers.
989 */
990int
991buf_dirty_count_severe(void)
992{
993 return(numdirtybuffers >= hidirtybuffers);
994}
995
996/*
997 * brelse:
998 *
999 * Release a busy buffer and, if requested, free its resources. The
1000 * buffer will be stashed in the appropriate bufqueue[] allowing it
1001 * to be accessed later as a cache entity or reused for other purposes.
1002 */
1003void
1004brelse(struct buf *bp)
1005{
1006#ifdef INVARIANTS
1007 int saved_flags = bp->b_flags;
1008#endif
1009
1010 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1011
1012 crit_enter();
1013
1014 /*
1015 * If B_NOCACHE is set we are being asked to destroy the buffer and
1016 * its backing store. Clear B_DELWRI.
1017 *
1018 * B_NOCACHE is set in two cases: (1) when the caller really wants
1019 * to destroy the buffer and backing store and (2) when the caller
1020 * wants to destroy the buffer and backing store after a write
1021 * completes.
1022 */
1023 if ((bp->b_flags & (B_NOCACHE|B_DELWRI)) == (B_NOCACHE|B_DELWRI)) {
1024 bundirty(bp);
1025 }
1026
1027 if (bp->b_flags & B_LOCKED)
1028 bp->b_flags &= ~B_ERROR;
1029
1030 /*
1031 * If a write error occurs and the caller does not want to throw
1032 * away the buffer, redirty the buffer. This will also clear
1033 * B_NOCACHE.
1034 */
1035 if (bp->b_cmd == BUF_CMD_WRITE &&
1036 (bp->b_flags & (B_ERROR | B_INVAL)) == B_ERROR) {
1037 /*
1038 * Failed write, redirty. Must clear B_ERROR to prevent
1039 * pages from being scrapped. If B_INVAL is set then
1040 * this case is not run and the next case is run to
1041 * destroy the buffer. B_INVAL can occur if the buffer
1042 * is outside the range supported by the underlying device.
1043 */
1044 bp->b_flags &= ~B_ERROR;
1045 bdirty(bp);
1046 } else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
1047 (bp->b_bufsize <= 0) || bp->b_cmd == BUF_CMD_FREEBLKS) {
1048 /*
1049 * Either a failed I/O or we were asked to free or not
1050 * cache the buffer.
1051 *
1052 * NOTE: HAMMER will set B_LOCKED in buf_deallocate if the
1053 * buffer cannot be immediately freed.
1054 */
1055 bp->b_flags |= B_INVAL;
1056 if (LIST_FIRST(&bp->b_dep) != NULL)
1057 buf_deallocate(bp);
1058 if (bp->b_flags & B_DELWRI) {
1059 --numdirtybuffers;
1060 if (bp->b_flags & B_HEAVY)
1061 --numdirtybuffershw;
1062 numdirtywakeup();
1063 }
1064 bp->b_flags &= ~(B_DELWRI | B_CACHE);
1065 }
1066
1067 /*
1068 * We must clear B_RELBUF if B_DELWRI or B_LOCKED is set.
1069 * If vfs_vmio_release() is called with either bit set, the
1070 * underlying pages may wind up getting freed causing a previous
1071 * write (bdwrite()) to get 'lost' because pages associated with
1072 * a B_DELWRI bp are marked clean. Pages associated with a
1073 * B_LOCKED buffer may be mapped by the filesystem.
1074 *
1075 * If we want to release the buffer ourselves (rather then the
1076 * originator asking us to release it), give the originator a
1077 * chance to countermand the release by setting B_LOCKED.
1078 *
1079 * We still allow the B_INVAL case to call vfs_vmio_release(), even
1080 * if B_DELWRI is set.
1081 *
1082 * If B_DELWRI is not set we may have to set B_RELBUF if we are low
1083 * on pages to return pages to the VM page queues.
1084 */
1085 if (bp->b_flags & (B_DELWRI | B_LOCKED)) {
1086 bp->b_flags &= ~B_RELBUF;
1087 } else if (vm_page_count_severe()) {
1088 buf_deallocate(bp);
1089 if (bp->b_flags & (B_DELWRI | B_LOCKED))
1090 bp->b_flags &= ~B_RELBUF;
1091 else
1092 bp->b_flags |= B_RELBUF;
1093 }
1094
1095 /*
1096 * At this point destroying the buffer is governed by the B_INVAL
1097 * or B_RELBUF flags.
1098 */
1099 bp->b_cmd = BUF_CMD_DONE;
1100
1101 /*
1102 * VMIO buffer rundown. Make sure the VM page array is restored
1103 * after an I/O may have replaced some of the pages with bogus pages
1104 * in order to not destroy dirty pages in a fill-in read.
1105 *
1106 * Note that due to the code above, if a buffer is marked B_DELWRI
1107 * then the B_RELBUF and B_NOCACHE bits will always be clear.
1108 * B_INVAL may still be set, however.
1109 *
1110 * For clean buffers, B_INVAL or B_RELBUF will destroy the buffer
1111 * but not the backing store. B_NOCACHE will destroy the backing
1112 * store.
1113 *
1114 * Note that dirty NFS buffers contain byte-granular write ranges
1115 * and should not be destroyed w/ B_INVAL even if the backing store
1116 * is left intact.
1117 */
1118 if (bp->b_flags & B_VMIO) {
1119 /*
1120 * Rundown for VMIO buffers which are not dirty NFS buffers.
1121 */
1122 int i, j, resid;
1123 vm_page_t m;
1124 off_t foff;
1125 vm_pindex_t poff;
1126 vm_object_t obj;
1127 struct vnode *vp;
1128
1129 vp = bp->b_vp;
1130
1131 /*
1132 * Get the base offset and length of the buffer. Note that
1133 * in the VMIO case if the buffer block size is not
1134 * page-aligned then the b_data pointer may not be page-aligned.
1135 * But our b_xio.xio_pages array *IS* page aligned.
1136 *
1137 * block sizes less than DEV_BSIZE (usually 512) are not
1138 * supported due to the page granularity bits (m->valid,
1139 * m->dirty, etc...).
1140 *
1141 * See man buf(9) for more information
1142 */
1143
1144 resid = bp->b_bufsize;
1145 foff = bp->b_loffset;
1146
1147 for (i = 0; i < bp->b_xio.xio_npages; i++) {
1148 m = bp->b_xio.xio_pages[i];
1149 vm_page_flag_clear(m, PG_ZERO);
1150 /*
1151 * If we hit a bogus page, fixup *all* of them
1152 * now. Note that we left these pages wired
1153 * when we removed them so they had better exist,
1154 * and they cannot be ripped out from under us so
1155 * no critical section protection is necessary.
1156 */
1157 if (m == bogus_page) {
1158 obj = vp->v_object;
1159 poff = OFF_TO_IDX(bp->b_loffset);
1160
1161 for (j = i; j < bp->b_xio.xio_npages; j++) {
1162 vm_page_t mtmp;
1163
1164 mtmp = bp->b_xio.xio_pages[j];
1165 if (mtmp == bogus_page) {
1166 mtmp = vm_page_lookup(obj, poff + j);
1167 if (!mtmp) {
1168 panic("brelse: page missing");
1169 }
1170 bp->b_xio.xio_pages[j] = mtmp;
1171 }
1172 }
1173
1174 if ((bp->b_flags & B_INVAL) == 0) {
1175 pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
1176 bp->b_xio.xio_pages, bp->b_xio.xio_npages);
1177 }
1178 m = bp->b_xio.xio_pages[i];
1179 }
1180
1181 /*
1182 * Invalidate the backing store if B_NOCACHE is set
1183 * (e.g. used with vinvalbuf()). If this is NFS
1184 * we impose a requirement that the block size be
1185 * a multiple of PAGE_SIZE and create a temporary
1186 * hack to basically invalidate the whole page. The
1187 * problem is that NFS uses really odd buffer sizes
1188 * especially when tracking piecemeal writes and
1189 * it also vinvalbuf()'s a lot, which would result
1190 * in only partial page validation and invalidation
1191 * here. If the file page is mmap()'d, however,
1192 * all the valid bits get set so after we invalidate
1193 * here we would end up with weird m->valid values
1194 * like 0xfc. nfs_getpages() can't handle this so
1195 * we clear all the valid bits for the NFS case
1196 * instead of just some of them.
1197 *
1198 * The real bug is the VM system having to set m->valid
1199 * to VM_PAGE_BITS_ALL for faulted-in pages, which
1200 * itself is an artifact of the whole 512-byte
1201 * granular mess that exists to support odd block
1202 * sizes and UFS meta-data block sizes (e.g. 6144).
1203 * A complete rewrite is required.
1204 */
1205 if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
1206 int poffset = foff & PAGE_MASK;
1207 int presid;
1208
1209 presid = PAGE_SIZE - poffset;
1210 if (bp->b_vp->v_tag == VT_NFS &&
1211 bp->b_vp->v_type == VREG) {
1212 ; /* entire page */
1213 } else if (presid > resid) {
1214 presid = resid;
1215 }
1216 KASSERT(presid >= 0, ("brelse: extra page"));
1217 vm_page_set_invalid(m, poffset, presid);
1218 }
1219 resid -= PAGE_SIZE - (foff & PAGE_MASK);
1220 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
1221 }
1222 if (bp->b_flags & (B_INVAL | B_RELBUF))
1223 vfs_vmio_release(bp);
1224 } else {
1225 /*
1226 * Rundown for non-VMIO buffers.
1227 */
1228 if (bp->b_flags & (B_INVAL | B_RELBUF)) {
1229#if 0
1230 if (bp->b_vp)
1231 kprintf("brelse bp %p %08x/%08x: Warning, caught and fixed brelvp bug\n", bp, saved_flags, bp->b_flags);
1232#endif
1233 if (bp->b_bufsize)
1234 allocbuf(bp, 0);
1235 if (bp->b_vp)
1236 brelvp(bp);
1237 }
1238 }
1239
1240 if (bp->b_qindex != BQUEUE_NONE)
1241 panic("brelse: free buffer onto another queue???");
1242 if (BUF_REFCNTNB(bp) > 1) {
1243 /* Temporary panic to verify exclusive locking */
1244 /* This panic goes away when we allow shared refs */
1245 panic("brelse: multiple refs");
1246 /* do not release to free list */
1247 BUF_UNLOCK(bp);
1248 crit_exit();
1249 return;
1250 }
1251
1252 /*
1253 * Figure out the correct queue to place the cleaned up buffer on.
1254 * Buffers placed in the EMPTY or EMPTYKVA had better already be
1255 * disassociated from their vnode.
1256 */
1257 if (bp->b_flags & B_LOCKED) {
1258 /*
1259 * Buffers that are locked are placed in the locked queue
1260 * immediately, regardless of their state.
1261 */
1262 bp->b_qindex = BQUEUE_LOCKED;
1263 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_LOCKED], bp, b_freelist);
1264 } else if (bp->b_bufsize == 0) {
1265 /*
1266 * Buffers with no memory. Due to conditionals near the top
1267 * of brelse() such buffers should probably already be
1268 * marked B_INVAL and disassociated from their vnode.
1269 */
1270 bp->b_flags |= B_INVAL;
1271 KASSERT(bp->b_vp == NULL, ("bp1 %p flags %08x/%08x vnode %p unexpectedly still associated!", bp, saved_flags, bp->b_flags, bp->b_vp));
1272 KKASSERT((bp->b_flags & B_HASHED) == 0);
1273 if (bp->b_kvasize) {
1274 bp->b_qindex = BQUEUE_EMPTYKVA;
1275 } else {
1276 bp->b_qindex = BQUEUE_EMPTY;
1277 }
1278 TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
1279 } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
1280 /*
1281 * Buffers with junk contents. Again these buffers had better
1282 * already be disassociated from their vnode.
1283 */
1284 KASSERT(bp->b_vp == NULL, ("bp2 %p flags %08x/%08x vnode %p unexpectedly still associated!", bp, saved_flags, bp->b_flags, bp->b_vp));
1285 KKASSERT((bp->b_flags & B_HASHED) == 0);
1286 bp->b_flags |= B_INVAL;
1287 bp->b_qindex = BQUEUE_CLEAN;
1288 TAILQ_INSERT_HEAD(&bufqueues[BQUEUE_CLEAN], bp, b_freelist);
1289 } else {
1290 /*
1291 * Remaining buffers. These buffers are still associated with
1292 * their vnode.
1293 */
1294 switch(bp->b_flags & (B_DELWRI|B_HEAVY|B_AGE)) {
1295 case B_DELWRI | B_AGE:
1296 bp->b_qindex = BQUEUE_DIRTY;
1297 TAILQ_INSERT_HEAD(&bufqueues[BQUEUE_DIRTY], bp, b_freelist);
1298 break;
1299 case B_DELWRI:
1300 bp->b_qindex = BQUEUE_DIRTY;
1301 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_DIRTY], bp, b_freelist);
1302 break;
1303 case B_DELWRI | B_HEAVY | B_AGE:
1304 bp->b_qindex = BQUEUE_DIRTY_HW;
1305 TAILQ_INSERT_HEAD(&bufqueues[BQUEUE_DIRTY_HW], bp,
1306 b_freelist);
1307 break;
1308 case B_DELWRI | B_HEAVY:
1309 bp->b_qindex = BQUEUE_DIRTY_HW;
1310 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_DIRTY_HW], bp,
1311 b_freelist);
1312 break;
1313 case B_HEAVY | B_AGE:
1314 case B_AGE:
1315 bp->b_qindex = BQUEUE_CLEAN;
1316 TAILQ_INSERT_HEAD(&bufqueues[BQUEUE_CLEAN], bp, b_freelist);
1317 break;
1318 default:
1319 bp->b_qindex = BQUEUE_CLEAN;
1320 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_CLEAN], bp, b_freelist);
1321 break;
1322 }
1323 }
1324
1325 /*
1326 * If B_INVAL, clear B_DELWRI. We've already placed the buffer
1327 * on the correct queue.
1328 */
1329 if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI))
1330 bundirty(bp);
1331
1332 /*
1333 * Fixup numfreebuffers count. The bp is on an appropriate queue
1334 * unless locked. We then bump numfreebuffers if it is not B_DELWRI.
1335 * We've already handled the B_INVAL case ( B_DELWRI will be clear
1336 * if B_INVAL is set ).
1337 */
1338 if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0)
1339 bufcountwakeup();
1340
1341 /*
1342 * Something we can maybe free or reuse
1343 */
1344 if (bp->b_bufsize || bp->b_kvasize)
1345 bufspacewakeup();
1346
1347 /*
1348 * Clean up temporary flags and unlock the buffer.
1349 */
1350 bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF |
1351 B_DIRECT | B_NOWDRAIN);
1352 BUF_UNLOCK(bp);
1353 crit_exit();
1354}
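
/*
 * Illustrative sketch of the common release patterns.  bqrelse() keeps
 * the data cached for reuse; setting B_NOCACHE before brelse() destroys
 * both the buffer and its backing store, while B_INVAL destroys the
 * buffer but leaves the backing store intact (see the comments above):
 *
 *      bqrelse(bp);                    -- expect to need the data again
 *
 *      bp->b_flags |= B_NOCACHE;
 *      brelse(bp);                     -- discard buffer and backing store
 */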
1355
1356/*
1357 * bqrelse:
1358 *
1359 * Release a buffer back to the appropriate queue but do not try to free
1360 * it. The buffer is expected to be used again soon.
1361 *
1362 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
1363 * biodone() to requeue an async I/O on completion. It is also used when
1364 * known good buffers need to be requeued but we think we may need the data
1365 * again soon.
1366 *
1367 * XXX we should be able to leave the B_RELBUF hint set on completion.
1368 */
1369void
1370bqrelse(struct buf *bp)
1371{
1372 crit_enter();
1373
1374 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1375
1376 if (bp->b_qindex != BQUEUE_NONE)
1377 panic("bqrelse: free buffer onto another queue???");
1378 if (BUF_REFCNTNB(bp) > 1) {
1379 /* do not release to free list */
1380 panic("bqrelse: multiple refs");
1381 BUF_UNLOCK(bp);
1382 crit_exit();
1383 return;
1384 }
1385 if (bp->b_flags & B_LOCKED) {
1386 /*
1387 * Locked buffers are released to the locked queue. However,
1388 * if the buffer is dirty it will first go into the dirty
1389 * queue and later on after the I/O completes successfully it
1390 * will be released to the locked queue.
1391 */
1392 bp->b_flags &= ~B_ERROR;
1393 bp->b_qindex = BQUEUE_LOCKED;
1394 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_LOCKED], bp, b_freelist);
1395 } else if (bp->b_flags & B_DELWRI) {
1396 bp->b_qindex = (bp->b_flags & B_HEAVY) ?
1397 BQUEUE_DIRTY_HW : BQUEUE_DIRTY;
1398 TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
1399 } else if (vm_page_count_severe()) {
1400 /*
1401 * We are too low on memory, we have to try to free the
1402 * buffer (most importantly: the wired pages making up its
1403 * backing store) *now*.
1404 */
1405 crit_exit();
1406 brelse(bp);
1407 return;
1408 } else {
1409 bp->b_qindex = BQUEUE_CLEAN;
1410 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_CLEAN], bp, b_freelist);
1411 }
1412
1413 if ((bp->b_flags & B_LOCKED) == 0 &&
1414 ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0)) {
1415 bufcountwakeup();
1416 }
1417
1418 /*
1419 * Something we can maybe free or reuse.
1420 */
1421 if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
1422 bufspacewakeup();
1423
1424 /*
1425 * Final cleanup and unlock. Clear bits that are only used while a
1426 * buffer is actively locked.
1427 */
1428 bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1429 BUF_UNLOCK(bp);
1430 crit_exit();
1431}
1432
1433/*
1434 * vfs_vmio_release:
1435 *
1436 * Return backing pages held by the buffer 'bp' back to the VM system
1437 * if possible. The pages are freed if they are no longer valid, an
1438 * attempt is made to free them if the buffer was used for direct I/O,
1439 * and otherwise they are sent to the page cache.
1440 *
1441 * Pages that were marked busy are left alone and skipped.
1442 *
1443 * The KVA mapping (b_data) for the underlying pages is removed by
1444 * this function.
1445 */
1446static void
1447vfs_vmio_release(struct buf *bp)
1448{
1449 int i;
1450 vm_page_t m;
1451
1452 crit_enter();
1453 for (i = 0; i < bp->b_xio.xio_npages; i++) {
1454 m = bp->b_xio.xio_pages[i];
1455 bp->b_xio.xio_pages[i] = NULL;
1456 /*
1457 * In order to keep page LRU ordering consistent, put
1458 * everything on the inactive queue.
1459 */
1460 vm_page_unwire(m, 0);
1461 /*
1462 * We don't mess with busy pages, it is
1463 * the responsibility of the process that
1464 * busied the pages to deal with them.
1465 */
1466 if ((m->flags & PG_BUSY) || (m->busy != 0))
1467 continue;
1468
1469 if (m->wire_count == 0) {
1470 vm_page_flag_clear(m, PG_ZERO);
1471 /*
1472 * Might as well free the page if we can and it has
1473 * no valid data. We also free the page if the
1474 * buffer was used for direct I/O.
1475 */
1476 if ((bp->b_flags & B_ASYNC) == 0 && !m->valid &&
1477 m->hold_count == 0) {
1478 vm_page_busy(m);
1479 vm_page_protect(m, VM_PROT_NONE);
1480 vm_page_free(m);
1481 } else if (bp->b_flags & B_DIRECT) {
1482 vm_page_try_to_free(m);
1483 } else if (vm_page_count_severe()) {
1484 vm_page_try_to_cache(m);
1485 }
1486 }
1487 }
1488 crit_exit();
1489 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_xio.xio_npages);
1490 if (bp->b_bufsize) {
1491 bufspacewakeup();
1492 bp->b_bufsize = 0;
1493 }
1494 bp->b_xio.xio_npages = 0;
1495 bp->b_flags &= ~B_VMIO;
1496 if (bp->b_vp)
1497 brelvp(bp);
1498}
1499
1500/*
1501 * vfs_bio_awrite:
1502 *
1503 * Implement clustered async writes for clearing out B_DELWRI buffers.
1504 * This is much better than the old way of writing only one buffer at
1505 * a time. Note that we may not be presented with the buffers in the
1506 * correct order, so we search for the cluster in both directions.
1507 *
1508 * The buffer is locked on call.
1509 */
1510int
1511vfs_bio_awrite(struct buf *bp)
1512{
1513 int i;
1514 int j;
1515 off_t loffset = bp->b_loffset;
1516 struct vnode *vp = bp->b_vp;
1517 int nbytes;
1518 struct buf *bpa;
1519 int nwritten;
1520 int size;
1521
1522 crit_enter();
1523 /*
1524 * right now we support clustered writing only to regular files. If
1525 * we find a clusterable block we could be in the middle of a cluster
1526 * rather than at the beginning.
1527 *
1528 * NOTE: b_bio1 contains the logical loffset and is aliased
1529 * to b_loffset. b_bio2 contains the translated block number.
1530 */
1531 if ((vp->v_type == VREG) &&
1532 (vp->v_mount != 0) && /* Only on nodes that have the size info */
1533 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
1534
1535 size = vp->v_mount->mnt_stat.f_iosize;
1536
1537 for (i = size; i < MAXPHYS; i += size) {
1538 if ((bpa = findblk(vp, loffset + i)) &&
1539 BUF_REFCNT(bpa) == 0 &&
1540 ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
1541 (B_DELWRI | B_CLUSTEROK)) &&
1542 (bpa->b_bufsize == size)) {
1543 if ((bpa->b_bio2.bio_offset == NOOFFSET) ||
1544 (bpa->b_bio2.bio_offset !=
1545 bp->b_bio2.bio_offset + i))
1546 break;
1547 } else {
1548 break;
1549 }
1550 }
1551 for (j = size; i + j <= MAXPHYS && j <= loffset; j += size) {
1552 if ((bpa = findblk(vp, loffset - j)) &&
1553 BUF_REFCNT(bpa) == 0 &&
1554 ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
1555 (B_DELWRI | B_CLUSTEROK)) &&
1556 (bpa->b_bufsize == size)) {
1557 if ((bpa->b_bio2.bio_offset == NOOFFSET) ||
1558 (bpa->b_bio2.bio_offset !=
1559 bp->b_bio2.bio_offset - j))
1560 break;
1561 } else {
1562 break;
1563 }
1564 }
1565 j -= size;
1566 nbytes = (i + j);
1567 /*
1568 * this is a possible cluster write
1569 */
1570 if (nbytes != size) {
1571 BUF_UNLOCK(bp);
1572 nwritten = cluster_wbuild(vp, size,
1573 loffset - j, nbytes);
1574 crit_exit();
1575 return nwritten;
1576 }
1577 }
1578
1579 bremfree(bp);
1580 bp->b_flags |= B_ASYNC;
1581
1582 crit_exit();
1583 /*
1584 * default (old) behavior, writing out only one block
1585 *
1586 * XXX returns b_bufsize instead of b_bcount for nwritten?
1587 */
1588 nwritten = bp->b_bufsize;
1589 bwrite(bp);
1590
1591 return nwritten;
1592}
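
/*
 * Illustrative sketch of how the flushing path drives this routine
 * (flushbufqueues() is the real consumer): a dirty buffer is located and
 * locked, then handed to vfs_bio_awrite(), which removes it from its
 * queue and starts the write:
 *
 *      bp = TAILQ_FIRST(&bufqueues[BQUEUE_DIRTY]);
 *      if (bp && BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0)
 *              vfs_bio_awrite(bp);
 */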
1593
1594/*
1595 * getnewbuf:
1596 *
1597 * Find and initialize a new buffer header, freeing up existing buffers
1598 * in the bufqueues as necessary. The new buffer is returned locked.
1599 *
1600 * Important: B_INVAL is not set. If the caller wishes to throw the
1601 * buffer away, the caller must set B_INVAL prior to calling brelse().
1602 *
1603 * We block if:
1604 * We have insufficient buffer headers
1605 * We have insufficient buffer space
1606 * buffer_map is too fragmented ( space reservation fails )
1607 * If we have to flush dirty buffers ( but we try to avoid this )
1608 *
1609 * To avoid VFS layer recursion we do not flush dirty buffers ourselves.
1610 * Instead we ask the buf daemon to do it for us. We attempt to
1611 * avoid piecemeal wakeups of the pageout daemon.
1612 */
1613
1614static struct buf *
1615getnewbuf(int blkflags, int slptimeo, int size, int maxsize)
1616{
1617 struct buf *bp;
1618 struct buf *nbp;
1619 int defrag = 0;
1620 int nqindex;
1621 int slpflags = (blkflags & GETBLK_PCATCH) ? PCATCH : 0;
1622 static int flushingbufs;
1623
1624 /*
1625 * We can't afford to block since we might be holding a vnode lock,
1626 * which may prevent system daemons from running. We deal with
1627 * low-memory situations by proactively returning memory and running
1628 * async I/O rather than sync I/O.
1629 */
1630
1631 ++getnewbufcalls;
1632 --getnewbufrestarts;
1633restart:
1634 ++getnewbufrestarts;
1635
1636 /*
1637 * Setup for scan. If we do not have enough free buffers,
1638 * we set up a degenerate case that immediately fails. Note
1639 * that if we are a specially marked process, we are allowed to
1640 * dip into our reserves.
1641 *
1642 * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
1643 *
1644 * We start with EMPTYKVA. If the list is empty we backup to EMPTY.
1645 * However, there are a number of cases (defragging, reusing, ...)
1646 * where we cannot backup.
1647 */
1648 nqindex = BQUEUE_EMPTYKVA;
1649 nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTYKVA]);
1650
1651 if (nbp == NULL) {
1652 /*
1653 * If no EMPTYKVA buffers and we are either
1654 * defragging or reusing, locate a CLEAN buffer
1655 * to free or reuse. If bufspace usage is low
1656 * skip this step so we can allocate a new buffer.
1657 */
1658 if (defrag || bufspace >= lobufspace) {
1659 nqindex = BQUEUE_CLEAN;
1660 nbp = TAILQ_FIRST(&bufqueues[BQUEUE_CLEAN]);
1661 }
1662
1663 /*
1664 * If we could not find or were not allowed to reuse a
1665 * CLEAN buffer, check to see if it is ok to use an EMPTY
1666 * buffer. We can only use an EMPTY buffer if allocating
1667 * its KVA would not otherwise run us out of buffer space.
1668 */
1669 if (nbp == NULL && defrag == 0 &&
1670 bufspace + maxsize < hibufspace) {
1671 nqindex = BQUEUE_EMPTY;
1672 nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTY]);
1673 }
1674 }
1675
1676 /*
1677 * Run scan, possibly freeing data and/or kva mappings on the fly
1678 * depending.
1679 */
1680
1681 while ((bp = nbp) != NULL) {
1682 int qindex = nqindex;
1683
1684 /*
1685 * Calculate next bp ( we can only use it if we do not block
1686 * or do other fancy things ).
1687 */
1688 if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
1689 switch(qindex) {
1690 case BQUEUE_EMPTY:
1691 nqindex = BQUEUE_EMPTYKVA;
1692 if ((nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTYKVA])))
1693 break;
1694 /* fall through */
1695 case BQUEUE_EMPTYKVA:
1696 nqindex = BQUEUE_CLEAN;
1697 if ((nbp = TAILQ_FIRST(&bufqueues[BQUEUE_CLEAN])))
1698 break;
1699 /* fall through */
1700 case BQUEUE_CLEAN:
1701 /*
1702 * nbp is NULL.
1703 */
1704 break;
1705 }
1706 }
1707
1708 /*
1709 * Sanity Checks
1710 */
1711 KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
1712
1713 /*
1714 * Note: we no longer distinguish between VMIO and non-VMIO
1715 * buffers.
1716 */
1717
1718 KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
1719
1720 /*
1721 * If we are defragging then we need a buffer with
1722 * b_kvasize != 0. XXX this situation should no longer
1723 * occur, if defrag is non-zero the buffer's b_kvasize
1724 * should also be non-zero at this point. XXX
1725 */
1726 if (defrag && bp->b_kvasize == 0) {
1727 kprintf("Warning: defrag empty buffer %p\n", bp);
1728 continue;
1729 }
1730
1731 /*
1732 * Start freeing the bp. This is somewhat involved. nbp
1733 * remains valid only for BQUEUE_EMPTY[KVA] bp's. Buffers
1734 * on the clean list must be disassociated from their
1735 * current vnode. Buffers on the empty[kva] lists have
1736 * already been disassociated.
1737 */
1738
1739 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
1740 kprintf("getnewbuf: warning, locked buf %p, race corrected\n", bp);
1741 tsleep(&bd_request, 0, "gnbxxx", hz / 100);
1742 goto restart;
1743 }
1744 if (bp->b_qindex != qindex) {
1745 kprintf("getnewbuf: warning, BUF_LOCK blocked unexpectedly on buf %p index %d->%d, race corrected\n", bp, qindex, bp->b_qindex);
1746 BUF_UNLOCK(bp);
1747 goto restart;
1748 }
1749 bremfree(bp);
1750
1751 /*
1752 * Dependencies must be handled before we disassociate the
1753 * vnode.
1754 *
1755 * NOTE: HAMMER will set B_LOCKED if the buffer cannot
1756 * be immediately disassociated. HAMMER then becomes
1757 * responsible for releasing the buffer.
1758 */
1759 if (LIST_FIRST(&bp->b_dep) != NULL) {
1760 buf_deallocate(bp);
1761 if (bp->b_flags & B_LOCKED) {
1762 bqrelse(bp);
1763 goto restart;
1764 }
1765 KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
1766 }
1767
1768 if (qindex == BQUEUE_CLEAN) {
1769 if (bp->b_flags & B_VMIO) {
1770 bp->b_flags &= ~B_ASYNC;
1771 vfs_vmio_release(bp);
1772 }
1773 if (bp->b_vp)
1774 brelvp(bp);
1775 }
1776
1777 /*
1778 * NOTE: nbp is now entirely invalid. We can only restart
1779 * the scan from this point on.
1780 *
1781 * Get the rest of the buffer freed up. b_kva* is still
1782 * valid after this operation.
1783 */
1784
1785 KASSERT(bp->b_vp == NULL, ("bp3 %p flags %08x vnode %p qindex %d unexpectedly still associated!", bp, bp->b_flags, bp->b_vp, qindex));
1786 KKASSERT((bp->b_flags & B_HASHED) == 0);
1787
1788 /*
1789 * critical section protection is not required when
1790 * scrapping a buffer's contents because it is already
1791 * wired.
1792 */
1793 if (bp->b_bufsize)
1794 allocbuf(bp, 0);
1795
1796 bp->b_flags = B_BNOCLIP;
1797 bp->b_cmd = BUF_CMD_DONE;
1798 bp->b_vp = NULL;
1799 bp->b_error = 0;
1800 bp->b_resid = 0;
1801 bp->b_bcount = 0;
1802 bp->b_xio.xio_npages = 0;
1803 bp->b_dirtyoff = bp->b_dirtyend = 0;
1804 reinitbufbio(bp);
1805 buf_dep_init(bp);
1806 if (blkflags & GETBLK_BHEAVY)
1807 bp->b_flags |= B_HEAVY;
1808
1809 /*
1810 * If we are defragging then free the buffer.
1811 */
1812 if (defrag) {
1813 bp->b_flags |= B_INVAL;
1814 bfreekva(bp);
1815 brelse(bp);
1816 defrag = 0;
1817 goto restart;
1818 }
1819
1820 /*
1821 * If we are overcommitted then recover the buffer and its
1822 * KVM space. This occurs in rare situations when multiple
1823 * processes are blocked in getnewbuf() or allocbuf().
1824 */
1825 if (bufspace >= hibufspace)
1826 flushingbufs = 1;
1827 if (flushingbufs && bp->b_kvasize != 0) {
1828 bp->b_flags |= B_INVAL;
1829 bfreekva(bp);
1830 brelse(bp);
1831 goto restart;
1832 }
1833 if (bufspace < lobufspace)
1834 flushingbufs = 0;
1835 break;
1836 }
1837
1838 /*
1839 * If we exhausted our list, sleep as appropriate. We may have to
1840 * wakeup various daemons and write out some dirty buffers.
1841 *
1842 * Generally we are sleeping due to insufficient buffer space.
1843 */
1844
1845 if (bp == NULL) {
1846 int flags;
1847 char *waitmsg;
1848
1849 if (defrag) {
1850 flags = VFS_BIO_NEED_BUFSPACE;
1851 waitmsg = "nbufkv";
1852 } else if (bufspace >= hibufspace) {
1853 waitmsg = "nbufbs";
1854 flags = VFS_BIO_NEED_BUFSPACE;
1855 } else {
1856 waitmsg = "newbuf";
1857 flags = VFS_BIO_NEED_ANY;
1858 }
1859
1860 needsbuffer |= flags;
1861 bd_speedup(); /* heeeelp */
1862 while (needsbuffer & flags) {
1863 if (tsleep(&needsbuffer, slpflags, waitmsg, slptimeo))
1864 return (NULL);
1865 }
1866 } else {
1867 /*
1868 * We finally have a valid bp. We aren't quite out of the
1869		 * woods; we still have to reserve kva space. In order
1870 * to keep fragmentation sane we only allocate kva in
1871 * BKVASIZE chunks.
1872 */
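		/*
		 * Illustration (assumption: BKVASIZE is a power of two, e.g.
		 * 16K so that BKVAMASK == 0x3fff): a maxsize request of 9000
		 * bytes rounds up to 16384 in the statement below, keeping
		 * every KVA reservation a uniform multiple of BKVASIZE.
		 */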
1873 maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
1874
1875 if (maxsize != bp->b_kvasize) {
1876 vm_offset_t addr = 0;
1877 int count;
1878
1879 bfreekva(bp);
1880
1881 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1882 vm_map_lock(&buffer_map);
1883
1884 if (vm_map_findspace(&buffer_map,
1885 vm_map_min(&buffer_map), maxsize,
1886 maxsize, &addr)) {
1887 /*
1888 * Uh oh. Buffer map is too fragmented. We
1889 * must defragment the map.
1890 */
1891 vm_map_unlock(&buffer_map);
1892 vm_map_entry_release(count);
1893 ++bufdefragcnt;
1894 defrag = 1;
1895 bp->b_flags |= B_INVAL;
1896 brelse(bp);
1897 goto restart;
1898 }
1899 if (addr) {
1900 vm_map_insert(&buffer_map, &count,
1901 NULL, 0,
1902 addr, addr + maxsize,
1903 VM_MAPTYPE_NORMAL,
1904 VM_PROT_ALL, VM_PROT_ALL,
1905 MAP_NOFAULT);
1906
1907 bp->b_kvabase = (caddr_t) addr;
1908 bp->b_kvasize = maxsize;
1909 bufspace += bp->b_kvasize;
1910 ++bufreusecnt;
1911 }
1912 vm_map_unlock(&buffer_map);
1913 vm_map_entry_release(count);
1914 }
1915 bp->b_data = bp->b_kvabase;
1916 }
1917 return(bp);
1918}
1919
1920/*
1921 * buf_daemon:
1922 *
1923 * Buffer flushing daemon. Buffers are normally flushed by the
1924 * update daemon but if it cannot keep up this process starts to
1925 * take the load in an attempt to prevent getnewbuf() from blocking.
1926 *
1927 * Once a flush is initiated it does not stop until the number
1928 * of buffers falls below lodirtybuffers, but we will wake up anyone
1929 * waiting at the mid-point.
1930 */
1931
1932static struct thread *bufdaemon_td;
1933static struct thread *bufdaemonhw_td;
1934
1935static struct kproc_desc buf_kp = {
1936 "bufdaemon",
1937 buf_daemon,
1938 &bufdaemon_td
1939};
1940SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST,
1941 kproc_start, &buf_kp)
1942
1943static struct kproc_desc bufhw_kp = {
1944 "bufdaemon_hw",
1945 buf_daemon_hw,
1946 &bufdaemonhw_td
1947};
1948SYSINIT(bufdaemon_hw, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST,
1949 kproc_start, &bufhw_kp)
1950
1951static void
1952buf_daemon(void)
1953{
1954 /*
1955 * This process needs to be suspended prior to shutdown sync.
1956 */
1957 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
1958 bufdaemon_td, SHUTDOWN_PRI_LAST);
1959
1960 /*
1961 * This process is allowed to take the buffer cache to the limit
1962 */
1963 crit_enter();
1964
1965 for (;;) {
1966 kproc_suspend_loop();
1967
1968 /*
1969 * Do the flush. Limit the amount of in-transit I/O we
1970 * allow to build up, otherwise we would completely saturate
1971 * the I/O system. Wakeup any waiting processes before we
1972 * normally would so they can run in parallel with our drain.
1973 */
1974 while (numdirtybuffers > lodirtybuffers) {
1975 if (flushbufqueues(BQUEUE_DIRTY) == 0)
1976 break;
1977 waitrunningbufspace();
1978 numdirtywakeup();
1979 }
1980 numdirtywakeup();
1981
1982 /*
1983 * Only clear bd_request if we have reached our low water
1984 * mark. The buf_daemon normally waits 5 seconds and
1985 * then incrementally flushes any dirty buffers that have
1986 * built up, within reason.
1987 *
1988 * If we were unable to hit our low water mark and couldn't
1989 * find any flushable buffers, we sleep half a second.
1990 * Otherwise we loop immediately.
1991 */
1992 if (numdirtybuffers <= lodirtybuffers) {
1993 /*
1994 * We reached our low water mark, reset the
1995 * request and sleep until we are needed again.
1996 * The sleep is just so the suspend code works.
1997 */
1998 spin_lock_wr(&needsbuffer_spin);
1999 bd_request = 0;
2000 msleep(&bd_request, &needsbuffer_spin, 0,
2001 "psleep", hz);
2002 spin_unlock_wr(&needsbuffer_spin);
2003 } else {
2004 /*
2005 * We couldn't find any flushable dirty buffers but
2006 * still have too many dirty buffers, we
2007 * have to sleep and try again. (rare)
2008 */
2009 tsleep(&bd_request, 0, "qsleep", hz / 2);
2010 }
2011 }
2012}
2013
2014static void
2015buf_daemon_hw(void)
2016{
2017 /*
2018 * This process needs to be suspended prior to shutdown sync.
2019 */
2020 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
2021 bufdaemonhw_td, SHUTDOWN_PRI_LAST);
2022
2023 /*
2024 * This process is allowed to take the buffer cache to the limit
2025 */
2026 crit_enter();
2027
2028 for (;;) {
2029 kproc_suspend_loop();
2030
2031 /*
2032 * Do the flush. Limit the amount of in-transit I/O we
2033 * allow to build up, otherwise we would completely saturate
2034 * the I/O system. Wakeup any waiting processes before we
2035 * normally would so they can run in parallel with our drain.
2036 */
2037 while (numdirtybuffershw > lodirtybuffers) {
2038 if (flushbufqueues(BQUEUE_DIRTY_HW) == 0)
2039 break;
2040 waitrunningbufspace();
2041 numdirtywakeup();
2042 }
2043
2044 /*
2045 * Only clear bd_request if we have reached our low water
2046 * mark. The buf_daemon normally waits 5 seconds and
2047 * then incrementally flushes any dirty buffers that have
2048 * built up, within reason.
2049 *
2050 * If we were unable to hit our low water mark and couldn't
2051 * find any flushable buffers, we sleep half a second.
2052 * Otherwise we loop immediately.
2053 */
2054 if (numdirtybuffershw <= lodirtybuffers) {
2055 /*
2056 * We reached our low water mark, reset the
2057 * request and sleep until we are needed again.
2058 * The sleep is just so the suspend code works.
2059 */
2060 spin_lock_wr(&needsbuffer_spin);
2061 bd_request_hw = 0;
2062 msleep(&bd_request_hw, &needsbuffer_spin, 0,
2063 "psleep", hz);
2064 spin_unlock_wr(&needsbuffer_spin);
2065 } else {
2066 /*
2067 * We couldn't find any flushable dirty buffers but
2068 * still have too many dirty buffers, we
2069 * have to sleep and try again. (rare)
2070 */
2071 tsleep(&bd_request_hw, 0, "qsleep", hz / 2);
2072 }
2073 }
2074}
2075
2076/*
2077 * flushbufqueues:
2078 *
2079 * Try to flush a buffer in the dirty queue. We must be careful to
2080 * free up B_INVAL buffers instead of write them, which NFS is
2081 * free up B_INVAL buffers instead of writing them, which NFS is
2082 */
2083
2084static int
2085flushbufqueues(bufq_type_t q)
2086{
2087 struct buf *bp;
2088 int r = 0;
2089
2090 bp = TAILQ_FIRST(&bufqueues[q]);
2091
2092 while (bp) {
2093 KASSERT((bp->b_flags & B_DELWRI),
2094 ("unexpected clean buffer %p", bp));
2095 if (bp->b_flags & B_DELWRI) {
2096 if (bp->b_flags & B_INVAL) {
2097 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
2098 panic("flushbufqueues: locked buf");
2099 bremfree(bp);
2100 brelse(bp);
2101 ++r;
2102 break;
2103 }
2104 if (LIST_FIRST(&bp->b_dep) != NULL &&
2105 (bp->b_flags & B_DEFERRED) == 0 &&
2106 buf_countdeps(bp, 0)) {
2107 TAILQ_REMOVE(&bufqueues[q], bp, b_freelist);
2108 TAILQ_INSERT_TAIL(&bufqueues[q], bp,
2109 b_freelist);
2110 bp->b_flags |= B_DEFERRED;
2111 bp = TAILQ_FIRST(&bufqueues[q]);
2112 continue;
2113 }
2114
2115 /*
2116 * Only write it out if we can successfully lock
2117			 * it. If the buffer has a dependency,
2118 * buf_checkwrite must also return 0.
2119 */
2120 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
2121 if (LIST_FIRST(&bp->b_dep) != NULL &&
2122 buf_checkwrite(bp)) {
2123 bremfree(bp);
2124 brelse(bp);
2125 } else {
2126 vfs_bio_awrite(bp);
2127 }
2128 ++r;
2129 break;
2130 }
2131 }
2132 bp = TAILQ_NEXT(bp, b_freelist);
2133 }
2134 return (r);
2135}
2136
2137/*
2138 * inmem:
2139 *
2140 * Returns true if no I/O is needed to access the associated VM object.
2141 * This is like findblk except it also hunts around in the VM system for
2142 * the data.
2143 *
2144 * Note that we ignore vm_page_free() races from interrupts against our
2145 * lookup, since if the caller is not protected our return value will not
2146 * be any more valid than otherwise once we exit the critical section.
2147 */
2148int
2149inmem(struct vnode *vp, off_t loffset)
2150{
2151 vm_object_t obj;
2152 vm_offset_t toff, tinc, size;
2153 vm_page_t m;
2154
2155 if (findblk(vp, loffset))
2156 return 1;
2157 if (vp->v_mount == NULL)
2158 return 0;
2159 if ((obj = vp->v_object) == NULL)
2160 return 0;
2161
2162 size = PAGE_SIZE;
2163 if (size > vp->v_mount->mnt_stat.f_iosize)
2164 size = vp->v_mount->mnt_stat.f_iosize;
2165
2166 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
2167 m = vm_page_lookup(obj, OFF_TO_IDX(loffset + toff));
2168 if (m == NULL)
2169 return 0;
2170 tinc = size;
2171 if (tinc > PAGE_SIZE - ((toff + loffset) & PAGE_MASK))
2172 tinc = PAGE_SIZE - ((toff + loffset) & PAGE_MASK);
2173 if (vm_page_is_valid(m,
2174 (vm_offset_t) ((toff + loffset) & PAGE_MASK), tinc) == 0)
2175 return 0;
2176 }
2177 return 1;
2178}
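/*
 * Illustrative (hypothetical) caller of inmem(): a read-ahead heuristic
 * could skip scheduling I/O for a block that is already resident, e.g.
 *
 *	if (!inmem(vp, next_loffset))
 *		(schedule read-ahead for next_loffset)
 *
 * This is only a sketch; actual read-ahead policy lives in the cluster
 * code, not in this file.
 */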
2179
2180/*
2181 * vfs_setdirty:
2182 *
2183 * Sets the dirty range for a buffer based on the status of the dirty
2184 * bits in the pages comprising the buffer.
2185 *
2186 * The range is limited to the size of the buffer.
2187 *
2188 * This routine is primarily used by NFS, but is generalized for the
2189 * B_VMIO case.
2190 */
2191static void
2192vfs_setdirty(struct buf *bp)
2193{
2194 int i;
2195 vm_object_t object;
2196
2197 /*
2198 * Degenerate case - empty buffer
2199 */
2200
2201 if (bp->b_bufsize == 0)
2202 return;
2203
2204 /*
2205 * We qualify the scan for modified pages on whether the
2206 * object has been flushed yet. The OBJ_WRITEABLE flag
2207 * is not cleared simply by protecting pages off.
2208 */
2209
2210 if ((bp->b_flags & B_VMIO) == 0)
2211 return;
2212
2213 object = bp->b_xio.xio_pages[0]->object;
2214
2215 if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY))
2216 kprintf("Warning: object %p writeable but not mightbedirty\n", object);
2217 if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY))
2218 kprintf("Warning: object %p mightbedirty but not writeable\n", object);
2219
2220 if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
2221 vm_offset_t boffset;
2222 vm_offset_t eoffset;
2223
2224 /*
2225 * test the pages to see if they have been modified directly
2226 * by users through the VM system.
2227 */
2228 for (i = 0; i < bp->b_xio.xio_npages; i++) {
2229 vm_page_flag_clear(bp->b_xio.xio_pages[i], PG_ZERO);
2230 vm_page_test_dirty(bp->b_xio.xio_pages[i]);
2231 }
2232
2233 /*
2234 * Calculate the encompassing dirty range, boffset and eoffset,
2235 * (eoffset - boffset) bytes.
2236 */
2237
2238 for (i = 0; i < bp->b_xio.xio_npages; i++) {
2239 if (bp->b_xio.xio_pages[i]->dirty)
2240 break;
2241 }
2242 boffset = (i << PAGE_SHIFT) - (bp->b_loffset & PAGE_MASK);
2243
2244 for (i = bp->b_xio.xio_npages - 1; i >= 0; --i) {
2245 if (bp->b_xio.xio_pages[i]->dirty) {
2246 break;
2247 }
2248 }
2249 eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_loffset & PAGE_MASK);
2250
2251 /*
2252 * Fit it to the buffer.
2253 */
2254
2255 if (eoffset > bp->b_bcount)
2256 eoffset = bp->b_bcount;
2257
2258 /*
2259 * If we have a good dirty range, merge with the existing
2260 * dirty range.
2261 */
2262
2263 if (boffset < eoffset) {
2264 if (bp->b_dirtyoff > boffset)
2265 bp->b_dirtyoff = boffset;
2266 if (bp->b_dirtyend < eoffset)
2267 bp->b_dirtyend = eoffset;
2268 }
2269 }
2270}
2271
2272/*
2273 * findblk:
2274 *
2275 * Locate and return the specified buffer, or NULL if the buffer does
2276 * not exist. Do not attempt to lock the buffer or manipulate it in
2277 * any way. The caller must validate that the correct buffer has been
2278 * obtained after locking it.
2279 */
2280struct buf *
2281findblk(struct vnode *vp, off_t loffset)
2282{
2283 struct buf *bp;
2284
2285 crit_enter();
2286 bp = buf_rb_hash_RB_LOOKUP(&vp->v_rbhash_tree, loffset);
2287 crit_exit();
2288 return(bp);
2289}
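/*
 * Illustrative lock-and-revalidate pattern for findblk() callers
 * (hypothetical sketch mirroring what getblk() does below): the returned
 * buffer is not locked, so it must be locked and then re-checked against
 * the (vp, loffset) pair to guard against a recycling race.
 *
 *	if ((bp = findblk(vp, loffset)) != NULL) {
 *		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
 *			if (bp->b_vp != vp || bp->b_loffset != loffset) {
 *				BUF_UNLOCK(bp);
 *				bp = NULL;
 *			}
 *		} else {
 *			bp = NULL;
 *		}
 *	}
 */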
2290
2291/*
2292 * getblk:
2293 *
2294 * Get a block given a specified block and offset into a file/device.
2295 * B_INVAL may or may not be set on return. The caller should clear
2296 * B_INVAL prior to initiating a READ.
2297 *
2298 * IT IS IMPORTANT TO UNDERSTAND THAT IF YOU CALL GETBLK() AND B_CACHE
2299 * IS NOT SET, YOU MUST INITIALIZE THE RETURNED BUFFER, ISSUE A READ,
2300 * OR SET B_INVAL BEFORE RETIRING IT. If you retire a getblk'd buffer
2301 * without doing any of those things the system will likely believe
2302 * the buffer to be valid (especially if it is not B_VMIO), and the
2303 * next getblk() will return the buffer with B_CACHE set.
2304 *
2305 * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
2306 * an existing buffer.
2307 *
2308 * For a VMIO buffer, B_CACHE is modified according to the backing VM.
2309 * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
2310 * and then cleared based on the backing VM. If the previous buffer is
2311 * non-0-sized but invalid, B_CACHE will be cleared.
2312 *
2313 * If getblk() must create a new buffer, the new buffer is returned with
2314 * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
2315 * case it is returned with B_INVAL clear and B_CACHE set based on the
2316 * backing VM.
2317 *
2318 * getblk() also forces a bwrite() for any B_DELWRI buffer whose
2319 * B_CACHE bit is clear.
2320 *
2321 * What this means, basically, is that the caller should use B_CACHE to
2322 * determine whether the buffer is fully valid or not and should clear
2323 * B_INVAL prior to issuing a read. If the caller intends to validate
2324 * the buffer by loading its data area with something, the caller needs
2325 * to clear B_INVAL. If the caller does this without issuing an I/O,
2326 * the caller should set B_CACHE ( as an optimization ), else the caller
2327 * should issue the I/O and biodone() will set B_CACHE if the I/O was
2328 * a write attempt or if it was a successful read. If the caller
2329 * intends to issue a READ, the caller must clear B_INVAL and B_ERROR
2330 * prior to issuing the READ. biodone() will *not* clear B_INVAL.
2331 *
2332 * getblk flags:
2333 *
2334 * GETBLK_PCATCH - catch signal if blocked, can cause NULL return
2335 * GETBLK_BHEAVY - heavy-weight buffer cache buffer
2336 */
2337struct buf *
2338getblk(struct vnode *vp, off_t loffset, int size, int blkflags, int slptimeo)
2339{
2340 struct buf *bp;
2341 int slpflags = (blkflags & GETBLK_PCATCH) ? PCATCH : 0;
2342
2343 if (size > MAXBSIZE)
2344 panic("getblk: size(%d) > MAXBSIZE(%d)", size, MAXBSIZE);
2345 if (vp->v_object == NULL)
2346 panic("getblk: vnode %p has no object!", vp);
2347
2348 crit_enter();
2349loop:
2350 if ((bp = findblk(vp, loffset))) {
2351 /*
2352 * The buffer was found in the cache, but we need to lock it.
2353 * Even with LK_NOWAIT the lockmgr may break our critical
2354 * section, so double-check the validity of the buffer
2355 * once the lock has been obtained.
2356 */
2357 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
2358 int lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL;
2359 if (blkflags & GETBLK_PCATCH)
2360 lkflags |= LK_PCATCH;
2361 if (BUF_TIMELOCK(bp, lkflags, "getblk", slptimeo) ==
2362 ENOLCK) {
2363 goto loop;
2364 }
2365 crit_exit();
2366 return (NULL);
2367 }
2368
2369 /*
2370 * Once the buffer has been locked, make sure we didn't race
2371		 * a buffer recycling operation. Buffers that are no longer hashed
2372 * will have b_vp == NULL, so this takes care of that check
2373 * as well.
2374 */
2375 if (bp->b_vp != vp || bp->b_loffset != loffset) {
2376 kprintf("Warning buffer %p (vp %p loffset %lld) was recycled\n", bp, vp, loffset);
2377 BUF_UNLOCK(bp);
2378 goto loop;
2379 }
2380
2381 /*
2382 * All vnode-based buffers must be backed by a VM object.
2383 */
2384 KKASSERT(bp->b_flags & B_VMIO);
2385 KKASSERT(bp->b_cmd == BUF_CMD_DONE);
2386
2387 /*
2388 * Make sure that B_INVAL buffers do not have a cached
2389 * block number translation.
2390 */
2391 if ((bp->b_flags & B_INVAL) && (bp->b_bio2.bio_offset != NOOFFSET)) {
2392 kprintf("Warning invalid buffer %p (vp %p loffset %lld) did not have cleared bio_offset cache\n", bp, vp, loffset);
2393 clearbiocache(&bp->b_bio2);
2394 }
2395
2396 /*
2397 * The buffer is locked. B_CACHE is cleared if the buffer is
2398 * invalid.
2399 */
2400 if (bp->b_flags & B_INVAL)
2401 bp->b_flags &= ~B_CACHE;
2402 bremfree(bp);
2403
2404 /*
2405		 * Any size inconsistency with a dirty buffer or a buffer
2406		 * with a softupdates dependency must be resolved. Resizing
2407 * the buffer in such circumstances can lead to problems.
2408 */
2409 if (size != bp->b_bcount) {
2410 if (bp->b_flags & B_DELWRI) {
2411 bp->b_flags |= B_NOCACHE;
2412 bwrite(bp);
2413 } else if (LIST_FIRST(&bp->b_dep)) {
2414 bp->b_flags |= B_NOCACHE;
2415 bwrite(bp);
2416 } else {
2417 bp->b_flags |= B_RELBUF;
2418 brelse(bp);
2419 }
2420 goto loop;
2421 }
2422 KKASSERT(size <= bp->b_kvasize);
2423 KASSERT(bp->b_loffset != NOOFFSET,
2424 ("getblk: no buffer offset"));
2425
2426 /*
2427 * A buffer with B_DELWRI set and B_CACHE clear must
2428 * be committed before we can return the buffer in
2429 * order to prevent the caller from issuing a read
2430 * ( due to B_CACHE not being set ) and overwriting
2431 * it.
2432 *
2433 * Most callers, including NFS and FFS, need this to
2434 * operate properly either because they assume they
2435 * can issue a read if B_CACHE is not set, or because
2436 * ( for example ) an uncached B_DELWRI might loop due
2437 * to softupdates re-dirtying the buffer. In the latter
2438 * case, B_CACHE is set after the first write completes,
2439 * preventing further loops.
2440 *
2441 * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE
2442 * above while extending the buffer, we cannot allow the
2443 * buffer to remain with B_CACHE set after the write
2444 * completes or it will represent a corrupt state. To
2445 * deal with this we set B_NOCACHE to scrap the buffer
2446 * after the write.
2447 *
2448 * We might be able to do something fancy, like setting
2449 * B_CACHE in bwrite() except if B_DELWRI is already set,
2450 * so the below call doesn't set B_CACHE, but that gets real
2451 * confusing. This is much easier.
2452 */
2453
2454 if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
2455 bp->b_flags |= B_NOCACHE;
2456 bwrite(bp);
2457 goto loop;
2458 }
2459 crit_exit();
2460 } else {
2461 /*
2462 * Buffer is not in-core, create new buffer. The buffer
2463 * returned by getnewbuf() is locked. Note that the returned
2464 * buffer is also considered valid (not marked B_INVAL).
2465 *
2466 * Calculating the offset for the I/O requires figuring out
2467 * the block size. We use DEV_BSIZE for VBLK or VCHR and
2468 * the mount's f_iosize otherwise. If the vnode does not
2469 * have an associated mount we assume that the passed size is
2470 * the block size.
2471 *
2472 * Note that vn_isdisk() cannot be used here since it may
2473 * return a failure for numerous reasons. Note that the
2474		 * buffer size may be larger than the block size (the caller
2475 * will use block numbers with the proper multiple). Beware
2476 * of using any v_* fields which are part of unions. In
2477 * particular, in DragonFly the mount point overloading
2478 * mechanism uses the namecache only and the underlying
2479 * directory vnode is not a special case.
2480 */
2481 int bsize, maxsize;
2482
2483 /*
2484 * Don't let heavy weight buffers deadlock us.
2485 */
2486 if ((blkflags & GETBLK_BHEAVY) &&
2487 numdirtybuffershw > hidirtybuffers) {
2488 while (numdirtybuffershw > hidirtybuffers) {
2489 needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
2490 tsleep(&needsbuffer, slpflags, "newbuf",
2491 slptimeo);
2492 }
2493 goto loop;
2494 }
2495
2496 if (vp->v_type == VBLK || vp->v_type == VCHR)
2497 bsize = DEV_BSIZE;
2498 else if (vp->v_mount)
2499 bsize = vp->v_mount->mnt_stat.f_iosize;
2500 else
2501 bsize = size;
2502
2503 maxsize = size + (loffset & PAGE_MASK);
2504 maxsize = imax(maxsize, bsize);
2505
2506 if ((bp = getnewbuf(blkflags, slptimeo, size, maxsize)) == NULL) {
2507 if (slpflags || slptimeo) {
2508 crit_exit();
2509 return NULL;
2510 }
2511 goto loop;
2512 }
2513
2514 /*
2515 * This code is used to make sure that a buffer is not
2516 * created while the getnewbuf routine is blocked.
2517 * This can be a problem whether the vnode is locked or not.
2518 * If the buffer is created out from under us, we have to
2519 * throw away the one we just created. There is no window
2520 * race because we are safely running in a critical section
2521 * from the point of the duplicate buffer creation through
2522 * to here, and we've locked the buffer.
2523 */
2524 if (findblk(vp, loffset)) {
2525 bp->b_flags |= B_INVAL;
2526 brelse(bp);
2527 goto loop;
2528 }
2529
2530 /*
2531 * Insert the buffer into the hash, so that it can
2532 * be found by findblk().
2533 *
2534 * Make sure the translation layer has been cleared.
2535 */
2536 bp->b_loffset = loffset;
2537 bp->b_bio2.bio_offset = NOOFFSET;
2538 /* bp->b_bio2.bio_next = NULL; */
2539
2540 bgetvp(vp, bp);
2541
2542 /*
2543 * All vnode-based buffers must be backed by a VM object.
2544 */
2545 KKASSERT(vp->v_object != NULL);
2546 bp->b_flags |= B_VMIO;
2547 KKASSERT(bp->b_cmd == BUF_CMD_DONE);
2548
2549 allocbuf(bp, size);
2550
2551 crit_exit();
2552 }
2553 return (bp);
2554}
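/*
 * Illustrative (hypothetical) read path built on getblk(), following the
 * B_CACHE rules documented above.  This is only a sketch of the usual
 * bread()-style sequence and assumes b_bio1 is the buffer's logical BIO:
 *
 *	bp = getblk(vp, loffset, bsize, 0, 0);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		bp->b_flags &= ~(B_INVAL | B_ERROR);
 *		bp->b_cmd = BUF_CMD_READ;
 *		vfs_busy_pages(vp, bp);
 *		vn_strategy(vp, &bp->b_bio1);
 *		error = biowait(bp);
 *	}
 */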
2555
2556/*
2557 * regetblk(bp)
2558 *
2559 * Reacquire a buffer that was previously released to the locked queue,
2560 * or reacquire a buffer which is interlocked by having bioops->io_deallocate
2561 * set B_LOCKED (which handles the acquisition race).
2562 *
2563 * To this end, either B_LOCKED must be set or the dependency list must be
2564 * non-empty.
2565 */
2566void
2567regetblk(struct buf *bp)
2568{
2569 KKASSERT((bp->b_flags & B_LOCKED) || LIST_FIRST(&bp->b_dep) != NULL);
2570 BUF_LOCK(bp, LK_EXCLUSIVE | LK_RETRY);
2571 crit_enter();
2572 bremfree(bp);
2573 crit_exit();
2574}
2575
2576/*
2577 * geteblk:
2578 *
2579 * Get an empty, disassociated buffer of given size. The buffer is
2580 * initially set to B_INVAL.
2581 *
2582 * critical section protection is not required for the allocbuf()
2583 * call because races are impossible here.
2584 */
2585struct buf *
2586geteblk(int size)
2587{
2588 struct buf *bp;
2589 int maxsize;
2590
2591 maxsize = (size + BKVAMASK) & ~BKVAMASK;
2592
2593 crit_enter();
2594 while ((bp = getnewbuf(0, 0, size, maxsize)) == 0)
2595 ;
2596 crit_exit();
2597 allocbuf(bp, size);
2598 bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
2599 return (bp);
2600}
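/*
 * Illustrative (hypothetical) use of geteblk(): grab an anonymous scratch
 * buffer, use its data area, and release it with brelse() when done.  The
 * buffer is returned B_INVAL so it is not re-cached on release.
 *
 *	bp = geteblk(8192);
 *	bcopy(src, bp->b_data, 8192);
 *	...
 *	brelse(bp);
 */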
2601
2602
2603/*
2604 * allocbuf:
2605 *
2606 * This code constitutes the buffer memory from either anonymous system
2607 * memory (in the case of non-VMIO operations) or from an associated
2608 * VM object (in the case of VMIO operations). This code is able to
2609 * resize a buffer up or down.
2610 *
2611 * Note that this code is tricky, and has many complications to resolve
2612 * deadlock or inconsistent data situations. Tread lightly!!!
2613 * There are B_CACHE and B_DELWRI interactions that must be dealt with by
2614 * the caller. Calling this code willy-nilly can result in the loss of data.
2615 *
2616 * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with
2617 * B_CACHE for the non-VMIO case.
2618 *
2619 * This routine does not need to be called from a critical section but you
2620 * must own the buffer.
2621 */
2622int
2623allocbuf(struct buf *bp, int size)
2624{
2625 int newbsize, mbsize;
2626 int i;
2627
2628 if (BUF_REFCNT(bp) == 0)
2629 panic("allocbuf: buffer not busy");
2630
2631 if (bp->b_kvasize < size)
2632 panic("allocbuf: buffer too small");
2633
2634 if ((bp->b_flags & B_VMIO) == 0) {
2635 caddr_t origbuf;
2636 int origbufsize;
2637 /*
2638 * Just get anonymous memory from the kernel. Don't
2639 * mess with B_CACHE.
2640 */
2641 mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
2642 if (bp->b_flags & B_MALLOC)
2643 newbsize = mbsize;
2644 else
2645 newbsize = round_page(size);
2646
2647 if (newbsize < bp->b_bufsize) {
2648 /*
2649 * Malloced buffers are not shrunk
2650 */
2651 if (bp->b_flags & B_MALLOC) {
2652 if (newbsize) {
2653 bp->b_bcount = size;
2654 } else {
2655 kfree(bp->b_data, M_BIOBUF);
2656 if (bp->b_bufsize) {
2657 bufmallocspace -= bp->b_bufsize;
2658 bufspacewakeup();
2659 bp->b_bufsize = 0;
2660 }
2661 bp->b_data = bp->b_kvabase;
2662 bp->b_bcount = 0;
2663 bp->b_flags &= ~B_MALLOC;
2664 }
2665 return 1;
2666 }
2667 vm_hold_free_pages(
2668 bp,
2669 (vm_offset_t) bp->b_data + newbsize,
2670 (vm_offset_t) bp->b_data + bp->b_bufsize);
2671 } else if (newbsize > bp->b_bufsize) {
2672 /*
2673			 * We only use malloced memory on the first allocation,
2674 * and revert to page-allocated memory when the buffer
2675 * grows.
2676 */
2677 if ((bufmallocspace < maxbufmallocspace) &&
2678 (bp->b_bufsize == 0) &&
2679 (mbsize <= PAGE_SIZE/2)) {
2680
2681 bp->b_data = kmalloc(mbsize, M_BIOBUF, M_WAITOK);
2682 bp->b_bufsize = mbsize;
2683 bp->b_bcount = size;
2684 bp->b_flags |= B_MALLOC;
2685 bufmallocspace += mbsize;
2686 return 1;
2687 }
2688 origbuf = NULL;
2689 origbufsize = 0;
2690 /*
2691 * If the buffer is growing on its other-than-first
2692 * allocation, then we revert to the page-allocation
2693 * scheme.
2694 */
2695 if (bp->b_flags & B_MALLOC) {
2696 origbuf = bp->b_data;
2697 origbufsize = bp->b_bufsize;
2698 bp->b_data = bp->b_kvabase;
2699 if (bp->b_bufsize) {
2700 bufmallocspace -= bp->b_bufsize;
2701 bufspacewakeup();
2702 bp->b_bufsize = 0;
2703 }
2704 bp->b_flags &= ~B_MALLOC;
2705 newbsize = round_page(newbsize);
2706 }
2707 vm_hold_load_pages(
2708 bp,
2709 (vm_offset_t) bp->b_data + bp->b_bufsize,
2710 (vm_offset_t) bp->b_data + newbsize);
2711 if (origbuf) {
2712 bcopy(origbuf, bp->b_data, origbufsize);
2713 kfree(origbuf, M_BIOBUF);
2714 }
2715 }
2716 } else {
2717 vm_page_t m;
2718 int desiredpages;
2719
2720 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
2721 desiredpages = ((int)(bp->b_loffset & PAGE_MASK) +
2722 newbsize + PAGE_MASK) >> PAGE_SHIFT;
2723 KKASSERT(desiredpages <= XIO_INTERNAL_PAGES);
2724
2725 if (bp->b_flags & B_MALLOC)
2726 panic("allocbuf: VMIO buffer can't be malloced");
2727 /*
2728 * Set B_CACHE initially if buffer is 0 length or will become
2729 * 0-length.
2730 */
2731 if (size == 0 || bp->b_bufsize == 0)
2732 bp->b_flags |= B_CACHE;
2733
2734 if (newbsize < bp->b_bufsize) {
2735 /*
2736			 * DEV_BSIZE aligned new buffer size is less than the
2737 * DEV_BSIZE aligned existing buffer size. Figure out
2738 * if we have to remove any pages.
2739 */
2740 if (desiredpages < bp->b_xio.xio_npages) {
2741 for (i = desiredpages; i < bp->b_xio.xio_npages; i++) {
2742 /*
2743 * the page is not freed here -- it
2744 * is the responsibility of
2745 * vnode_pager_setsize
2746 */
2747 m = bp->b_xio.xio_pages[i];
2748 KASSERT(m != bogus_page,
2749 ("allocbuf: bogus page found"));
2750 while (vm_page_sleep_busy(m, TRUE, "biodep"))
2751 ;
2752
2753 bp->b_xio.xio_pages[i] = NULL;
2754 vm_page_unwire(m, 0);
2755 }
2756 pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
2757 (desiredpages << PAGE_SHIFT), (bp->b_xio.xio_npages - desiredpages));
2758 bp->b_xio.xio_npages = desiredpages;
2759 }
2760 } else if (size > bp->b_bcount) {
2761 /*
2762 * We are growing the buffer, possibly in a
2763 * byte-granular fashion.
2764 */
2765 struct vnode *vp;
2766 vm_object_t obj;
2767 vm_offset_t toff;
2768 vm_offset_t tinc;
2769
2770 /*
2771 * Step 1, bring in the VM pages from the object,
2772 * allocating them if necessary. We must clear
2773 * B_CACHE if these pages are not valid for the
2774 * range covered by the buffer.
2775 *
2776 * critical section protection is required to protect
2777 * against interrupts unbusying and freeing pages
2778 * between our vm_page_lookup() and our
2779 * busycheck/wiring call.
2780 */
2781 vp = bp->b_vp;
2782 obj = vp->v_object;
2783
2784 crit_enter();
2785 while (bp->b_xio.xio_npages < desiredpages) {
2786 vm_page_t m;
2787 vm_pindex_t pi;
2788
2789 pi = OFF_TO_IDX(bp->b_loffset) + bp->b_xio.xio_npages;
2790 if ((m = vm_page_lookup(obj, pi)) == NULL) {
2791 /*
2792 * note: must allocate system pages
2793					 * since blocking here could interfere
2794 * with paging I/O, no matter which
2795 * process we are.
2796 */
2797 m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM);
2798 if (m == NULL) {
2799 vm_wait();
2800 vm_pageout_deficit += desiredpages -
2801 bp->b_xio.xio_npages;
2802 } else {
2803 vm_page_wire(m);
2804 vm_page_wakeup(m);
2805 bp->b_flags &= ~B_CACHE;
2806 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
2807 ++bp->b_xio.xio_npages;
2808 }
2809 continue;
2810 }
2811
2812 /*
2813 * We found a page. If we have to sleep on it,
2814 * retry because it might have gotten freed out
2815 * from under us.
2816 *
2817 * We can only test PG_BUSY here. Blocking on
2818 * m->busy might lead to a deadlock:
2819 *
2820 * vm_fault->getpages->cluster_read->allocbuf
2821 *
2822 */
2823
2824 if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
2825 continue;
2826
2827 /*
2828 * We have a good page. Should we wakeup the
2829 * page daemon?
2830 */
2831 if ((curthread != pagethread) &&
2832 ((m->queue - m->pc) == PQ_CACHE) &&
2833 ((vmstats.v_free_count + vmstats.v_cache_count) <
2834 (vmstats.v_free_min + vmstats.v_cache_min))) {
2835 pagedaemon_wakeup();
2836 }
2837 vm_page_flag_clear(m, PG_ZERO);
2838 vm_page_wire(m);
2839 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
2840 ++bp->b_xio.xio_npages;
2841 }
2842 crit_exit();
2843
2844 /*
2845 * Step 2. We've loaded the pages into the buffer,
2846 * we have to figure out if we can still have B_CACHE
2847 * set. Note that B_CACHE is set according to the
2848 * byte-granular range ( bcount and size ), not the
2849 * aligned range ( newbsize ).
2850 *
2851 * The VM test is against m->valid, which is DEV_BSIZE
2852 * aligned. Needless to say, the validity of the data
2853 * needs to also be DEV_BSIZE aligned. Note that this
2854 * fails with NFS if the server or some other client
2855 * extends the file's EOF. If our buffer is resized,
2856 * B_CACHE may remain set! XXX
2857 */
2858
2859 toff = bp->b_bcount;
2860 tinc = PAGE_SIZE - ((bp->b_loffset + toff) & PAGE_MASK);
2861
2862 while ((bp->b_flags & B_CACHE) && toff < size) {
2863 vm_pindex_t pi;
2864
2865 if (tinc > (size - toff))
2866 tinc = size - toff;
2867
2868 pi = ((bp->b_loffset & PAGE_MASK) + toff) >>
2869 PAGE_SHIFT;
2870
2871 vfs_buf_test_cache(
2872 bp,
2873 bp->b_loffset,
2874 toff,
2875 tinc,
2876 bp->b_xio.xio_pages[pi]
2877 );
2878 toff += tinc;
2879 tinc = PAGE_SIZE;
2880 }
2881
2882 /*
2883 * Step 3, fixup the KVM pmap. Remember that
2884 * bp->b_data is relative to bp->b_loffset, but
2885 * bp->b_loffset may be offset into the first page.
2886 */
2887
2888 bp->b_data = (caddr_t)
2889 trunc_page((vm_offset_t)bp->b_data);
2890 pmap_qenter(
2891 (vm_offset_t)bp->b_data,
2892 bp->b_xio.xio_pages,
2893 bp->b_xio.xio_npages
2894 );
2895 bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
2896 (vm_offset_t)(bp->b_loffset & PAGE_MASK));
2897 }
2898 }
2899 if (newbsize < bp->b_bufsize)
2900 bufspacewakeup();
2901 bp->b_bufsize = newbsize; /* actual buffer allocation */
2902 bp->b_bcount = size; /* requested buffer size */
2903 return 1;
2904}
2905
2906/*
2907 * biowait:
2908 *
2909 * Wait for buffer I/O completion, returning error status. The buffer
2910 * is left locked on return. B_EINTR is converted into an EINTR error
2911 * and cleared.
2912 *
2913 * NOTE! The original b_cmd is lost on return, since b_cmd will be
2914 * set to BUF_CMD_DONE.
2915 */
2916int
2917biowait(struct buf *bp)
2918{
2919 crit_enter();
2920 while (bp->b_cmd != BUF_CMD_DONE) {
2921 if (bp->b_cmd == BUF_CMD_READ)
2922 tsleep(bp, 0, "biord", 0);
2923 else
2924 tsleep(bp, 0, "biowr", 0);
2925 }
2926 crit_exit();
2927 if (bp->b_flags & B_EINTR) {
2928 bp->b_flags &= ~B_EINTR;
2929 return (EINTR);
2930 }
2931 if (bp->b_flags & B_ERROR) {
2932 return (bp->b_error ? bp->b_error : EIO);
2933 } else {
2934 return (0);
2935 }
2936}
2937
2938/*
2939 * This associates a tracking count with an I/O. vn_strategy() and
2940 * dev_dstrategy() do this automatically but there are a few cases
2941 * where a vnode or device layer is bypassed when a block translation
2942 * is cached. In such cases bio_start_transaction() may be called on
2943 * the bypassed layers so the system gets an I/O in progress indication
2944 * for those higher layers.
2945 */
2946void
2947bio_start_transaction(struct bio *bio, struct bio_track *track)
2948{
2949 bio->bio_track = track;
2950 atomic_add_int(&track->bk_active, 1);
2951}
2952
2953/*
2954 * Initiate I/O on a vnode.
2955 */
2956void
2957vn_strategy(struct vnode *vp, struct bio *bio)
2958{
2959 struct bio_track *track;
2960
2961 KKASSERT(bio->bio_buf->b_cmd != BUF_CMD_DONE);
2962 if (bio->bio_buf->b_cmd == BUF_CMD_READ)
2963 track = &vp->v_track_read;
2964 else
2965 track = &vp->v_track_write;
2966 bio->bio_track = track;
2967 atomic_add_int(&track->bk_active, 1);
2968 vop_strategy(*vp->v_ops, vp, bio);
2969}
2970
2971
2972/*
2973 * biodone:
2974 *
2975 * Finish I/O on a buffer, optionally calling a completion function.
2976 * This is usually called from an interrupt so process blocking is
2977 * not allowed.
2978 *
2979 * biodone is also responsible for setting B_CACHE in a B_VMIO bp.
2980 * In a non-VMIO bp, B_CACHE will be set on the next getblk()
2981 * assuming B_INVAL is clear.
2982 *
2983 * For the VMIO case, we set B_CACHE if the op was a read and no
2984 * read error occurred, or if the op was a write. B_CACHE is never
2985 * set if the buffer is invalid or otherwise uncacheable.
2986 *
2987 * biodone does not mess with B_INVAL, allowing the I/O routine or the
2988 * initiator to leave B_INVAL set to brelse the buffer out of existence
2989 * in the biodone routine.
2990 */
2991void
2992biodone(struct bio *bio)
2993{
2994 struct buf *bp = bio->bio_buf;
2995 buf_cmd_t cmd;
2996
2997 crit_enter();
2998
2999 KASSERT(BUF_REFCNTNB(bp) > 0,
3000 ("biodone: bp %p not busy %d", bp, BUF_REFCNTNB(bp)));
3001 KASSERT(bp->b_cmd != BUF_CMD_DONE,
3002 ("biodone: bp %p already done!", bp));
3003
3004 runningbufwakeup(bp);
3005
3006 /*
3007 * Run up the chain of BIO's. Leave b_cmd intact for the duration.
3008 */
3009 while (bio) {
3010 biodone_t *done_func;
3011 struct bio_track *track;
3012
3013 /*
3014 * BIO tracking. Most but not all BIOs are tracked.
3015 */
3016 if ((track = bio->bio_track) != NULL) {
3017 atomic_subtract_int(&track->bk_active, 1);
3018 if (track->bk_active < 0) {
3019 panic("biodone: bad active count bio %p\n",
3020 bio);
3021 }
3022 if (track->bk_waitflag) {
3023 track->bk_waitflag = 0;
3024 wakeup(track);
3025 }
3026 bio->bio_track = NULL;
3027 }
3028
3029 /*
3030 * A bio_done function terminates the loop. The function
3031 * will be responsible for any further chaining and/or
3032 * buffer management.
3033 *
3034 * WARNING! The done function can deallocate the buffer!
3035 */
3036 if ((done_func = bio->bio_done) != NULL) {
3037 bio->bio_done = NULL;
3038 done_func(bio);
3039 crit_exit();
3040 return;
3041 }
3042 bio = bio->bio_prev;
3043 }
3044
3045 cmd = bp->b_cmd;
3046 bp->b_cmd = BUF_CMD_DONE;
3047
3048 /*
3049 * Only reads and writes are processed past this point.
3050 */
3051 if (cmd != BUF_CMD_READ && cmd != BUF_CMD_WRITE) {
3052 brelse(bp);
3053 crit_exit();
3054 return;
3055 }
3056
3057 /*
3058 * Warning: softupdates may re-dirty the buffer.
3059 */
3060 if (LIST_FIRST(&bp->b_dep) != NULL)
3061 buf_complete(bp);
3062
3063 if (bp->b_flags & B_VMIO) {
3064 int i;
3065 vm_ooffset_t foff;
3066 vm_page_t m;
3067 vm_object_t obj;
3068 int iosize;
3069 struct vnode *vp = bp->b_vp;
3070
3071 obj = vp->v_object;
3072
3073#if defined(VFS_BIO_DEBUG)
3074 if (vp->v_auxrefs == 0)
3075 panic("biodone: zero vnode hold count");
3076 if ((vp->v_flag & VOBJBUF) == 0)
3077 panic("biodone: vnode is not setup for merged cache");
3078#endif
3079
3080 foff = bp->b_loffset;
3081 KASSERT(foff != NOOFFSET, ("biodone: no buffer offset"));
3082 KASSERT(obj != NULL, ("biodone: missing VM object"));
3083
3084#if defined(VFS_BIO_DEBUG)
3085 if (obj->paging_in_progress < bp->b_xio.xio_npages) {
3086 kprintf("biodone: paging in progress(%d) < bp->b_xio.xio_npages(%d)\n",
3087 obj->paging_in_progress, bp->b_xio.xio_npages);
3088 }
3089#endif
3090
3091 /*
3092 * Set B_CACHE if the op was a normal read and no error
3093		 * occurred. B_CACHE is set for writes in the b*write()
3094 * routines.
3095 */
3096 iosize = bp->b_bcount - bp->b_resid;
3097 if (cmd == BUF_CMD_READ && (bp->b_flags & (B_INVAL|B_NOCACHE|B_ERROR)) == 0) {
3098 bp->b_flags |= B_CACHE;
3099 }
3100
3101 for (i = 0; i < bp->b_xio.xio_npages; i++) {
3102 int bogusflag = 0;
3103 int resid;
3104
3105 resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
3106 if (resid > iosize)
3107 resid = iosize;
3108
3109 /*
3110 * cleanup bogus pages, restoring the originals. Since
3111 * the originals should still be wired, we don't have
3112 * to worry about interrupt/freeing races destroying
3113 * the VM object association.
3114 */
3115 m = bp->b_xio.xio_pages[i];
3116 if (m == bogus_page) {
3117 bogusflag = 1;
3118 m = vm_page_lookup(obj, OFF_TO_IDX(foff));
3119 if (m == NULL)
3120 panic("biodone: page disappeared");
3121 bp->b_xio.xio_pages[i] = m;
3122 pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
3123 bp->b_xio.xio_pages, bp->b_xio.xio_npages);
3124 }
3125#if defined(VFS_BIO_DEBUG)
3126 if (OFF_TO_IDX(foff) != m->pindex) {
3127 kprintf(
3128"biodone: foff(%lu)/m->pindex(%d) mismatch\n",
3129 (unsigned long)foff, m->pindex);
3130 }
3131#endif
3132
3133 /*
3134 * In the write case, the valid and clean bits are
3135 * already changed correctly ( see bdwrite() ), so we
3136 * only need to do this here in the read case.
3137 */
3138 if (cmd == BUF_CMD_READ && !bogusflag && resid > 0) {
3139 vfs_page_set_valid(bp, foff, i, m);
3140 }
3141 vm_page_flag_clear(m, PG_ZERO);
3142
3143 /*
3144			 * When debugging new filesystems or buffer I/O methods, this
3145			 * is the most common error that pops up. If you see this, you
3146			 * have not set the page busy flag correctly!!!
3147 */
3148 if (m->busy == 0) {
3149 kprintf("biodone: page busy < 0, "
3150 "pindex: %d, foff: 0x(%x,%x), "
3151 "resid: %d, index: %d\n",
3152 (int) m->pindex, (int)(foff >> 32),
3153 (int) foff & 0xffffffff, resid, i);
3154 if (!vn_isdisk(vp, NULL))
3155 kprintf(" iosize: %ld, loffset: %lld, flags: 0x%08x, npages: %d\n",
3156 bp->b_vp->v_mount->mnt_stat.f_iosize,
3157 bp->b_loffset,
3158 bp->b_flags, bp->b_xio.xio_npages);
3159 else
3160 kprintf(" VDEV, loffset: %lld, flags: 0x%08x, npages: %d\n",
3161 bp->b_loffset,
3162 bp->b_flags, bp->b_xio.xio_npages);
3163 kprintf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
3164 m->valid, m->dirty, m->wire_count);
3165 panic("biodone: page busy < 0");
3166 }
3167 vm_page_io_finish(m);
3168 vm_object_pip_subtract(obj, 1);
3169 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3170 iosize -= resid;
3171 }
3172 if (obj)
3173 vm_object_pip_wakeupn(obj, 0);
3174 }
3175
3176 /*
3177 * For asynchronous completions, release the buffer now. The brelse
3178 * will do a wakeup there if necessary - so no need to do a wakeup
3179 * here in the async case. The sync case always needs to do a wakeup.
3180 */
3181
3182 if (bp->b_flags & B_ASYNC) {
3183 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
3184 brelse(bp);
3185 else
3186 bqrelse(bp);
3187 } else {
3188 wakeup(bp);
3189 }
3190 crit_exit();
3191}
3192
3193/*
3194 * vfs_unbusy_pages:
3195 *
3196 * This routine is called in lieu of iodone in the case of
3197 * incomplete I/O. This keeps the busy status for pages
3198 *	consistent.
3199 */
3200void
3201vfs_unbusy_pages(struct buf *bp)
3202{
3203 int i;
3204
3205 runningbufwakeup(bp);
3206 if (bp->b_flags & B_VMIO) {
3207 struct vnode *vp = bp->b_vp;
3208 vm_object_t obj;
3209
3210 obj = vp->v_object;
3211
3212 for (i = 0; i < bp->b_xio.xio_npages; i++) {
3213 vm_page_t m = bp->b_xio.xio_pages[i];
3214
3215 /*
3216			 * When restoring bogus changes, the original pages
3217 * should still be wired, so we are in no danger of
3218 * losing the object association and do not need
3219 * critical section protection particularly.
3220 */
3221 if (m == bogus_page) {
3222 m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_loffset) + i);
3223 if (!m) {
3224 panic("vfs_unbusy_pages: page missing");
3225 }
3226 bp->b_xio.xio_pages[i] = m;
3227 pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
3228 bp->b_xio.xio_pages, bp->b_xio.xio_npages);
3229 }
3230 vm_object_pip_subtract(obj, 1);
3231 vm_page_flag_clear(m, PG_ZERO);
3232 vm_page_io_finish(m);
3233 }
3234 vm_object_pip_wakeupn(obj, 0);
3235 }
3236}
3237
3238/*
3239 * vfs_page_set_valid:
3240 *
3241 * Set the valid bits in a page based on the supplied offset. The
3242 * range is restricted to the buffer's size.
3243 *
3244 * This routine is typically called after a read completes.
3245 */
3246static void
3247vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
3248{
3249 vm_ooffset_t soff, eoff;
3250
3251 /*
3252 * Start and end offsets in buffer. eoff - soff may not cross a
3253	 * page boundary or cross the end of the buffer. The end of the
3254 * buffer, in this case, is our file EOF, not the allocation size
3255 * of the buffer.
3256 */
3257 soff = off;
3258 eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3259 if (eoff > bp->b_loffset + bp->b_bcount)
3260 eoff = bp->b_loffset + bp->b_bcount;
3261
3262 /*
3263 * Set valid range. This is typically the entire buffer and thus the
3264 * entire page.
3265 */
3266 if (eoff > soff) {
3267 vm_page_set_validclean(
3268 m,
3269 (vm_offset_t) (soff & PAGE_MASK),
3270 (vm_offset_t) (eoff - soff)
3271 );
3272 }
3273}
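/*
 * Worked example for vfs_page_set_valid() (assuming PAGE_SIZE == 4096):
 * with off == 0x11200, eoff first rounds up to the page boundary at
 * 0x12000.  If the buffer ends earlier, say at b_loffset + b_bcount ==
 * 0x11800, eoff is clipped to 0x11800 and only bytes 0x200-0x7ff of the
 * page are marked valid and clean.
 */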
3274
3275/*
3276 * vfs_busy_pages:
3277 *
3278 * This routine is called before a device strategy routine.
3279 * It is used to tell the VM system that paging I/O is in
3280 * progress, and treat the pages associated with the buffer
3281 * almost as being PG_BUSY. Also the object 'paging_in_progress'
3282 * flag is handled to make sure that the object doesn't become
3283 *	inconsistent.
3284 *
3285 * Since I/O has not been initiated yet, certain buffer flags
3286 *	such as B_ERROR or B_INVAL may be in an inconsistent state
3287 * and should be ignored.
3288 */
3289void
3290vfs_busy_pages(struct vnode *vp, struct buf *bp)
3291{
3292 int i, bogus;
3293 struct lwp *lp = curthread->td_lwp;
3294
3295 /*
3296 * The buffer's I/O command must already be set. If reading,
3297 * B_CACHE must be 0 (double check against callers only doing
3298 * I/O when B_CACHE is 0).
3299 */
3300 KKASSERT(bp->b_cmd != BUF_CMD_DONE);
3301 KKASSERT(bp->b_cmd == BUF_CMD_WRITE || (bp->b_flags & B_CACHE) == 0);
3302
3303 if (bp->b_flags & B_VMIO) {
3304 vm_object_t obj;
3305 vm_ooffset_t foff;
3306
3307 obj = vp->v_object;
3308 foff = bp->b_loffset;
3309 KASSERT(bp->b_loffset != NOOFFSET,
3310 ("vfs_busy_pages: no buffer offset"));
3311 vfs_setdirty(bp);
3312
3313retry:
3314 for (i = 0; i < bp->b_xio.xio_npages; i++) {
3315 vm_page_t m = bp->b_xio.xio_pages[i];
3316 if (vm_page_sleep_busy(m, FALSE, "vbpage"))
3317 goto retry;
3318 }
3319
3320 bogus = 0;
3321 for (i = 0; i < bp->b_xio.xio_npages; i++) {
3322 vm_page_t m = bp->b_xio.xio_pages[i];
3323
3324 vm_page_flag_clear(m, PG_ZERO);
3325 if ((bp->b_flags & B_CLUSTER) == 0) {
3326 vm_object_pip_add(obj, 1);
3327 vm_page_io_start(m);
3328 }
3329
3330 /*
3331 * When readying a vnode-backed buffer for a write
3332 * we must zero-fill any invalid portions of the
3333 * backing VM pages.
3334 *
3335 * When readying a vnode-backed buffer for a read
3336 * we must replace any dirty pages with a bogus
3337 * page so we do not destroy dirty data when
3338 * filling in gaps. Dirty pages might not
3339 * necessarily be marked dirty yet, so use m->valid
3340 * as a reasonable test.
3341 *
3342 * Bogus page replacement is, uh, bogus. We need
3343 * to find a better way.
3344 */
3345 vm_page_protect(m, VM_PROT_NONE);
3346 if (bp->b_cmd == BUF_CMD_WRITE) {
3347 vfs_page_set_valid(bp, foff, i, m);
3348 } else if (m->valid == VM_PAGE_BITS_ALL) {
3349 bp->b_xio.xio_pages[i] = bogus_page;
3350 bogus++;
3351 }
3352 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3353 }
3354 if (bogus)
3355 pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
3356 bp->b_xio.xio_pages, bp->b_xio.xio_npages);
3357 }
3358
3359 /*
3360 * This is the easiest place to put the process accounting for the I/O
3361 * for now.
3362 */
3363 if (lp != NULL) {
3364 if (bp->b_cmd == BUF_CMD_READ)
3365 lp->lwp_ru.ru_inblock++;
3366 else
3367 lp->lwp_ru.ru_oublock++;
3368 }
3369}
3370
3371/*
3372 * vfs_clean_pages:
3373 *
3374 * Tell the VM system that the pages associated with this buffer
3375 * are clean. This is used for delayed writes where the data is
3376 *	going to go to disk eventually without additional VM intervention.
3377 *
3378 * Note that while we only really need to clean through to b_bcount, we
3379 * just go ahead and clean through to b_bufsize.
3380 */
3381static void
3382vfs_clean_pages(struct buf *bp)
3383{
3384 int i;
3385
3386 if (bp->b_flags & B_VMIO) {
3387 vm_ooffset_t foff;
3388
3389 foff = bp->b_loffset;
3390 KASSERT(foff != NOOFFSET, ("vfs_clean_pages: no buffer offset"));
3391 for (i = 0; i < bp->b_xio.xio_npages; i++) {
3392 vm_page_t m = bp->b_xio.xio_pages[i];
3393 vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3394 vm_ooffset_t eoff = noff;
3395
3396 if (eoff > bp->b_loffset + bp->b_bufsize)
3397 eoff = bp->b_loffset + bp->b_bufsize;
3398 vfs_page_set_valid(bp, foff, i, m);
3399 /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
3400 foff = noff;
3401 }
3402 }
3403}
3404
3405/*
3406 * vfs_bio_set_validclean:
3407 *
3408 * Set the range within the buffer to valid and clean. The range is
3409 * relative to the beginning of the buffer, b_loffset. Note that
3410 * b_loffset itself may be offset from the beginning of the first page.
3411 */
3412
3413void
3414vfs_bio_set_validclean(struct buf *bp, int base, int size)
3415{
3416 if (bp->b_flags & B_VMIO) {
3417 int i;
3418 int n;
3419
3420 /*
3421 * Fixup base to be relative to beginning of first page.
3422 * Set initial n to be the maximum number of bytes in the
3423 * first page that can be validated.
3424 */
3425
3426 base += (bp->b_loffset & PAGE_MASK);
3427 n = PAGE_SIZE - (base & PAGE_MASK);
3428
3429 for (i = base / PAGE_SIZE; size > 0 && i < bp->b_xio.xio_npages; ++i) {
3430 vm_page_t m = bp->b_xio.xio_pages[i];
3431
3432 if (n > size)
3433 n = size;
3434
3435 vm_page_set_validclean(m, base & PAGE_MASK, n);
3436 base += n;
3437 size -= n;
3438 n = PAGE_SIZE;
3439 }
3440 }
3441}
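/*
 * Worked example for vfs_bio_set_validclean() (assuming PAGE_SIZE == 4096):
 * if (b_loffset & PAGE_MASK) == 0x200 and the caller passes base == 0x100,
 * base is fixed up to 0x300, so at most PAGE_SIZE - 0x300 == 0xd00 bytes
 * can be validated in the first page before the loop advances with a full
 * PAGE_SIZE quantum per page.
 */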
3442
3443/*
3444 * vfs_bio_clrbuf:
3445 *
3446 * Clear a buffer. This routine essentially fakes an I/O, so we need
3447 * to clear B_ERROR and B_INVAL.
3448 *
3449 * Note that while we only theoretically need to clear through b_bcount,
3450 * we go ahead and clear through b_bufsize.
3451 */
3452
3453void
3454vfs_bio_clrbuf(struct buf *bp)
3455{
3456 int i, mask = 0;
3457 caddr_t sa, ea;
3458 if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
3459 bp->b_flags &= ~(B_INVAL|B_ERROR);
3460 if ((bp->b_xio.xio_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
3461 (bp->b_loffset & PAGE_MASK) == 0) {
3462 mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
3463 if ((bp->b_xio.xio_pages[0]->valid & mask) == mask) {
3464 bp->b_resid = 0;
3465 return;
3466 }
3467 if (((bp->b_xio.xio_pages[0]->flags & PG_ZERO) == 0) &&
3468 ((bp->b_xio.xio_pages[0]->valid & mask) == 0)) {
3469 bzero(bp->b_data, bp->b_bufsize);
3470 bp->b_xio.xio_pages[0]->valid |= mask;
3471 bp->b_resid = 0;
3472 return;
3473 }
3474 }
3475 ea = sa = bp->b_data;
3476 for(i=0;i<bp->b_xio.xio_npages;i++,sa=ea) {
3477 int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;
3478 ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
3479 ea = (caddr_t)(vm_offset_t)ulmin(
3480 (u_long)(vm_offset_t)ea,
3481 (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
3482 mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
3483 if ((bp->b_xio.xio_pages[i]->valid & mask) == mask)
3484 continue;
3485 if ((bp->b_xio.xio_pages[i]->valid & mask) == 0) {
3486 if ((bp->b_xio.xio_pages[i]->flags & PG_ZERO) == 0) {
3487 bzero(sa, ea - sa);
3488 }
3489 } else {
3490 for (; sa < ea; sa += DEV_BSIZE, j++) {
3491 if (((bp->b_xio.xio_pages[i]->flags & PG_ZERO) == 0) &&
3492 (bp->b_xio.xio_pages[i]->valid & (1<<j)) == 0)
3493 bzero(sa, DEV_BSIZE);
3494 }
3495 }
3496 bp->b_xio.xio_pages[i]->valid |= mask;
3497 vm_page_flag_clear(bp->b_xio.xio_pages[i], PG_ZERO);
3498 }
3499 bp->b_resid = 0;
3500 } else {
3501 clrbuf(bp);
3502 }
3503}
3504
3505/*
3506 * vm_hold_load_pages:
3507 *
3508 * Load pages into the buffer's address space. The pages are
3509 * allocated from the kernel object in order to reduce interference
3510 *	with any VM paging I/O activity. The range of loaded
3511 * pages will be wired.
3512 *
3513 * If a page cannot be allocated, the 'pagedaemon' is woken up to
3514 * retrieve the full range (to - from) of pages.
3515 *
3516 */
3517void
3518vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
3519{
3520 vm_offset_t pg;
3521 vm_page_t p;
3522 int index;
3523
3524 to = round_page(to);
3525 from = round_page(from);
3526 index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
3527
3528 for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
3529
3530tryagain:
3531
3532 /*
3533 * Note: must allocate system pages since blocking here
3534		 * could interfere with paging I/O, no matter which
3535 * process we are.
3536 */
3537 p = vm_page_alloc(&kernel_object,
3538 (pg >> PAGE_SHIFT),
3539 VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM);
3540 if (!p) {
3541 vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
3542 vm_wait();
3543 goto tryagain;
3544 }
3545 vm_page_wire(p);
3546 p->valid = VM_PAGE_BITS_ALL;
3547 vm_page_flag_clear(p, PG_ZERO);
3548 pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
3549 bp->b_xio.xio_pages[index] = p;
3550 vm_page_wakeup(p);
3551 }
3552 bp->b_xio.xio_npages = index;
3553}
3554
3555/*
3556 * vm_hold_free_pages:
3557 *
3558 * Return pages associated with the buffer back to the VM system.
3559 *
3560 * The range of pages underlying the buffer's address space will
3561 * be unmapped and un-wired.
3562 */
3563void
3564vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
3565{
3566 vm_offset_t pg;
3567 vm_page_t p;
3568 int index, newnpages;
3569
3570 from = round_page(from);
3571 to = round_page(to);
3572 newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
3573
3574 for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
3575 p = bp->b_xio.xio_pages[index];
3576 if (p && (index < bp->b_xio.xio_npages)) {
3577 if (p->busy) {
3578 kprintf("vm_hold_free_pages: doffset: %lld, loffset: %lld\n",
3579 bp->b_bio2.bio_offset, bp->b_loffset);
3580 }
3581 bp->b_xio.xio_pages[index] = NULL;
3582 pmap_kremove(pg);
3583 vm_page_busy(p);
3584 vm_page_unwire(p, 0);
3585 vm_page_free(p);
3586 }
3587 }
3588 bp->b_xio.xio_npages = newnpages;
3589}
3590
3591/*
3592 * vmapbuf:
3593 *
3594 * Map a user buffer into KVM via a pbuf. On return the buffer's
3595 * b_data, b_bufsize, and b_bcount will be set, and its XIO page array
3596 * initialized.
3597 */
3598int
3599vmapbuf(struct buf *bp, caddr_t udata, int bytes)
3600{
3601 caddr_t addr;
3602 vm_offset_t va;
3603 vm_page_t m;
3604 int vmprot;
3605 int error;
3606 int pidx;
3607 int i;
3608
3609 /*
3610 * bp had better have a command and it better be a pbuf.
3611 */
3612 KKASSERT(bp->b_cmd != BUF_CMD_DONE);
3613 KKASSERT(bp->b_flags & B_PAGING);
3614
3615 if (bytes < 0)
3616 return (-1);
3617
3618 /*
3619 * Map the user data into KVM. Mappings have to be page-aligned.
3620 */
3621 addr = (caddr_t)trunc_page((vm_offset_t)udata);
3622 pidx = 0;
3623
3624 vmprot = VM_PROT_READ;
3625 if (bp->b_cmd == BUF_CMD_READ)
3626 vmprot |= VM_PROT_WRITE;
3627
3628 while (addr < udata + bytes) {
3629 /*
3630 * Do the vm_fault if needed; do the copy-on-write thing
3631 * when reading stuff off device into memory.
3632 *
3633 * vm_fault_page*() returns a held VM page.
3634 */
3635 va = (addr >= udata) ? (vm_offset_t)addr : (vm_offset_t)udata;
3636 va = trunc_page(va);
3637
3638 m = vm_fault_page_quick(va, vmprot, &error);
3639 if (m == NULL) {
3640 for (i = 0; i < pidx; ++i) {
3641 vm_page_unhold(bp->b_xio.xio_pages[i]);
3642 bp->b_xio.xio_pages[i] = NULL;
3643 }
3644 return(-1);
3645 }
3646 bp->b_xio.xio_pages[pidx] = m;
3647 addr += PAGE_SIZE;
3648 ++pidx;
3649 }
3650
3651 /*
3652 * Map the page array and set the buffer fields to point to
3653 * the mapped data buffer.
3654 */
3655 if (pidx > btoc(MAXPHYS))
3656 panic("vmapbuf: mapped more than MAXPHYS");
3657 pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_xio.xio_pages, pidx);
3658
3659 bp->b_xio.xio_npages = pidx;
3660 bp->b_data = bp->b_kvabase + ((int)(intptr_t)udata & PAGE_MASK);
3661 bp->b_bcount = bytes;
3662 bp->b_bufsize = bytes;
3663 return(0);
3664}
3665
3666/*
3667 * vunmapbuf:
3668 *
3669 * Free the io map PTEs associated with this IO operation.
3670 *	We also invalidate the TLB entries and restore the original b_data.
3671 */
3672void
3673vunmapbuf(struct buf *bp)
3674{
3675 int pidx;
3676 int npages;
3677
3678 KKASSERT(bp->b_flags & B_PAGING);
3679
3680 npages = bp->b_xio.xio_npages;
3681 pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
3682 for (pidx = 0; pidx < npages; ++pidx) {
3683 vm_page_unhold(bp->b_xio.xio_pages[pidx]);
3684 bp->b_xio.xio_pages[pidx] = NULL;
3685 }
3686 bp->b_xio.xio_npages = 0;
3687 bp->b_data = bp->b_kvabase;
3688}
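/*
 * Illustrative (hypothetical) pairing of vmapbuf()/vunmapbuf() for a
 * physio-style transfer.  getpbuf()/relpbuf() are assumed here to be the
 * usual pbuf allocation routines (they are not defined in this file), and
 * the sketch omits setting up the target offset and error handling:
 *
 *	bp = getpbuf(NULL);
 *	bp->b_cmd = BUF_CMD_READ;
 *	if (vmapbuf(bp, udata, bytes) == 0) {
 *		vn_strategy(vp, &bp->b_bio1);
 *		error = biowait(bp);
 *		vunmapbuf(bp);
 *	}
 *	relpbuf(bp, NULL);
 */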
3689
3690/*
3691 * Scan all buffers in the system and issue the callback.
3692 */
3693int
3694scan_all_buffers(int (*callback)(struct buf *, void *), void *info)
3695{
3696 int count = 0;
3697 int error;
3698 int n;
3699
3700 for (n = 0; n < nbuf; ++n) {
3701 if ((error = callback(&buf[n], info)) < 0) {
3702 count = error;
3703 break;
3704 }
3705 count += error;
3706 }
3707 return (count);
3708}
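/*
 * Illustrative (hypothetical) scan_all_buffers() callback: count the
 * delayed-write buffers in the system.  A negative return value from the
 * callback aborts the scan and is passed back to the caller.
 *
 *	static int
 *	count_delwri(struct buf *bp, void *info)
 *	{
 *		return ((bp->b_flags & B_DELWRI) ? 1 : 0);
 *	}
 *
 *	ndelwri = scan_all_buffers(count_delwri, NULL);
 */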
3709
3710/*
3711 * Print out statistics from the current status of the buffer pool.
3712 * This can be toggled via the debug.syncprt sysctl.
3713 */
3714#ifdef DEBUG
3715void
3716vfs_bufstats(void)
3717{
3718 int i, j, count;
3719 struct buf *bp;
3720 struct bqueues *dp;
3721 int counts[(MAXBSIZE / PAGE_SIZE) + 1];
3722 static char *bname[3] = { "LOCKED", "LRU", "AGE" };
3723
3724 for (dp = bufqueues, i = 0; dp < &bufqueues[3]; dp++, i++) {
3725 count = 0;
3726 for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
3727 counts[j] = 0;
3728 crit_enter();
3729 TAILQ_FOREACH(bp, dp, b_freelist) {
3730 counts[bp->b_bufsize/PAGE_SIZE]++;
3731 count++;
3732 }
3733 crit_exit();
3734 kprintf("%s: total-%d", bname[i], count);
3735 for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
3736 if (counts[j] != 0)
3737 kprintf(", %d-%d", j * PAGE_SIZE, counts[j]);
3738 kprintf("\n");
3739 }
3740}
3741#endif
3742
3743#ifdef DDB
3744
3745DB_SHOW_COMMAND(buffer, db_show_buffer)
3746{
3747 /* get args */
3748 struct buf *bp = (struct buf *)addr;
3749
3750 if (!have_addr) {
3751 db_printf("usage: show buffer <addr>\n");
3752 return;
3753 }
3754
3755 db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
3756 db_printf("b_cmd = %d\n", bp->b_cmd);
3757 db_printf("b_error = %d, b_bufsize = %d, b_bcount = %d, "
3758 "b_resid = %d\n, b_data = %p, "
3759 "bio_offset(disk) = %lld, bio_offset(phys) = %lld\n",
3760 bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
3761 bp->b_data, bp->b_bio2.bio_offset, (bp->b_bio2.bio_next ? bp->b_bio2.bio_next->bio_offset : (off_t)-1));
3762 if (bp->b_xio.xio_npages) {
3763 int i;
3764 db_printf("b_xio.xio_npages = %d, pages(OBJ, IDX, PA): ",
3765 bp->b_xio.xio_npages);
3766 for (i = 0; i < bp->b_xio.xio_npages; i++) {
3767 vm_page_t m;
3768 m = bp->b_xio.xio_pages[i];
3769 db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
3770 (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
3771 if ((i + 1) < bp->b_xio.xio_npages)
3772 db_printf(",");
3773 }
3774 db_printf("\n");
3775 }
3776}
3777#endif /* DDB */