/*
 * Copyright (c) 1990 University of Utah.
 * Copyright (c) 1991 The Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1993, 1994 John S. Dyson
 * Copyright (c) 1995, David Greenman
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vnode_pager.c	7.5 (Berkeley) 4/20/91
 * $FreeBSD: src/sys/vm/vnode_pager.c,v 1.116.2.7 2002/12/31 09:34:51 dillon Exp $
 * $DragonFly: src/sys/vm/vnode_pager.c,v 1.43 2008/06/19 23:27:39 dillon Exp $
 */

/*
 * Page to/from files (vnodes).
 */

/*
 * TODO:
 *	Implement VOP_GETPAGES/PUTPAGES interface for filesystems.  Will
 *	greatly simplify the vnode_pager.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/buf.h>
#include <sys/vmmeter.h>
#include <sys/conf.h>

#include <cpu/lwbuf.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_map.h>
#include <vm/vnode_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <vm/vm_page2.h>

static void vnode_pager_dealloc (vm_object_t);
static int vnode_pager_getpage (vm_object_t, vm_page_t *, int);
static void vnode_pager_putpages (vm_object_t, vm_page_t *, int,
				  boolean_t, int *);
static boolean_t vnode_pager_haspage (vm_object_t, vm_pindex_t);

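/*
 * Pager operations vector for vnode-backed VM objects.  The entries
 * correspond, in order, to the dealloc, getpage, putpages and haspage
 * operations dispatched by the VM pager layer.
 */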
struct pagerops vnodepagerops = {
	vnode_pager_dealloc,
	vnode_pager_getpage,
	vnode_pager_putpages,
	vnode_pager_haspage
};

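/*
 * Rate limiters (one report per second) for the I/O error and residual
 * I/O warnings emitted by vnode_pager_generic_putpages().
 */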
static struct krate vbadrate = { 1 };
static struct krate vresrate = { 1 };

int vnode_pbuf_freecnt = -1;	/* start out unlimited */

/*
 * Allocate a VM object for a vnode, typically a regular file vnode.
 *
 * Some additional information is required to generate a properly sized
 * object which covers the entire buffer cache buffer straddling the file
 * EOF.  Userland does not see the extra pages as the VM fault code tests
 * against v_filesize.
 */
vm_object_t
vnode_pager_alloc(void *handle, off_t length, vm_prot_t prot, off_t offset,
		  int blksize, int boff)
{
	vm_object_t object;
	struct vnode *vp;
	off_t loffset;
	vm_pindex_t lsize;

	/*
	 * We cannot create a pager without a vnode handle.
	 */
	if (handle == NULL)
		return (NULL);

	/*
	 * XXX hack - This initialization should be put somewhere else.
	 */
	if (vnode_pbuf_freecnt < 0) {
		vnode_pbuf_freecnt = nswbuf / 2 + 1;
	}

	vp = (struct vnode *)handle;

	/*
	 * Prevent race condition when allocating the object.  This
	 * can happen with NFS vnodes since the nfsnode isn't locked.
	 */
	while (vp->v_flag & VOLOCK) {
		vsetflags(vp, VOWANT);
		tsleep(vp, 0, "vnpobj", 0);
	}
	vsetflags(vp, VOLOCK);

	/*
	 * If the object is being terminated, wait for it to
	 * go away.
	 */
	while (((object = vp->v_object) != NULL) &&
	       (object->flags & OBJ_DEAD)) {
		vm_object_dead_sleep(object, "vadead");
	}

	if (vp->v_sysref.refcnt <= 0)
		panic("vnode_pager_alloc: no vnode reference");

	/*
	 * Round the file size up to the end of the block straddling EOF
	 * so the object covers the entire buffer cache buffer backing
	 * the end of the file, then convert that byte offset into the
	 * object size in pages.
	 */
	if (boff < 0)
		boff = (int)(length % blksize);
	if (boff)
		loffset = length + (blksize - boff);
	else
		loffset = length;
	lsize = OFF_TO_IDX(round_page64(loffset));


	if (object == NULL) {
		/*
		 * Allocate an object of the appropriate size.
		 */
		object = vm_object_allocate(OBJT_VNODE, lsize);
		object->flags = 0;
		object->handle = handle;
		vp->v_object = object;
		vp->v_filesize = length;
		if (vp->v_mount && (vp->v_mount->mnt_kern_flag & MNTK_NOMSYNC))
			object->flags |= OBJ_NOMSYNC;
	} else {
		object->ref_count++;
		if (object->size != lsize) {
			kprintf("vnode_pager_alloc: Warning, objsize "
				"mismatch %jd/%jd vp=%p obj=%p\n",
				(intmax_t)object->size,
				(intmax_t)lsize,
				vp, object);
		}
		if (vp->v_filesize != length) {
			kprintf("vnode_pager_alloc: Warning, filesize "
				"mismatch %jd/%jd vp=%p obj=%p\n",
				(intmax_t)vp->v_filesize,
				(intmax_t)length,
				vp, object);
		}
	}
	vref(vp);

	vclrflags(vp, VOLOCK);
	if (vp->v_flag & VOWANT) {
		vclrflags(vp, VOWANT);
		wakeup(vp);
	}
	return (object);
}
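
/*
 * Usage sketch (an illustrative assumption, not a definitive call site):
 * a VFS that wants VM backing for a regular file would typically do
 * something like
 *
 *	obj = vnode_pager_alloc(vp, filesize, 0, 0,
 *				vp->v_mount->mnt_stat.f_iosize, -1);
 *
 * at vnode setup time.  Passing boff = -1 asks vnode_pager_alloc() to
 * derive the EOF-straddling offset itself from (filesize % blksize).
 */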

/*
 * Add a ref to a vnode's existing VM object, return the object or
 * NULL if the vnode did not have one.  This does not create the
 * object (we can't since we don't know what the proper blocksize/boff
 * is to match the VFS's use of the buffer cache).
 */
vm_object_t
vnode_pager_reference(struct vnode *vp)
{
	vm_object_t object;

	/*
	 * Prevent race condition when allocating the object.  This
	 * can happen with NFS vnodes since the nfsnode isn't locked.
	 */
	while (vp->v_flag & VOLOCK) {
		vsetflags(vp, VOWANT);
		tsleep(vp, 0, "vnpobj", 0);
	}
	vsetflags(vp, VOLOCK);

	/*
	 * Prevent race conditions against deallocation of the VM
	 * object.
	 */
	while (((object = vp->v_object) != NULL) &&
	       (object->flags & OBJ_DEAD)) {
		vm_object_dead_sleep(object, "vadead");
	}

	/*
	 * The object is expected to exist, the caller will handle
	 * NULL returns if it does not.
	 */
	if (object) {
		object->ref_count++;
		vref(vp);
	}

	vclrflags(vp, VOLOCK);
	if (vp->v_flag & VOWANT) {
		vclrflags(vp, VOWANT);
		wakeup(vp);
	}
	return (object);
}

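/*
 * Disassociate the VM object from its vnode, mark the object dead, and
 * release any swap space assigned to the object.  Pending paging I/O is
 * waited out first.  It is a panic to deallocate the pager twice.
 */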
static void
vnode_pager_dealloc(vm_object_t object)
{
	struct vnode *vp = object->handle;

	if (vp == NULL)
		panic("vnode_pager_dealloc: pager already dealloced");

	vm_object_pip_wait(object, "vnpdea");

	object->handle = NULL;
	object->type = OBJT_DEAD;
	vp->v_object = NULL;
	vp->v_filesize = NOOFFSET;
	vclrflags(vp, VTEXT | VOBJBUF);
	swap_pager_freespace_all(object);
}

/*
 * Return whether the vnode pager has the requested page at the given
 * page index.
 */
static boolean_t
vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex)
{
	struct vnode *vp = object->handle;
	off_t loffset;
	off_t doffset;
	int voff;
	int bsize;
	int error;

	/*
	 * If no vp or vp is doomed or marked transparent to VM, we do not
	 * have the page.
	 */
	if ((vp == NULL) || (vp->v_flag & VRECLAIMED))
		return FALSE;

	/*
	 * If the filesystem is no longer mounted, or if the offset is
	 * beyond the end of the file, we do not have the page.
	 */
	loffset = IDX_TO_OFF(pindex);

	if (vp->v_mount == NULL || loffset >= vp->v_filesize)
		return FALSE;

	bsize = vp->v_mount->mnt_stat.f_iosize;
	voff = loffset % bsize;

	/*
	 * XXX
	 *
	 * BMAP returns byte counts before and after, where after is
	 * inclusive of the base page.  We no longer compute the runs
	 * here, so the run pointers are passed as NULL.
	 *
	 * BMAP is allowed to return an *after of 0 for backwards
	 * compatibility.  The base page is still considered valid if
	 * no error is returned.
	 */
	error = VOP_BMAP(vp, loffset - voff, &doffset, NULL, NULL, 0);
	if (error)
		return TRUE;
	if (doffset == NOOFFSET)
		return FALSE;
	return TRUE;
}

/*
 * Lets the VM system know about a change in size for a file.
 * We adjust our own internal size and flush any cached pages in
 * the associated object that are affected by the size change.
 *
 * NOTE: This routine may be invoked as a result of a pager put
 * operation (possibly at object termination time), so we must be careful.
 *
 * NOTE: vp->v_filesize is initialized to NOOFFSET (-1), be sure that
 * we do not blow up on the case.  nsize will always be >= 0, however.
 */
void
vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize)
{
	vm_pindex_t nobjsize;
	vm_pindex_t oobjsize;
	vm_object_t object = vp->v_object;

	if (object == NULL)
		return;

	/*
	 * Hasn't changed size
	 */
	if (nsize == vp->v_filesize)
		return;

	lwkt_gettoken(&vm_token);

	/*
	 * Has changed size.  Adjust the VM object's size and v_filesize
	 * before we start scanning pages to prevent new pages from being
	 * allocated during the scan.
	 */
	nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);
	oobjsize = object->size;
	object->size = nobjsize;

	/*
	 * File has shrunk.  Toss any cached pages beyond the new EOF.
	 */
	if (nsize < vp->v_filesize) {
		vp->v_filesize = nsize;
		if (nobjsize < oobjsize) {
			vm_object_page_remove(object, nobjsize, oobjsize,
					      FALSE);
		}
		/*
		 * This gets rid of garbage at the end of a page that is now
		 * only partially backed by the vnode.  Since we are setting
		 * the entire page valid & clean after we are done we have
		 * to be sure that the portion of the page within the file
		 * bounds is already valid.  If it isn't then making it
		 * valid would create a corrupt block.
		 */
		if (nsize & PAGE_MASK) {
			vm_offset_t kva;
			vm_page_t m;

			do {
				m = vm_page_lookup(object, OFF_TO_IDX(nsize));
			} while (m && vm_page_sleep_busy(m, TRUE, "vsetsz"));

			if (m && m->valid) {
				int base = (int)nsize & PAGE_MASK;
				int size = PAGE_SIZE - base;
				struct lwbuf *lwb;

				/*
				 * Clear out partial-page garbage in case
				 * the page has been mapped.
				 *
				 * This is byte aligned.
				 */
				vm_page_busy(m);
				lwb = lwbuf_alloc(m);
				kva = lwbuf_kva(lwb);
				bzero((caddr_t)kva + base, size);
				lwbuf_free(lwb);

				/*
				 * XXX work around SMP data integrity race
				 * by unmapping the page from user processes.
				 * The garbage we just cleared may be mapped
				 * to a user process running on another cpu
				 * and this code is not running through normal
				 * I/O channels which handle SMP issues for
				 * us, so unmap page to synchronize all cpus.
				 *
				 * XXX should vm_pager_unmap_page() have
				 * dealt with this?
				 */
				vm_page_protect(m, VM_PROT_NONE);

				/*
				 * Clear out partial-page dirty bits.  This
				 * has the side effect of setting the valid
				 * bits, but that is ok.  There are a bunch
				 * of places in the VM system where we expect
				 * m->dirty == VM_PAGE_BITS_ALL.  The file EOF
				 * case is one of them.  If the page is still
				 * partially dirty, make it fully dirty.
				 *
				 * NOTE: We do not clear out the valid
				 * bits.  This would prevent bogus_page
				 * replacement from working properly.
				 *
				 * NOTE: We do not want to clear the dirty
				 * bit for a partial DEV_BSIZE'd truncation!
				 * This is DEV_BSIZE aligned!
				 */
				vm_page_clear_dirty_beg_nonincl(m, base, size);
				if (m->dirty != 0)
					m->dirty = VM_PAGE_BITS_ALL;
				vm_page_wakeup(m);
			}
		}
	} else {
		vp->v_filesize = nsize;
	}
	lwkt_reltoken(&vm_token);
}

/*
 * Release a page busied for a getpages operation.  The page may have become
 * wired (typically due to being used by the buffer cache) or otherwise been
 * soft-busied and cannot be freed in that case.  A held page can still be
 * freed.
 */
void
vnode_pager_freepage(vm_page_t m)
{
	if (m->busy || m->wire_count) {
		vm_page_activate(m);
		vm_page_wakeup(m);
	} else {
		vm_page_free(m);
	}
}

/*
 * EOPNOTSUPP is no longer legal.  For local media VFS's that do not
 * implement their own VOP_GETPAGES, their VOP_GETPAGES should call
 * vnode_pager_generic_getpages() to implement the previous behaviour.
 *
 * All other FS's should use the bypass to get to the local media
 * backing vp's VOP_GETPAGES.
 */
static int
vnode_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
{
	int rtval;
	struct vnode *vp;

	vp = object->handle;
	rtval = VOP_GETPAGES(vp, mpp, PAGE_SIZE, 0, 0, seqaccess);
	if (rtval == EOPNOTSUPP)
		panic("vnode_pager: vfs's must implement vop_getpages");
	return rtval;
}

/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_GETPAGES.
 *
 * With all the caching local media devices do these days there is really
 * very little point to attempting to restrict the I/O size to contiguous
 * blocks on-disk, especially if our caller thinks we need all the specified
 * pages.  Just construct and issue a READ.
 */
int
vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *mpp, int bytecount,
			     int reqpage, int seqaccess)
{
	struct iovec aiov;
	struct uio auio;
	off_t foff;
	int error;
	int count;
	int i;
	int ioflags;

	/*
	 * Do not do anything if the vnode is bad.
	 */
	if (vp->v_mount == NULL)
		return VM_PAGER_BAD;

	/*
	 * Calculate the number of pages.  Since we are paging in whole
	 * pages, adjust bytecount to be an integral multiple of the page
	 * size.  It will be clipped to the file EOF later on.
	 */
	bytecount = round_page(bytecount);
	count = bytecount / PAGE_SIZE;

	/*
	 * We could check m[reqpage]->valid here and shortcut the operation,
	 * but doing so breaks read-ahead.  Instead assume that the VM
	 * system has already performed the check, don't worry about
	 * any races, and issue the VOP_READ to allow read-ahead to function.
	 *
	 * This keeps the pipeline full for I/O bound sequentially scanned
	 * mmap()'s.
	 */
	/* don't shortcut */

	/*
	 * Discard pages past the file EOF.  If the requested page is past
	 * the file EOF we just leave its valid bits set to 0, the caller
	 * expects to maintain ownership of the requested page.  If the
	 * entire range is past file EOF discard everything and generate
	 * a pagein error.
	 */
	foff = IDX_TO_OFF(mpp[0]->pindex);
	if (foff >= vp->v_filesize) {
		for (i = 0; i < count; i++) {
			if (i != reqpage)
				vnode_pager_freepage(mpp[i]);
		}
		return VM_PAGER_ERROR;
	}

	if (foff + bytecount > vp->v_filesize) {
		bytecount = vp->v_filesize - foff;
		i = round_page(bytecount) / PAGE_SIZE;
		while (count > i) {
			--count;
			if (count != reqpage)
				vnode_pager_freepage(mpp[count]);
		}
	}

	/*
	 * The size of the transfer is bytecount.  bytecount will be an
	 * integral multiple of the page size unless it has been clipped
	 * to the file EOF.  The transfer cannot exceed the file EOF.
	 *
	 * When dealing with real devices we must round-up to the device
	 * sector size.
	 */
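	/*
	 * For example, with 2048-byte sectors (secmask = 2047) a transfer
	 * clipped to 5000 bytes rounds up to (5000 + 2047) & ~2047 = 6144.
	 */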
	if (vp->v_type == VBLK || vp->v_type == VCHR) {
		int secmask = vp->v_rdev->si_bsize_phys - 1;

		KASSERT(secmask < PAGE_SIZE,
			("vnode_pager_generic_getpages: sector size %d "
			 "too large", secmask + 1));
		bytecount = (bytecount + secmask) & ~secmask;
	}

	/*
	 * Severe hack to avoid deadlocks with the buffer cache: soft-busy
	 * all the pages and then drop their hard-busy state so the
	 * VOP_READ path, which runs through the buffer cache and may
	 * touch these same pages, cannot deadlock against us.
	 */
	for (i = 0; i < count; ++i) {
		vm_page_t mt = mpp[i];

		vm_page_io_start(mt);
		vm_page_wakeup(mt);
	}

	/*
	 * Issue the I/O with some read-ahead if bytecount > PAGE_SIZE
	 */
	ioflags = IO_VMIO;
	if (seqaccess)
		ioflags |= IO_SEQMAX << IO_SEQSHIFT;

	aiov.iov_base = NULL;
	aiov.iov_len = bytecount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = foff;
	auio.uio_segflg = UIO_NOCOPY;
	auio.uio_rw = UIO_READ;
	auio.uio_resid = bytecount;
	auio.uio_td = NULL;
	mycpu->gd_cnt.v_vnodein++;
	mycpu->gd_cnt.v_vnodepgsin += count;

	error = VOP_READ(vp, &auio, ioflags, proc0.p_ucred);

	/*
	 * Second half of the severe hack to avoid deadlocks with the
	 * buffer cache: reacquire the hard-busy state and finish the
	 * soft-busy we started before the VOP_READ.
	 */
	lwkt_gettoken(&vm_token);
	for (i = 0; i < count; ++i) {
		vm_page_t mt = mpp[i];

		while (vm_page_sleep_busy(mt, FALSE, "getpgs"))
			;
		vm_page_busy(mt);
		vm_page_io_finish(mt);
	}
	lwkt_reltoken(&vm_token);

	/*
	 * Calculate the actual number of bytes read and clean up the
	 * page list.
	 */
	bytecount -= auio.uio_resid;

	for (i = 0; i < count; ++i) {
		vm_page_t mt = mpp[i];

		if (i != reqpage) {
			if (error == 0 && mt->valid) {
				if (mt->flags & PG_WANTED)
					vm_page_activate(mt);
				else
					vm_page_deactivate(mt);
				vm_page_wakeup(mt);
			} else {
				vnode_pager_freepage(mt);
			}
		} else if (mt->valid == 0) {
			if (error == 0) {
				kprintf("page failed but no I/O error page "
					"%p object %p pindex %d\n",
					mt, mt->object, (int)mt->pindex);
				/* whoops, something happened */
				error = EINVAL;
			}
		} else if (mt->valid != VM_PAGE_BITS_ALL) {
			/*
			 * Zero-extend the requested page if necessary (if
			 * the filesystem is using a small block size).
			 */
			vm_page_zero_invalid(mt, TRUE);
		}
	}
	if (error) {
		kprintf("vnode_pager_generic_getpages: I/O read error\n");
	}
	return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
}

/*
 * EOPNOTSUPP is no longer legal.  For local media VFS's that do not
 * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call
 * vnode_pager_generic_putpages() to implement the previous behaviour.
 *
 * Caller has already cleared the pmap modified bits, if any.
 *
 * All other FS's should use the bypass to get to the local media
 * backing vp's VOP_PUTPAGES.
 */
static void
vnode_pager_putpages(vm_object_t object, vm_page_t *m, int count,
		     boolean_t sync, int *rtvals)
{
	int rtval;
	struct vnode *vp;
	int bytes = count * PAGE_SIZE;

	/*
	 * Force synchronous operation if we are extremely low on memory
	 * to prevent a low-memory deadlock.  VOP operations often need to
	 * allocate more memory to initiate the I/O (i.e. do a BMAP
	 * operation).  The swapper handles the case by limiting the amount
	 * of asynchronous I/O, but that sort of solution doesn't scale well
	 * for the vnode pager without a lot of work.
	 *
	 * Also, the backing vnode's iodone routine may not wake the pageout
	 * daemon up.  This should probably be addressed.  XXX
	 */

	if ((vmstats.v_free_count + vmstats.v_cache_count) <
	    vmstats.v_pageout_free_min)
		sync |= OBJPC_SYNC;

	/*
	 * Call device-specific putpages function
	 */
	vp = object->handle;
	rtval = VOP_PUTPAGES(vp, m, bytes, sync, rtvals, 0);
	if (rtval == EOPNOTSUPP) {
		kprintf("vnode_pager: *** WARNING *** stale FS putpages\n");
		rtval = vnode_pager_generic_putpages(vp, m, bytes, sync,
						     rtvals);
	}
}

/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_PUTPAGES.
 *
 * This is typically called indirectly via the pageout daemon and
 * clustering has typically already occurred, so in general we ask the
 * underlying filesystem to write the data out asynchronously rather
 * than delayed.
 */
int
vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *m, int bytecount,
			     int flags, int *rtvals)
{
	int i;
	vm_object_t object;
	int maxsize, ncount, count;
	vm_ooffset_t poffset;
	struct uio auio;
	struct iovec aiov;
	int error;
	int ioflags;

	object = vp->v_object;
	count = bytecount / PAGE_SIZE;

	for (i = 0; i < count; i++)
		rtvals[i] = VM_PAGER_AGAIN;

	if ((int)m[0]->pindex < 0) {
		kprintf("vnode_pager_putpages: "
			"attempt to write meta-data!!! -- 0x%lx(%x)\n",
			(long)m[0]->pindex, m[0]->dirty);
		rtvals[0] = VM_PAGER_BAD;
		return VM_PAGER_BAD;
	}

	maxsize = count * PAGE_SIZE;
	ncount = count;

	poffset = IDX_TO_OFF(m[0]->pindex);

	/*
	 * If the page-aligned write is larger than the actual file we
	 * have to invalidate pages occurring beyond the file EOF.
	 *
	 * If the file EOF resides in the middle of a page we still clear
	 * all of that page's dirty bits later on.  If we didn't it would
	 * endlessly re-write.
	 *
	 * We do not under any circumstances truncate the valid bits, as
	 * this will screw up bogus page replacement.
	 *
	 * The caller has already read-protected the pages.  The VFS must
	 * use the buffer cache to wrap the pages.  The pages might not
	 * be immediately flushed by the buffer cache but once under its
	 * control the pages themselves can wind up being marked clean
	 * and their covering buffer cache buffer can be marked dirty.
	 */
	if (poffset + maxsize > vp->v_filesize) {
		if (poffset < vp->v_filesize) {
			maxsize = vp->v_filesize - poffset;
			ncount = btoc(maxsize);
		} else {
			maxsize = 0;
			ncount = 0;
		}
		if (ncount < count) {
			for (i = ncount; i < count; i++) {
				rtvals[i] = VM_PAGER_BAD;
			}
		}
	}

	/*
	 * Pageouts are already clustered, use IO_ASYNC to force a bawrite()
	 * rather than a bdwrite() to prevent paging I/O from saturating
	 * the buffer cache.  Dummy-up the sequential heuristic to cause
	 * large ranges to cluster.  If neither IO_SYNC nor IO_ASYNC is set,
	 * the system decides how to cluster.
	 */
	ioflags = IO_VMIO;
	if (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL))
		ioflags |= IO_SYNC;
	else if ((flags & VM_PAGER_CLUSTER_OK) == 0)
		ioflags |= IO_ASYNC;
	ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL : 0;
	ioflags |= IO_SEQMAX << IO_SEQSHIFT;

	aiov.iov_base = NULL;
	aiov.iov_len = maxsize;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = poffset;
	auio.uio_segflg = UIO_NOCOPY;
	auio.uio_rw = UIO_WRITE;
	auio.uio_resid = maxsize;
	auio.uio_td = NULL;
	error = VOP_WRITE(vp, &auio, ioflags, proc0.p_ucred);
	mycpu->gd_cnt.v_vnodeout++;
	mycpu->gd_cnt.v_vnodepgsout += ncount;

	if (error) {
		krateprintf(&vbadrate,
			    "vnode_pager_putpages: I/O error %d\n", error);
	}
	if (auio.uio_resid) {
		krateprintf(&vresrate,
			    "vnode_pager_putpages: residual I/O %zd at %lu\n",
			    auio.uio_resid, (u_long)m[0]->pindex);
	}
	if (error == 0) {
		for (i = 0; i < ncount; i++) {
			rtvals[i] = VM_PAGER_OK;
			vm_page_undirty(m[i]);
		}
	}
	return rtvals[0];
}

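/*
 * Walk an object chain to the underlying OBJT_VNODE object, if any, and
 * return its vnode referenced and shared-locked.  Returns NULL if the
 * chain contains no vnode object or the object is being destroyed.  The
 * vget() is retried if the object's vnode is replaced while we sleep.
 */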
struct vnode *
vnode_pager_lock(vm_object_t object)
{
	struct thread *td = curthread;	/* XXX */
	int error;

	for (; object != NULL; object = object->backing_object) {
		if (object->type != OBJT_VNODE)
			continue;
		if (object->flags & OBJ_DEAD)
			return NULL;

		for (;;) {
			struct vnode *vp = object->handle;

			error = vget(vp, LK_SHARED | LK_RETRY | LK_CANRECURSE);
			if (error == 0) {
				if (object->handle != vp) {
					vput(vp);
					continue;
				}
				return (vp);
			}
			if ((object->flags & OBJ_DEAD) ||
			    (object->type != OBJT_VNODE)) {
				return NULL;
			}
			kprintf("vnode_pager_lock: vp %p error %d "
				"lockstatus %d, retrying\n",
				vp, error, lockstatus(&vp->v_lock, td));
			tsleep(object->handle, 0, "vnpgrl", hz);
		}
	}
	return NULL;
}