gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* (MPSAFE)
	3	*
	4	* Copyright (c) 1989, 1993
	5	* The Regents of the University of California. All rights reserved.
	6	* (c) UNIX System Laboratories, Inc.
	7	* All or some portions of this file are derived from material licensed
	8	* to the University of California by American Telephone and Telegraph
	9	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	10	* the permission of UNIX System Laboratories, Inc.
	11	*
	12	* Redistribution and use in source and binary forms, with or without
	13	* modification, are permitted provided that the following conditions
	14	* are met:
	15	* 1. Redistributions of source code must retain the above copyright
	16	* notice, this list of conditions and the following disclaimer.
	17	* 2. Redistributions in binary form must reproduce the above copyright
	18	* notice, this list of conditions and the following disclaimer in the
	19	* documentation and/or other materials provided with the distribution.
	20	* 3. All advertising materials mentioning features or use of this software
	21	* must display the following acknowledgement:
	22	* This product includes software developed by the University of
	23	* California, Berkeley and its contributors.
	24	* 4. Neither the name of the University nor the names of its contributors
	25	* may be used to endorse or promote products derived from this software
	26	* without specific prior written permission.
	27	*
	28	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	29	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	30	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	31	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	32	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	33	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	34	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	35	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	36	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	37	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	38	* SUCH DAMAGE.
	39	*
	40	* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
	41	* $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
	42	* $DragonFly: src/sys/kern/vfs_sync.c,v 1.18 2008/05/18 05:54:25 dillon Exp $
	43	*/
	44
	45	/*
	46	* External virtual filesystem routines
	47	*/
	48	#include "opt_ddb.h"
	49
	50	#include <sys/param.h>
	51	#include <sys/systm.h>
	52	#include <sys/buf.h>
	53	#include <sys/conf.h>
	54	#include <sys/dirent.h>
	55	#include <sys/domain.h>
	56	#include <sys/eventhandler.h>
	57	#include <sys/fcntl.h>
	58	#include <sys/kernel.h>
	59	#include <sys/kthread.h>
	60	#include <sys/malloc.h>
	61	#include <sys/mbuf.h>
	62	#include <sys/mount.h>
	63	#include <sys/proc.h>
	64	#include <sys/namei.h>
	65	#include <sys/reboot.h>
	66	#include <sys/socket.h>
	67	#include <sys/stat.h>
	68	#include <sys/sysctl.h>
	69	#include <sys/syslog.h>
	70	#include <sys/vmmeter.h>
	71	#include <sys/vnode.h>
	72
	73	#include <machine/limits.h>
	74
	75	#include <vm/vm.h>
	76	#include <vm/vm_object.h>
	77	#include <vm/vm_extern.h>
	78	#include <vm/vm_kern.h>
	79	#include <vm/pmap.h>
	80	#include <vm/vm_map.h>
	81	#include <vm/vm_page.h>
	82	#include <vm/vm_pager.h>
	83	#include <vm/vnode_pager.h>
	84
	85	#include <sys/buf2.h>
	86	#include <sys/thread2.h>
	87	#include <sys/mplock2.h>
	88
	89	/*
	90	* The workitem queue.
	91	*/
		/*
		 * Tunables and private state for the syncer worklist.
		 *
		 * NOTE(review): syncdelay, filedelay, dirdelay and metadelay are declared
		 * time_t but exported with SYSCTL_INT.  If time_t is wider than int on a
		 * target, the integer handler would presumably touch only part of the
		 * variable -- confirm against the sysctl macros and the time_t typedef.
		 */
	92	#define SYNCER_MAXDELAY 32 /* initial value of syncer_maxdelay */
	93	static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time; reset to syncer_mask + 1 by vfs_sync_init() */
	94	time_t syncdelay = 30; /* max time to delay syncing data */
	95	SYSCTL_INT(_kern, OID_AUTO, syncdelay, CTLFLAG_RW,
	96	&syncdelay, 0, "VFS data synchronization delay");
	97	time_t filedelay = 30; /* time to delay syncing files */
	98	SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW,
	99	&filedelay, 0, "File synchronization delay");
	100	time_t dirdelay = 29; /* time to delay syncing directories */
	101	SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW,
	102	&dirdelay, 0, "Directory synchronization delay");
	103	time_t metadelay = 28; /* time to delay syncing metadata */
	104	SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW,
	105	&metadelay, 0, "VFS metadata synchronization delay");
	106	static int rushjob; /* number of slots to run ASAP; raised by speedup_syncer(), lowered by syncer_thread() */
	107	static int stat_rush_requests; /* number of times I/O speeded up */
	108	SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW,
	109	&stat_rush_requests, 0, "");
	110
	111	static int syncer_delayno = 0; /* index of the next worklist slot syncer_thread() will process */
	112	static long syncer_mask; /* slot index mask, filled in by hashinit() in vfs_sync_init() */
	113	static struct lwkt_token syncer_token; /* held around every worklist / VONWORKLST update in this file */
	114	LIST_HEAD(synclist, vnode); /* head type for one worklist slot */
	115	static struct synclist *syncer_workitem_pending; /* array of worklist slots, indexed by (delayno + delay) & syncer_mask */
	116
	117	/*
	118	* Called from vfsinit()
	119	*/
	120	void
	121	vfs_sync_init(void)
	122	{
	123	syncer_workitem_pending = hashinit(syncer_maxdelay, M_DEVBUF,
	124	&syncer_mask);
	125	syncer_maxdelay = syncer_mask + 1;
	126	lwkt_token_init(&syncer_token, 1, "syncer");
	127	}
	128
	129	/*
	130	* The workitem queue.
	131	*
	132	* It is useful to delay writes of file data and filesystem metadata
	133	* for tens of seconds so that quickly created and deleted files need
	134	* not waste disk bandwidth being created and removed. To realize this,
	135	* we append vnodes to a "workitem" queue. When running with a soft
	136	* updates implementation, most pending metadata dependencies should
	137	* not wait for more than a few seconds. Thus, mounted-on block devices
	138	* are delayed only about half the time that file data is delayed.
	139	* Similarly, directory updates are more critical, so are only delayed
	140	* about a third the time that file data is delayed. Thus, there are
	141	* SYNCER_MAXDELAY queues that are processed round-robin at a rate of
	142	* one each second (driven off the filesystem syncer process). The
	143	* syncer_delayno variable indicates the next queue that is to be processed.
	144	* Items that need to be processed soon are placed in this queue:
	145	*
	146	* syncer_workitem_pending[syncer_delayno]
	147	*
	148	* A delay of fifteen seconds is done by placing the request fifteen
	149	* entries later in the queue:
	150	*
	151	* syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
	152	*
	153	*/
	154
	155	/*
	156	* Add an item to the syncer work queue.
	157	*
	158	* WARNING: Cannot get vp->v_token here if not already held, we must
	159	* depend on the syncer_token (which might already be held by
	160	* the caller) to protect v_synclist and VONWORKLST.
	161	*
	162	* MPSAFE
	163	*/
	164	void
	165	vn_syncer_add(struct vnode *vp, int delay)
	166	{
	167	int slot;
	168
	169	lwkt_gettoken(&syncer_token);
	170
	171	if (vp->v_flag & VONWORKLST)
	172	LIST_REMOVE(vp, v_synclist);
	173	if (delay > syncer_maxdelay - 2)
	174	delay = syncer_maxdelay - 2;
	175	slot = (syncer_delayno + delay) & syncer_mask;
	176
	177	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
	178	vsetflags(vp, VONWORKLST);
	179
	180	lwkt_reltoken(&syncer_token);
	181	}
	182
	183	/*
	184	* Removes the vnode from the syncer list. Since we might block while
	185	* acquiring the syncer_token we have to recheck conditions.
	186	*
	187	* vp->v_token held on call
	188	*/
	189	void
	190	vn_syncer_remove(struct vnode *vp)
	191	{
	192	lwkt_gettoken(&syncer_token);
	193
	194	if ((vp->v_flag & VONWORKLST) && RB_EMPTY(&vp->v_rbdirty_tree)) {
	195	vclrflags(vp, VONWORKLST);
	196	LIST_REMOVE(vp, v_synclist);
	197	}
	198
	199	lwkt_reltoken(&syncer_token);
	200	}
	201
	202	struct thread *updatethread; /* address is stored in up_kp; presumably receives the syncer thread pointer from kproc_start() -- confirm */
	203
	204	/*
	205	* System filesystem synchronizer daemon.
	206	*/
	207	static void
	208	syncer_thread(void)
	209	{
	210	struct thread *td = curthread;
	211	struct synclist *slp;
	212	struct vnode *vp;
	213	long starttime;
	214
	215	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td,
	216	SHUTDOWN_PRI_LAST);
	217	for (;;) {
	218	kproc_suspend_loop();
	219
	220	starttime = time_second;
	221	lwkt_gettoken(&syncer_token);
	222
	223	/*
	224	* Push files whose dirty time has expired. Be careful
	225	* of interrupt race on slp queue.
	226	*/
	227	slp = &syncer_workitem_pending[syncer_delayno];
	228	syncer_delayno += 1;
	229	if (syncer_delayno == syncer_maxdelay)
	230	syncer_delayno = 0;
	231
	232	while ((vp = LIST_FIRST(slp)) != NULL) {
	233	if (vget(vp, LK_EXCLUSIVE \| LK_NOWAIT) == 0) {
	234	VOP_FSYNC(vp, MNT_LAZY, 0);
	235	vput(vp);
	236	}
	237
	238	/*
	239	* vp is stale but can still be used if we can
	240	* verify that it remains at the head of the list.
	241	* Be careful not to try to get vp->v_token as
	242	* vp can become stale if this blocks.
	243	*
	244	* If the vp is still at the head of the list were
	245	* unable to completely flush it and move it to
	246	* a later slot to give other vnodes a fair shot.
	247	*
	248	* Note that v_tag VT_VFS vnodes can remain on the
	249	* worklist with no dirty blocks, but sync_fsync()
	250	* moves it to a later slot so we will never see it
	251	* here.
	252	*
	253	* It is possible to race a vnode with no dirty
	254	* buffers being removed from the list. If this
	255	* occurs we will move the vnode in the synclist
	256	* and then the other thread will remove it. Do
	257	* not try to remove it here.
	258	*/
	259	if (LIST_FIRST(slp) == vp)
	260	vn_syncer_add(vp, syncdelay);
	261	}
	262	lwkt_reltoken(&syncer_token);
	263
	264	/*
	265	* Do sync processing for each mount.
	266	*/
	267	bio_ops_sync(NULL);
	268
	269	/*
	270	* The variable rushjob allows the kernel to speed up the
	271	* processing of the filesystem syncer process. A rushjob
	272	* value of N tells the filesystem syncer to process the next
	273	* N seconds worth of work on its queue ASAP. Currently rushjob
	274	* is used by the soft update code to speed up the filesystem
	275	* syncer process when the incore state is getting so far
	276	* ahead of the disk that the kernel memory pool is being
	277	* threatened with exhaustion.
	278	*/
	279	if (rushjob > 0) {
	280	atomic_subtract_int(&rushjob, 1);
	281	continue;
	282	}
	283	/*
	284	* If it has taken us less than a second to process the
	285	* current work, then wait. Otherwise start right over
	286	* again. We can still lose time if any single round
	287	* takes more than two seconds, but it does not really
	288	* matter as we are just trying to generally pace the
	289	* filesystem activity.
	290	*/
	291	if (time_second == starttime)
	292	tsleep(&lbolt_syncer, 0, "syncer", 0);
	293	}
	294	}
	295
		/*
		 * Descriptor handed to kproc_start() by the SYSINIT below.
		 * NOTE(review): the initializer is positional; the field meanings noted
		 * here are presumed from the values -- confirm against struct kproc_desc.
		 */
	296	static struct kproc_desc up_kp = {
	297	"syncer", /* presumably the thread name */
	298	syncer_thread, /* presumably the thread entry point */
	299	&updatethread /* presumably where the created thread pointer is stored */
	300	};
	301	SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
	302
	303	/*
	304	* Request the syncer daemon to speed up its work.
	305	* We never push it to speed up more than half of its
	306	* normal turn time, otherwise it could take over the cpu.
	307	*/
	308	int
	309	speedup_syncer(void)
	310	{
	311	/*
	312	* Don't bother protecting the test. unsleep_and_wakeup_thread()
	313	* will only do something real if the thread is in the right state.
	314	*/
	315	wakeup(&lbolt_syncer);
	316	if (rushjob < syncdelay / 2) {
	317	atomic_add_int(&rushjob, 1);
	318	stat_rush_requests += 1;
	319	return (1);
	320	}
	321	return(0);
	322	}
	323
	324	/*
	325	* Routine to create and manage a filesystem syncer vnode.
	326	*/
		/* Forward declarations of the syncer vnode operations defined below. */
	327	static int sync_close(struct vop_close_args *);
	328	static int sync_fsync(struct vop_fsync_args *);
	329	static int sync_inactive(struct vop_inactive_args *);
	330	static int sync_reclaim (struct vop_reclaim_args *);
	331	static int sync_print(struct vop_print_args *);
	332
		/*
		 * Operations vector for syncer vnodes.  Only the five operations listed
		 * are implemented here; .vop_default routes everything else to
		 * vop_eopnotsupp.
		 */
	333	static struct vop_ops sync_vnode_vops = {
	334	.vop_default = vop_eopnotsupp,
	335	.vop_close = sync_close,
	336	.vop_fsync = sync_fsync,
	337	.vop_inactive = sync_inactive,
	338	.vop_reclaim = sync_reclaim,
	339	.vop_print = sync_print,
	340	};
	341
		/* Passed by address to getspecialvnode() in vfs_allocate_syncvnode(). */
	342	static struct vop_ops *sync_vnode_vops_p = &sync_vnode_vops;
	343
	344	VNODEOP_SET(sync_vnode_vops);
	345
	346	/*
	347	* Create a new filesystem syncer vnode for the specified mount point.
	348	* This vnode is placed on the worklist and is responsible for sync'ing
	349	* the filesystem.
	350	*
	351	* NOTE: read-only mounts are also placed on the worklist. The filesystem
	352	* sync code is also responsible for cleaning up vnodes.
	353	*/
	354	int
	355	vfs_allocate_syncvnode(struct mount *mp)
	356	{
	357	struct vnode *vp;
	358	static long start, incr, next;
	359	int error;
	360
	361	/* Allocate a new vnode */
	362	error = getspecialvnode(VT_VFS, mp, &sync_vnode_vops_p, &vp, 0, 0);
	363	if (error) {
	364	mp->mnt_syncer = NULL;
	365	return (error);
	366	}
	367	vp->v_type = VNON;
	368	/*
	369	* Place the vnode onto the syncer worklist. We attempt to
	370	* scatter them about on the list so that they will go off
	371	* at evenly distributed times even if all the filesystems
	372	* are mounted at once.
	373	*/
	374	next += incr;
	375	if (next == 0 \|\| next > syncer_maxdelay) {
	376	start /= 2;
	377	incr /= 2;
	378	if (start == 0) {
	379	start = syncer_maxdelay / 2;
	380	incr = syncer_maxdelay;
	381	}
	382	next = start;
	383	}
	384	vn_syncer_add(vp, syncdelay > 0 ? next % syncdelay : 0);
	385
	386	/*
	387	* The mnt_syncer field inherits the vnode reference, which is
	388	* held until later decomissioning.
	389	*/
	390	mp->mnt_syncer = vp;
	391	vx_unlock(vp);
	392	return (0);
	393	}
	394
	395	static int
	396	sync_close(struct vop_close_args *ap)
	397	{
	398	return (0);
	399	}
	400
	401	/*
	402	* Do a lazy sync of the filesystem.
	403	*
	404	* sync_fsync { struct vnode *a_vp, int a_waitfor }
	405	*/
	406	static int
	407	sync_fsync(struct vop_fsync_args *ap)
	408	{
	409	struct vnode *syncvp = ap->a_vp;
	410	struct mount *mp = syncvp->v_mount;
	411	int asyncflag;
	412
	413	/*
	414	* We only need to do something if this is a lazy evaluation.
	415	*/
	416	if ((ap->a_waitfor & MNT_LAZY) == 0)
	417	return (0);
	418
	419	/*
	420	* Move ourselves to the back of the sync list.
	421	*/
	422	vn_syncer_add(syncvp, syncdelay);
	423
	424	/*
	425	* Walk the list of vnodes pushing all that are dirty and
	426	* not already on the sync list, and freeing vnodes which have
	427	* no refs and whos VM objects are empty. vfs_msync() handles
	428	* the VM issues and must be called whether the mount is readonly
	429	* or not.
	430	*/
	431	if (vfs_busy(mp, LK_NOWAIT) != 0)
	432	return (0);
	433	if (mp->mnt_flag & MNT_RDONLY) {
	434	vfs_msync(mp, MNT_NOWAIT);
	435	} else {
	436	asyncflag = mp->mnt_flag & MNT_ASYNC;
	437	mp->mnt_flag &= ~MNT_ASYNC; /* ZZZ hack */
	438	vfs_msync(mp, MNT_NOWAIT);
	439	VFS_SYNC(mp, MNT_NOWAIT \| MNT_LAZY);
	440	if (asyncflag)
	441	mp->mnt_flag \|= MNT_ASYNC;
	442	}
	443	vfs_unbusy(mp);
	444	return (0);
	445	}
	446
	447	/*
	448	* The syncer vnode is no longer referenced.
	449	*
	450	* sync_inactive { struct vnode a_vp, struct proc a_p }
	451	*/
	452	static int
	453	sync_inactive(struct vop_inactive_args *ap)
	454	{
	455	vgone_vxlocked(ap->a_vp);
	456	return (0);
	457	}
	458
	459	/*
	460	* The syncer vnode is no longer needed and is being decommissioned.
	461	* This can only occur when the last reference has been released on
	462	* mp->mnt_syncer, so mp->mnt_syncer had better be NULL.
	463	*
	464	* Modifications to the worklist must be protected with a critical
	465	* section.
	466	*
	467	* sync_reclaim { struct vnode *a_vp }
	468	*/
	469	static int
	470	sync_reclaim(struct vop_reclaim_args *ap)
	471	{
	472	struct vnode *vp = ap->a_vp;
	473
	474	lwkt_gettoken(&syncer_token);
	475	KKASSERT(vp->v_mount->mnt_syncer != vp);
	476	if (vp->v_flag & VONWORKLST) {
	477	LIST_REMOVE(vp, v_synclist);
	478	vclrflags(vp, VONWORKLST);
	479	}
	480	lwkt_reltoken(&syncer_token);
	481
	482	return (0);
	483	}
	484
	485	/*
	486	* Print out a syncer vnode.
	487	*
	488	* sync_print { struct vnode *a_vp }
	489	*/
	490	static int
	491	sync_print(struct vop_print_args *ap)
	492	{
	493	struct vnode *vp = ap->a_vp;
	494
	495	kprintf("syncer vnode");
	496	lockmgr_printinfo(&vp->v_lock);
	497	kprintf("\n");
	498	return (0);
	499	}
	500