gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1989, 1993
	3	* The Regents of the University of California. All rights reserved.
	4	* (c) UNIX System Laboratories, Inc.
	5	* All or some portions of this file are derived from material licensed
	6	* to the University of California by American Telephone and Telegraph
	7	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	8	* the permission of UNIX System Laboratories, Inc.
	9	*
	10	* Redistribution and use in source and binary forms, with or without
	11	* modification, are permitted provided that the following conditions
	12	* are met:
	13	* 1. Redistributions of source code must retain the above copyright
	14	* notice, this list of conditions and the following disclaimer.
	15	* 2. Redistributions in binary form must reproduce the above copyright
	16	* notice, this list of conditions and the following disclaimer in the
	17	* documentation and/or other materials provided with the distribution.
	18	* 3. All advertising materials mentioning features or use of this software
	19	* must display the following acknowledgement:
	20	* This product includes software developed by the University of
	21	* California, Berkeley and its contributors.
	22	* 4. Neither the name of the University nor the names of its contributors
	23	* may be used to endorse or promote products derived from this software
	24	* without specific prior written permission.
	25	*
	26	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	27	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	28	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	29	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	30	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	31	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	32	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	33	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	34	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	35	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	36	* SUCH DAMAGE.
	37	*
	38	* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
	39	* $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
	40	* $DragonFly: src/sys/kern/vfs_sync.c,v 1.10 2006/05/06 02:43:12 dillon Exp $
	41	*/
	42
	43	/*
	44	* External virtual filesystem routines
	45	*/
	46	#include "opt_ddb.h"
	47
	48	#include <sys/param.h>
	49	#include <sys/systm.h>
	50	#include <sys/buf.h>
	51	#include <sys/conf.h>
	52	#include <sys/dirent.h>
	53	#include <sys/domain.h>
	54	#include <sys/eventhandler.h>
	55	#include <sys/fcntl.h>
	56	#include <sys/kernel.h>
	57	#include <sys/kthread.h>
	58	#include <sys/malloc.h>
	59	#include <sys/mbuf.h>
	60	#include <sys/mount.h>
	61	#include <sys/proc.h>
	62	#include <sys/namei.h>
	63	#include <sys/reboot.h>
	64	#include <sys/socket.h>
	65	#include <sys/stat.h>
	66	#include <sys/sysctl.h>
	67	#include <sys/syslog.h>
	68	#include <sys/vmmeter.h>
	69	#include <sys/vnode.h>
	70
	71	#include <machine/limits.h>
	72
	73	#include <vm/vm.h>
	74	#include <vm/vm_object.h>
	75	#include <vm/vm_extern.h>
	76	#include <vm/vm_kern.h>
	77	#include <vm/pmap.h>
	78	#include <vm/vm_map.h>
	79	#include <vm/vm_page.h>
	80	#include <vm/vm_pager.h>
	81	#include <vm/vnode_pager.h>
	82
	83	#include <sys/buf2.h>
	84	#include <sys/thread2.h>
	85
	86	/*
	87	* The workitem queue.
	88	*/
	89	#define SYNCER_MAXDELAY 32
	90	static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
	91	time_t syncdelay = 30; /* max time to delay syncing data */
	92	SYSCTL_INT(_kern, OID_AUTO, syncdelay, CTLFLAG_RW,
	93	&syncdelay, 0, "VFS data synchronization delay");
	94	time_t filedelay = 30; /* time to delay syncing files */
	95	SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW,
	96	&filedelay, 0, "File synchronization delay");
	97	time_t dirdelay = 29; /* time to delay syncing directories */
	98	SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW,
	99	&dirdelay, 0, "Directory synchronization delay");
	100	time_t metadelay = 28; /* time to delay syncing metadata */
	101	SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW,
	102	&metadelay, 0, "VFS metadata synchronization delay");
	103	static int rushjob; /* number of slots to run ASAP */
	104	static int stat_rush_requests; /* number of times I/O speeded up */
	105	SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW,
	106	&stat_rush_requests, 0, "");
	107
	108	static int syncer_delayno = 0;
	109	static long syncer_mask;
	110	LIST_HEAD(synclist, vnode);
	111	static struct synclist *syncer_workitem_pending;
	112
	113	/*
	114	* Called from vfsinit()
	115	*/
	116	void
	117	vfs_sync_init(void)
	118	{
	119	syncer_workitem_pending = hashinit(syncer_maxdelay, M_DEVBUF,
	120	&syncer_mask);
	121	syncer_maxdelay = syncer_mask + 1;
	122	}
	123
	124	/*
	125	* The workitem queue.
	126	*
	127	* It is useful to delay writes of file data and filesystem metadata
	128	* for tens of seconds so that quickly created and deleted files need
	129	* not waste disk bandwidth being created and removed. To realize this,
	130	* we append vnodes to a "workitem" queue. When running with a soft
	131	* updates implementation, most pending metadata dependencies should
	132	* not wait for more than a few seconds. Thus, metadata updates for
	133	* filesystems mounted on block devices are delayed only about half the time that file data is delayed.
	134	* Similarly, directory updates are more critical, so are only delayed
	135	* about a third the time that file data is delayed. Thus, there are
	136	* SYNCER_MAXDELAY queues that are processed round-robin at a rate of
	137	* one each second (driven off the filesystem syncer process). The
	138	* syncer_delayno variable indicates the next queue that is to be processed.
	139	* Items that need to be processed soon are placed in this queue:
	140	*
	141	* syncer_workitem_pending[syncer_delayno]
	142	*
	143	* A delay of fifteen seconds is done by placing the request fifteen
	144	* entries later in the queue:
	145	*
	146	* syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
	147	*
	148	*/
	149
	150	/*
	151	* Add an item to the syncer work queue.
	152	*/
	153	void
	154	vn_syncer_add_to_worklist(struct vnode *vp, int delay)
	155	{
	156	int slot;
	157
	158	crit_enter();
	159
	160	if (vp->v_flag & VONWORKLST) {
	161	LIST_REMOVE(vp, v_synclist);
	162	}
	163
	164	if (delay > syncer_maxdelay - 2)
	165	delay = syncer_maxdelay - 2;
	166	slot = (syncer_delayno + delay) & syncer_mask;
	167
	168	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
	169	vp->v_flag \|= VONWORKLST;
	170	crit_exit();
	171	}
	172
	173	struct thread *updatethread;
	174	static void sched_sync (void);
	175	static struct kproc_desc up_kp = {
	176	"syncer",
	177	sched_sync,
	178	&updatethread
	179	};
	180	SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
	181
	182	/*
	183	* System filesystem synchronizer daemon.
	184	*/
	185	void
	186	sched_sync(void)
	187	{
	188	struct synclist *slp;
	189	struct vnode *vp;
	190	long starttime;
	191	struct thread *td = curthread;
	192
	193	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td,
	194	SHUTDOWN_PRI_LAST);
	195
	196	for (;;) {
	197	kproc_suspend_loop();
	198
	199	starttime = time_second;
	200
	201	/*
	202	* Push files whose dirty time has expired. Be careful
	203	* of interrupt race on slp queue.
	204	*/
	205	crit_enter();
	206	slp = &syncer_workitem_pending[syncer_delayno];
	207	syncer_delayno += 1;
	208	if (syncer_delayno == syncer_maxdelay)
	209	syncer_delayno = 0;
	210	crit_exit();
	211
	212	while ((vp = LIST_FIRST(slp)) != NULL) {
	213	if (vget(vp, LK_EXCLUSIVE \| LK_NOWAIT) == 0) {
	214	VOP_FSYNC(vp, MNT_LAZY);
	215	vput(vp);
	216	}
	217	crit_enter();
	218
	219	/*
	220	* If the vnode is still at the head of the list
	221	* we were not able to completely flush it. To
	222	* give other vnodes a fair shake we move it to
	223	* a later slot.
	224	*
	225	* Note that v_tag VT_VFS vnodes can remain on the
	226	* worklist with no dirty blocks, but sync_fsync()
	227	* moves it to a later slot so we will never see it
	228	* here.
	229	*/
	230	if (LIST_FIRST(slp) == vp) {
	231	if (RB_EMPTY(&vp->v_rbdirty_tree) &&
	232	!vn_isdisk(vp, NULL)) {
	233	panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
	234	}
	235	vn_syncer_add_to_worklist(vp, syncdelay);
	236	}
	237	crit_exit();
	238	}
	239
	240	/*
	241	* Do soft update processing.
	242	*/
	243	if (bioops.io_sync)
	244	(*bioops.io_sync)(NULL);
	245
	246	/*
	247	* The variable rushjob allows the kernel to speed up the
	248	* processing of the filesystem syncer process. A rushjob
	249	* value of N tells the filesystem syncer to process the next
	250	* N seconds worth of work on its queue ASAP. Currently rushjob
	251	* is used by the soft update code to speed up the filesystem
	252	* syncer process when the incore state is getting so far
	253	* ahead of the disk that the kernel memory pool is being
	254	* threatened with exhaustion.
	255	*/
	256	if (rushjob > 0) {
	257	rushjob -= 1;
	258	continue;
	259	}
	260	/*
	261	* If it has taken us less than a second to process the
	262	* current work, then wait. Otherwise start right over
	263	* again. We can still lose time if any single round
	264	* takes more than two seconds, but it does not really
	265	* matter as we are just trying to generally pace the
	266	* filesystem activity.
	267	*/
	268	if (time_second == starttime)
	269	tsleep(&lbolt_syncer, 0, "syncer", 0);
	270	}
	271	}
	272
	273	/*
	274	* Request the syncer daemon to speed up its work.
	275	* We never push it to speed up more than half of its
	276	* normal turn time, otherwise it could take over the cpu.
	277	*
	278	* YYY wchan field protected by the BGL.
	279	*/
	280	int
	281	speedup_syncer(void)
	282	{
	283	/*
	284	* Don't bother protecting the test. unsleep_and_wakeup_thread()
	285	* will only do something real if the thread is in the right state.
	286	*/
	287	wakeup(&lbolt_syncer);
	288	if (rushjob < syncdelay / 2) {
	289	rushjob += 1;
	290	stat_rush_requests += 1;
	291	return (1);
	292	}
	293	return(0);
	294	}
	295
	296	/*
	297	* Routine to create and manage a filesystem syncer vnode.
	298	*/
	299	#define sync_close ((int () (struct vop_close_args ))nullop)
	300	static int sync_fsync (struct vop_fsync_args *);
	301	static int sync_inactive (struct vop_inactive_args *);
	302	static int sync_reclaim (struct vop_reclaim_args *);
	303	#define sync_lock ((int () (struct vop_lock_args ))vop_stdlock)
	304	#define sync_unlock ((int () (struct vop_unlock_args ))vop_stdunlock)
	305	static int sync_print (struct vop_print_args *);
	306	#define sync_islocked ((int() (struct vop_islocked_args ))vop_stdislocked)
	307
	308	static struct vop_ops *sync_vnode_vops;
	309	static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
	310	{ &vop_default_desc, vop_eopnotsupp },
	311	{ &vop_close_desc, (void ) sync_close }, / close */
	312	{ &vop_fsync_desc, (void ) sync_fsync }, / fsync */
	313	{ &vop_inactive_desc, (void ) sync_inactive }, / inactive */
	314	{ &vop_reclaim_desc, (void ) sync_reclaim }, / reclaim */
	315	{ &vop_lock_desc, (void ) sync_lock }, / lock */
	316	{ &vop_unlock_desc, (void ) sync_unlock }, / unlock */
	317	{ &vop_print_desc, (void ) sync_print }, / print */
	318	{ &vop_islocked_desc, (void ) sync_islocked }, / islocked */
	319	{ NULL, NULL }
	320	};
	321
	322	static struct vnodeopv_desc sync_vnodeop_opv_desc =
	323	{ &sync_vnode_vops, sync_vnodeop_entries, 0 };
	324
	325	VNODEOP_SET(sync_vnodeop_opv_desc);
	326
	327	/*
	328	* Create a new filesystem syncer vnode for the specified mount point.
	329	* This vnode is placed on the worklist and is responsible for sync'ing
	330	* the filesystem.
	331	*
	332	* NOTE: read-only mounts are also placed on the worklist. The filesystem
	333	* sync code is also responsible for cleaning up vnodes.
	334	*/
	335	int
	336	vfs_allocate_syncvnode(struct mount *mp)
	337	{
	338	struct vnode *vp;
	339	static long start, incr, next;
	340	int error;
	341
	342	/* Allocate a new vnode */
	343	error = getspecialvnode(VT_VFS, mp, &sync_vnode_vops, &vp, 0, 0);
	344	if (error) {
	345	mp->mnt_syncer = NULL;
	346	return (error);
	347	}
	348	vp->v_type = VNON;
	349	/*
	350	* Place the vnode onto the syncer worklist. We attempt to
	351	* scatter them about on the list so that they will go off
	352	* at evenly distributed times even if all the filesystems
	353	* are mounted at once.
	354	*/
	355	next += incr;
	356	if (next == 0 \|\| next > syncer_maxdelay) {
	357	start /= 2;
	358	incr /= 2;
	359	if (start == 0) {
	360	start = syncer_maxdelay / 2;
	361	incr = syncer_maxdelay;
	362	}
	363	next = start;
	364	}
	365	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
	366	mp->mnt_syncer = vp;
	367	vx_unlock(vp);
	368	return (0);
	369	}
	370
	371	/*
	372	* Do a lazy sync of the filesystem.
	373	*
	374	* sync_fsync { struct vnode a_vp, struct ucred a_cred, int a_waitfor,
	375	* struct thread *a_td }
	376	*/
	377	static int
	378	sync_fsync(struct vop_fsync_args *ap)
	379	{
	380	struct vnode *syncvp = ap->a_vp;
	381	struct mount *mp = syncvp->v_mount;
	382	int asyncflag;
	383
	384	/*
	385	* We only need to do something if this is a lazy evaluation.
	386	*/
	387	if (ap->a_waitfor != MNT_LAZY)
	388	return (0);
	389
	390	/*
	391	* Move ourselves to the back of the sync list.
	392	*/
	393	vn_syncer_add_to_worklist(syncvp, syncdelay);
	394
	395	/*
	396	* Walk the list of vnodes pushing all that are dirty and
	397	* not already on the sync list, and freeing vnodes which have
	398	* no refs and whos VM objects are empty. vfs_msync() handles
	399	* the VM issues and must be called whether the mount is readonly
	400	* or not.
	401	*/
	402	if (vfs_busy(mp, LK_NOWAIT) != 0)
	403	return (0);
	404	if (mp->mnt_flag & MNT_RDONLY) {
	405	vfs_msync(mp, MNT_NOWAIT);
	406	} else {
	407	asyncflag = mp->mnt_flag & MNT_ASYNC;
	408	mp->mnt_flag &= ~MNT_ASYNC; /* ZZZ hack */
	409	vfs_msync(mp, MNT_NOWAIT);
	410	VFS_SYNC(mp, MNT_LAZY);
	411	if (asyncflag)
	412	mp->mnt_flag \|= MNT_ASYNC;
	413	}
	414	vfs_unbusy(mp);
	415	return (0);
	416	}
	417
	418	/*
	419	* The syncer vnode is no referenced.
	420	*
	421	* sync_inactive { struct vnode a_vp, struct proc a_p }
	422	*/
	423	static int
	424	sync_inactive(struct vop_inactive_args *ap)
	425	{
	426	vgone(ap->a_vp);
	427	return (0);
	428	}
	429
	430	/*
	431	* The syncer vnode is no longer needed and is being decommissioned.
	432	*
	433	* Modifications to the worklist must be protected with a critical
	434	* section.
	435	*
	436	* sync_reclaim { struct vnode *a_vp }
	437	*/
	438	static int
	439	sync_reclaim(struct vop_reclaim_args *ap)
	440	{
	441	struct vnode *vp = ap->a_vp;
	442
	443	crit_enter();
	444	vp->v_mount->mnt_syncer = NULL;
	445	if (vp->v_flag & VONWORKLST) {
	446	LIST_REMOVE(vp, v_synclist);
	447	vp->v_flag &= ~VONWORKLST;
	448	}
	449	crit_exit();
	450
	451	return (0);
	452	}
	453
	454	/*
	455	* Print out a syncer vnode.
	456	*
	457	* sync_print { struct vnode *a_vp }
	458	*/
	459	static int
	460	sync_print(struct vop_print_args *ap)
	461	{
	462	struct vnode *vp = ap->a_vp;
	463
	464	printf("syncer vnode");
	465	lockmgr_printinfo(&vp->v_lock);
	466	printf("\n");
	467	return (0);
	468	}
	469