sys/kern/vfs_sync.c

   1 /*
   2  * Copyright (c) 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. All advertising materials mentioning features or use of this software
  19  *    must display the following acknowledgement:
  20  *      This product includes software developed by the University of
  21  *      California, Berkeley and its contributors.
  22  * 4. Neither the name of the University nor the names of its contributors
  23  *    may be used to endorse or promote products derived from this software
  24  *    without specific prior written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  *
  38  *      @(#)vfs_subr.c  8.31 (Berkeley) 5/26/95
  39  * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
  40  */
  41
  42 /*
  43  * External virtual filesystem routines
  44  */
  45 #include "opt_ddb.h"
  46
  47 #include <sys/param.h>
  48 #include <sys/systm.h>
  49 #include <sys/buf.h>
  50 #include <sys/conf.h>
  51 #include <sys/dirent.h>
  52 #include <sys/domain.h>
  53 #include <sys/eventhandler.h>
  54 #include <sys/fcntl.h>
  55 #include <sys/kernel.h>
  56 #include <sys/kthread.h>
  57 #include <sys/malloc.h>
  58 #include <sys/mbuf.h>
  59 #include <sys/mount.h>
  60 #include <sys/proc.h>
  61 #include <sys/namei.h>
  62 #include <sys/reboot.h>
  63 #include <sys/socket.h>
  64 #include <sys/stat.h>
  65 #include <sys/sysctl.h>
  66 #include <sys/syslog.h>
  67 #include <sys/vmmeter.h>
  68 #include <sys/vnode.h>
  69
  70 #include <machine/limits.h>
  71
  72 #include <vm/vm.h>
  73 #include <vm/vm_object.h>
  74 #include <vm/vm_extern.h>
  75 #include <vm/vm_kern.h>
  76 #include <vm/pmap.h>
  77 #include <vm/vm_map.h>
  78 #include <vm/vm_page.h>
  79 #include <vm/vm_pager.h>
  80 #include <vm/vnode_pager.h>
  81
  82 #include <sys/buf2.h>
  83 #include <sys/thread2.h>
  84
  85 /*
  86  * The workitem queue.
  87  */
  88 #define SYNCER_MAXDELAY         32
  89 static int syncer_maxdelay = SYNCER_MAXDELAY;   /* maximum delay time */
  90 time_t syncdelay = 30;          /* max time to delay syncing data */
  91 SYSCTL_INT(_kern, OID_AUTO, syncdelay, CTLFLAG_RW,
  92                 &syncdelay, 0, "VFS data synchronization delay");
  93 time_t filedelay = 30;          /* time to delay syncing files */
  94 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW,
  95                 &filedelay, 0, "File synchronization delay");
  96 time_t dirdelay = 29;           /* time to delay syncing directories */
  97 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW,
  98                 &dirdelay, 0, "Directory synchronization delay");
  99 time_t metadelay = 28;          /* time to delay syncing metadata */
 100 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW,
 101                 &metadelay, 0, "VFS metadata synchronization delay");
 102 static int rushjob;                     /* number of slots to run ASAP */
 103 static int stat_rush_requests;  /* number of times I/O speeded up */
 104 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW,
 105                 &stat_rush_requests, 0, "");
 106
 107 LIST_HEAD(synclist, vnode);
 108
 109 #define SC_FLAG_EXIT            (0x1)           /* request syncer exit */
 110 #define SC_FLAG_DONE            (0x2)           /* syncer confirm exit */
 111 #define         SC_FLAG_BIOOPS_ALL      (0x4)           /* do bufops_sync(NULL) */
 112
 113 struct syncer_ctx {
 114         struct mount            *sc_mp;
 115         struct lwkt_token       sc_token;
 116         struct thread           *sc_thread;
 117         int                     sc_flags;
 118
 119         struct synclist         *syncer_workitem_pending;
 120         long                    syncer_mask;
 121         int                     syncer_delayno;
 122 };
 123
 124 static struct syncer_ctx syncer_ctx0;
 125
 126 static void syncer_thread(void *);
 127
 128 static void
 129 syncer_ctx_init(struct syncer_ctx *ctx, struct mount *mp)
 130 {
 131         ctx->sc_mp = mp;
 132         lwkt_token_init(&ctx->sc_token, "syncer");
 133         ctx->sc_flags = 0;
 134
 135         ctx->syncer_workitem_pending = hashinit(syncer_maxdelay, M_DEVBUF,
 136                                                 &ctx->syncer_mask);
 137         ctx->syncer_delayno = 0;
 138 }
 139
 140 /*
 141  * Called from vfsinit()
 142  */
 143 void
 144 vfs_sync_init(void)
 145 {
 146         syncer_ctx_init(&syncer_ctx0, NULL);
 147         syncer_maxdelay = syncer_ctx0.syncer_mask + 1;
 148         syncer_ctx0.sc_flags |= SC_FLAG_BIOOPS_ALL;
 149
 150         /* Support schedcpu wakeup of syncer0 */
 151         lbolt_syncer = &syncer_ctx0;
 152 }
 153
 154 static struct syncer_ctx *
 155 vn_get_syncer(struct vnode *vp) {
 156         struct mount *mp;
 157         struct syncer_ctx *ctx;
 158
 159         ctx = NULL;
 160         mp = vp->v_mount;
 161         if (mp)
 162                 ctx = mp->mnt_syncer_ctx;
 163         if (ctx == NULL)
 164                 ctx = &syncer_ctx0;
 165
 166         return (ctx);
 167 }
 168
 169 /*
 170  * The workitem queue.
 171  *
 172  * It is useful to delay writes of file data and filesystem metadata
 173  * for tens of seconds so that quickly created and deleted files need
 174  * not waste disk bandwidth being created and removed. To realize this,
 175  * we append vnodes to a "workitem" queue. When running with a soft
 176  * updates implementation, most pending metadata dependencies should
  177  * not wait for more than a few seconds. Thus, metadata (the vnodes of
  178  * mounted block devices) is delayed only about half as long as file data.
 179  * Similarly, directory updates are more critical, so are only delayed
 180  * about a third the time that file data is delayed. Thus, there are
 181  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 182  * one each second (driven off the filesystem syncer process). The
 183  * syncer_delayno variable indicates the next queue that is to be processed.
 184  * Items that need to be processed soon are placed in this queue:
 185  *
 186  *      syncer_workitem_pending[syncer_delayno]
 187  *
 188  * A delay of fifteen seconds is done by placing the request fifteen
 189  * entries later in the queue:
 190  *
 191  *      syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 192  *
 193  */
 194
 195 /*
 196  * Add an item to the syncer work queue.
 197  *
 198  * WARNING: Cannot get vp->v_token here if not already held, we must
 199  *          depend on the syncer_token (which might already be held by
 200  *          the caller) to protect v_synclist and VONWORKLST.
 201  *
 202  * MPSAFE
 203  */
 204 void
 205 vn_syncer_add(struct vnode *vp, int delay)
 206 {
 207         struct syncer_ctx *ctx;
 208         int slot;
 209
 210         ctx = vn_get_syncer(vp);
 211
 212         lwkt_gettoken(&ctx->sc_token);
 213
 214         if (vp->v_flag & VONWORKLST)
 215                 LIST_REMOVE(vp, v_synclist);
 216         if (delay > syncer_maxdelay - 2)
 217                 delay = syncer_maxdelay - 2;
 218         slot = (ctx->syncer_delayno + delay) & ctx->syncer_mask;
 219
 220         LIST_INSERT_HEAD(&ctx->syncer_workitem_pending[slot], vp, v_synclist);
 221         vsetflags(vp, VONWORKLST);
 222
 223         lwkt_reltoken(&ctx->sc_token);
 224 }
 225
 226 /*
 227  * Removes the vnode from the syncer list.  Since we might block while
 228  * acquiring the syncer_token we have to recheck conditions.
 229  *
 230  * vp->v_token held on call
 231  */
 232 void
 233 vn_syncer_remove(struct vnode *vp)
 234 {
 235         struct syncer_ctx *ctx;
 236
 237         ctx = vn_get_syncer(vp);
 238
 239         lwkt_gettoken(&ctx->sc_token);
 240
 241         if ((vp->v_flag & VONWORKLST) && RB_EMPTY(&vp->v_rbdirty_tree)) {
 242                 vclrflags(vp, VONWORKLST);
 243                 LIST_REMOVE(vp, v_synclist);
 244         }
 245
 246         lwkt_reltoken(&ctx->sc_token);
 247 }
 248
 249 /*
 250  * Create per-filesystem syncer process
 251  */
 252 void
 253 vn_syncer_thr_create(struct mount *mp)
 254 {
 255         struct syncer_ctx *ctx;
 256         static int syncalloc = 0;
 257         int rc;
 258
 259         ctx = kmalloc(sizeof(struct syncer_ctx), M_TEMP, M_WAITOK);
 260
 261         syncer_ctx_init(ctx, mp);
 262         mp->mnt_syncer_ctx = ctx;
 263
 264         rc = kthread_create(syncer_thread, ctx, &ctx->sc_thread,
 265                             "syncer%d", ++syncalloc);
 266 }
 267
 268 void *
 269 vn_syncer_thr_getctx(struct mount *mp)
 270 {
 271         return (mp->mnt_syncer_ctx);
 272 }
 273
 274 /*
 275  * Stop per-filesystem syncer process
 276  */
 277 void
 278 vn_syncer_thr_stop(void *ctxp)
 279 {
 280         struct syncer_ctx *ctx;
 281
 282         ctx = ctxp;
 283
 284         lwkt_gettoken(&ctx->sc_token);
 285
 286         /* Signal the syncer process to exit */
 287         ctx->sc_flags |= SC_FLAG_EXIT;
 288         wakeup(ctx);
 289
 290         /* Wait till syncer process exits */
 291         while ((ctx->sc_flags & SC_FLAG_DONE) == 0)
 292                 tsleep(&ctx->sc_flags, 0, "syncexit", hz);
 293
 294         lwkt_reltoken(&ctx->sc_token);
 295
 296         kfree(ctx->syncer_workitem_pending, M_DEVBUF);
 297         kfree(ctx, M_TEMP);
 298 }
 299
 300 struct  thread *updatethread;
 301
 302 /*
 303  * System filesystem synchronizer daemon.
 304  */
 305 static void
 306 syncer_thread(void *_ctx)
 307 {
 308         struct thread *td = curthread;
 309         struct syncer_ctx *ctx = _ctx;
 310         struct synclist *slp;
 311         struct vnode *vp;
 312         long starttime;
 313         int *sc_flagsp;
 314         int sc_flags;
 315         int vnodes_synced = 0;
 316
 317         /*
 318          * syncer0 runs till system shutdown; per-filesystem syncers are
 319          * terminated on filesystem unmount
 320          */
 321         if (ctx == &syncer_ctx0)
 322                 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td,
 323                                       SHUTDOWN_PRI_LAST);
 324         for (;;) {
 325                 kproc_suspend_loop();
 326
 327                 starttime = time_second;
 328                 lwkt_gettoken(&ctx->sc_token);
 329
 330                 /*
 331                  * Push files whose dirty time has expired.  Be careful
 332                  * of interrupt race on slp queue.
 333                  */
 334                 slp = &ctx->syncer_workitem_pending[ctx->syncer_delayno];
 335                 ctx->syncer_delayno += 1;
 336                 if (ctx->syncer_delayno == syncer_maxdelay)
 337                         ctx->syncer_delayno = 0;
 338
 339                 while ((vp = LIST_FIRST(slp)) != NULL) {
 340                         if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
 341                                 VOP_FSYNC(vp, MNT_LAZY, 0);
 342                                 vput(vp);
 343                                 vnodes_synced++;
 344                         }
 345
 346                         /*
 347                          * vp is stale but can still be used if we can
 348                          * verify that it remains at the head of the list.
 349                          * Be careful not to try to get vp->v_token as
 350                          * vp can become stale if this blocks.
 351                          *
 352                          * If the vp is still at the head of the list were
 353                          * unable to completely flush it and move it to
 354                          * a later slot to give other vnodes a fair shot.
 355                          *
 356                          * Note that v_tag VT_VFS vnodes can remain on the
 357                          * worklist with no dirty blocks, but sync_fsync()
 358                          * moves it to a later slot so we will never see it
 359                          * here.
 360                          *
 361                          * It is possible to race a vnode with no dirty
 362                          * buffers being removed from the list.  If this
 363                          * occurs we will move the vnode in the synclist
 364                          * and then the other thread will remove it.  Do
 365                          * not try to remove it here.
 366                          */
 367                         if (LIST_FIRST(slp) == vp)
 368                                 vn_syncer_add(vp, syncdelay);
 369                 }
 370
 371                 sc_flags = ctx->sc_flags;
 372
 373                 /* Exit on unmount */
 374                 if (sc_flags & SC_FLAG_EXIT)
 375                         break;
 376
 377                 lwkt_reltoken(&ctx->sc_token);
 378
 379                 /*
 380                  * Do sync processing for each mount.
 381                  */
 382                 if (ctx->sc_mp || sc_flags & SC_FLAG_BIOOPS_ALL)
 383                         bio_ops_sync(ctx->sc_mp);
 384
 385                 /*
 386                  * The variable rushjob allows the kernel to speed up the
 387                  * processing of the filesystem syncer process. A rushjob
 388                  * value of N tells the filesystem syncer to process the next
 389                  * N seconds worth of work on its queue ASAP. Currently rushjob
 390                  * is used by the soft update code to speed up the filesystem
 391                  * syncer process when the incore state is getting so far
 392                  * ahead of the disk that the kernel memory pool is being
 393                  * threatened with exhaustion.
 394                  */
 395                 if (ctx == &syncer_ctx0 && rushjob > 0) {
 396                         atomic_subtract_int(&rushjob, 1);
 397                         continue;
 398                 }
 399                 /*
 400                  * If it has taken us less than a second to process the
 401                  * current work, then wait. Otherwise start right over
 402                  * again. We can still lose time if any single round
 403                  * takes more than two seconds, but it does not really
 404                  * matter as we are just trying to generally pace the
 405                  * filesystem activity.
 406                  */
 407                 if (time_second == starttime)
 408                         tsleep(ctx, 0, "syncer", hz);
 409         }
 410
 411         /*
 412          * Unmount/exit path for per-filesystem syncers; sc_token held
 413          */
 414         ctx->sc_flags |= SC_FLAG_DONE;
 415         sc_flagsp = &ctx->sc_flags;
 416         lwkt_reltoken(&ctx->sc_token);
 417         wakeup(sc_flagsp);
 418
 419         kthread_exit();
 420 }
 421
 422 static void
 423 syncer_thread_start(void) {
 424         syncer_thread(&syncer_ctx0);
 425 }
 426
 427 static struct kproc_desc up_kp = {
 428         "syncer0",
 429         syncer_thread_start,
 430         &updatethread
 431 };
 432 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
 433
 434 /*
 435  * Request the syncer daemon to speed up its work.
 436  * We never push it to speed up more than half of its
 437  * normal turn time, otherwise it could take over the cpu.
 438  */
 439 int
 440 speedup_syncer(void)
 441 {
 442         /*
 443          * Don't bother protecting the test.  unsleep_and_wakeup_thread()
 444          * will only do something real if the thread is in the right state.
 445          */
 446         wakeup(lbolt_syncer);
 447         if (rushjob < syncdelay / 2) {
 448                 atomic_add_int(&rushjob, 1);
 449                 stat_rush_requests += 1;
 450                 return (1);
 451         }
 452         return(0);
 453 }
 454
 455 /*
 456  * Routine to create and manage a filesystem syncer vnode.
 457  */
 458 static int sync_close(struct vop_close_args *);
 459 static int sync_fsync(struct vop_fsync_args *);
 460 static int sync_inactive(struct vop_inactive_args *);
 461 static int sync_reclaim (struct vop_reclaim_args *);
 462 static int sync_print(struct vop_print_args *);
 463
 464 static struct vop_ops sync_vnode_vops = {
 465         .vop_default =  vop_eopnotsupp,
 466         .vop_close =    sync_close,
 467         .vop_fsync =    sync_fsync,
 468         .vop_inactive = sync_inactive,
 469         .vop_reclaim =  sync_reclaim,
 470         .vop_print =    sync_print,
 471 };
 472
 473 static struct vop_ops *sync_vnode_vops_p = &sync_vnode_vops;
 474
 475 VNODEOP_SET(sync_vnode_vops);
 476
 477 /*
 478  * Create a new filesystem syncer vnode for the specified mount point.
 479  * This vnode is placed on the worklist and is responsible for sync'ing
 480  * the filesystem.
 481  *
 482  * NOTE: read-only mounts are also placed on the worklist.  The filesystem
 483  * sync code is also responsible for cleaning up vnodes.
 484  */
 485 int
 486 vfs_allocate_syncvnode(struct mount *mp)
 487 {
 488         struct vnode *vp;
 489         static long start, incr, next;
 490         int error;
 491
 492         /* Allocate a new vnode */
 493         error = getspecialvnode(VT_VFS, mp, &sync_vnode_vops_p, &vp, 0, 0);
 494         if (error) {
 495                 mp->mnt_syncer = NULL;
 496                 return (error);
 497         }
 498         vp->v_type = VNON;
 499         /*
 500          * Place the vnode onto the syncer worklist. We attempt to
 501          * scatter them about on the list so that they will go off
 502          * at evenly distributed times even if all the filesystems
 503          * are mounted at once.
 504          */
 505         next += incr;
 506         if (next == 0 || next > syncer_maxdelay) {
 507                 start /= 2;
 508                 incr /= 2;
 509                 if (start == 0) {
 510                         start = syncer_maxdelay / 2;
 511                         incr = syncer_maxdelay;
 512                 }
 513                 next = start;
 514         }
 515         vn_syncer_add(vp, syncdelay > 0 ? next % syncdelay : 0);
 516
 517         /*
 518          * The mnt_syncer field inherits the vnode reference, which is
 519          * held until later decomissioning.
 520          */
 521         mp->mnt_syncer = vp;
 522         vx_unlock(vp);
 523         return (0);
 524 }
 525
 526 static int
 527 sync_close(struct vop_close_args *ap)
 528 {
 529         return (0);
 530 }
 531
 532 /*
 533  * Do a lazy sync of the filesystem.
 534  *
 535  * sync_fsync { struct vnode *a_vp, int a_waitfor }
 536  */
 537 static int
 538 sync_fsync(struct vop_fsync_args *ap)
 539 {
 540         struct vnode *syncvp = ap->a_vp;
 541         struct mount *mp = syncvp->v_mount;
 542         int asyncflag;
 543
 544         /*
 545          * We only need to do something if this is a lazy evaluation.
 546          */
 547         if ((ap->a_waitfor & MNT_LAZY) == 0)
 548                 return (0);
 549
 550         /*
 551          * Move ourselves to the back of the sync list.
 552          */
 553         vn_syncer_add(syncvp, syncdelay);
 554
 555         /*
 556          * Walk the list of vnodes pushing all that are dirty and
 557          * not already on the sync list, and freeing vnodes which have
 558          * no refs and whos VM objects are empty.  vfs_msync() handles
 559          * the VM issues and must be called whether the mount is readonly
 560          * or not.
 561          */
 562         if (vfs_busy(mp, LK_NOWAIT) != 0)
 563                 return (0);
 564         if (mp->mnt_flag & MNT_RDONLY) {
 565                 vfs_msync(mp, MNT_NOWAIT);
 566         } else {
 567                 asyncflag = mp->mnt_flag & MNT_ASYNC;
 568                 mp->mnt_flag &= ~MNT_ASYNC;     /* ZZZ hack */
 569                 vfs_msync(mp, MNT_NOWAIT);
 570                 VFS_SYNC(mp, MNT_NOWAIT | MNT_LAZY);
 571                 if (asyncflag)
 572                         mp->mnt_flag |= MNT_ASYNC;
 573         }
 574         vfs_unbusy(mp);
 575         return (0);
 576 }
 577
 578 /*
 579  * The syncer vnode is no longer referenced.
 580  *
 581  * sync_inactive { struct vnode *a_vp, struct proc *a_p }
 582  */
 583 static int
 584 sync_inactive(struct vop_inactive_args *ap)
 585 {
 586         vgone_vxlocked(ap->a_vp);
 587         return (0);
 588 }
 589
 590 /*
 591  * The syncer vnode is no longer needed and is being decommissioned.
 592  * This can only occur when the last reference has been released on
 593  * mp->mnt_syncer, so mp->mnt_syncer had better be NULL.
 594  *
 595  * Modifications to the worklist must be protected with a critical
 596  * section.
 597  *
 598  *      sync_reclaim { struct vnode *a_vp }
 599  */
 600 static int
 601 sync_reclaim(struct vop_reclaim_args *ap)
 602 {
 603         struct vnode *vp = ap->a_vp;
 604         struct syncer_ctx *ctx;
 605
 606         ctx = vn_get_syncer(vp);
 607
 608         lwkt_gettoken(&ctx->sc_token);
 609         KKASSERT(vp->v_mount->mnt_syncer != vp);
 610         if (vp->v_flag & VONWORKLST) {
 611                 LIST_REMOVE(vp, v_synclist);
 612                 vclrflags(vp, VONWORKLST);
 613         }
 614         lwkt_reltoken(&ctx->sc_token);
 615
 616         return (0);
 617 }
 618
 619 /*
 620  * Print out a syncer vnode.
 621  *
 622  *      sync_print { struct vnode *a_vp }
 623  */
 624 static int
 625 sync_print(struct vop_print_args *ap)
 626 {
 627         struct vnode *vp = ap->a_vp;
 628
 629         kprintf("syncer vnode");
 630         lockmgr_printinfo(&vp->v_lock);
 631         kprintf("\n");
 632         return (0);
 633 }
 634