sys/kern/vfs_sync.c

   1 /*
   2  * Copyright (c) 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. All advertising materials mentioning features or use of this software
  19  *    must display the following acknowledgement:
  20  *      This product includes software developed by the University of
  21  *      California, Berkeley and its contributors.
  22  * 4. Neither the name of the University nor the names of its contributors
  23  *    may be used to endorse or promote products derived from this software
  24  *    without specific prior written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  *
  38  *      @(#)vfs_subr.c  8.31 (Berkeley) 5/26/95
  39  * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
  40  */
  41
  42 /*
  43  * External virtual filesystem routines
  44  */
  45
  46 #include <sys/param.h>
  47 #include <sys/systm.h>
  48 #include <sys/buf.h>
  49 #include <sys/conf.h>
  50 #include <sys/dirent.h>
  51 #include <sys/domain.h>
  52 #include <sys/eventhandler.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/kernel.h>
  55 #include <sys/kthread.h>
  56 #include <sys/malloc.h>
  57 #include <sys/mbuf.h>
  58 #include <sys/mount.h>
  59 #include <sys/proc.h>
  60 #include <sys/namei.h>
  61 #include <sys/reboot.h>
  62 #include <sys/socket.h>
  63 #include <sys/stat.h>
  64 #include <sys/sysctl.h>
  65 #include <sys/syslog.h>
  66 #include <sys/vmmeter.h>
  67 #include <sys/vnode.h>
  68
  69 #include <machine/limits.h>
  70
  71 #include <vm/vm.h>
  72 #include <vm/vm_object.h>
  73 #include <vm/vm_extern.h>
  74 #include <vm/vm_kern.h>
  75 #include <vm/pmap.h>
  76 #include <vm/vm_map.h>
  77 #include <vm/vm_page.h>
  78 #include <vm/vm_pager.h>
  79 #include <vm/vnode_pager.h>
  80
  81 #include <sys/buf2.h>
  82 #include <sys/thread2.h>
  83
  84 /*
  85  * The workitem queue.
  86  */
  87 #define SYNCER_MAXDELAY         32
  88 static int syncer_maxdelay = SYNCER_MAXDELAY;   /* maximum delay time */
  89 time_t syncdelay = 30;          /* max time to delay syncing data */
  90 SYSCTL_INT(_kern, OID_AUTO, syncdelay, CTLFLAG_RW,
  91                 &syncdelay, 0, "VFS data synchronization delay");
  92 time_t filedelay = 30;          /* time to delay syncing files */
  93 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW,
  94                 &filedelay, 0, "File synchronization delay");
  95 time_t dirdelay = 29;           /* time to delay syncing directories */
  96 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW,
  97                 &dirdelay, 0, "Directory synchronization delay");
  98 time_t metadelay = 28;          /* time to delay syncing metadata */
  99 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW,
 100                 &metadelay, 0, "VFS metadata synchronization delay");
 101 static int rushjob;                     /* number of slots to run ASAP */
 102 static int stat_rush_requests;  /* number of times I/O speeded up */
 103 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW,
 104                 &stat_rush_requests, 0, "");
 105
 106 LIST_HEAD(synclist, vnode);
 107
 108 #define SC_FLAG_EXIT            (0x1)           /* request syncer exit */
 109 #define SC_FLAG_DONE            (0x2)           /* syncer confirm exit */
 110 #define         SC_FLAG_BIOOPS_ALL      (0x4)           /* do bufops_sync(NULL) */
 111
 112 struct syncer_ctx {
 113         struct mount            *sc_mp;
 114         struct lwkt_token       sc_token;
 115         struct thread           *sc_thread;
 116         int                     sc_flags;
 117
 118         struct synclist         *syncer_workitem_pending;
 119         long                    syncer_mask;
 120         int                     syncer_delayno;
 121 };
 122
 123 static struct syncer_ctx syncer_ctx0;
 124
 125 static void syncer_thread(void *);
 126
 127 static void
 128 syncer_ctx_init(struct syncer_ctx *ctx, struct mount *mp)
 129 {
 130         ctx->sc_mp = mp;
 131         lwkt_token_init(&ctx->sc_token, "syncer");
 132         ctx->sc_flags = 0;
 133
 134         ctx->syncer_workitem_pending = hashinit(syncer_maxdelay, M_DEVBUF,
 135                                                 &ctx->syncer_mask);
 136         ctx->syncer_delayno = 0;
 137 }
 138
 139 /*
 140  * Called from vfsinit()
 141  */
 142 void
 143 vfs_sync_init(void)
 144 {
 145         syncer_ctx_init(&syncer_ctx0, NULL);
 146         syncer_maxdelay = syncer_ctx0.syncer_mask + 1;
 147         syncer_ctx0.sc_flags |= SC_FLAG_BIOOPS_ALL;
 148
 149         /* Support schedcpu wakeup of syncer0 */
 150         lbolt_syncer = &syncer_ctx0;
 151 }
 152
 153 static struct syncer_ctx *
 154 vn_get_syncer(struct vnode *vp) {
 155         struct mount *mp;
 156         struct syncer_ctx *ctx;
 157
 158         ctx = NULL;
 159         mp = vp->v_mount;
 160         if (mp)
 161                 ctx = mp->mnt_syncer_ctx;
 162         if (ctx == NULL)
 163                 ctx = &syncer_ctx0;
 164
 165         return (ctx);
 166 }
 167
 168 /*
 169  * The workitem queue.
 170  *
 171  * It is useful to delay writes of file data and filesystem metadata
 172  * for tens of seconds so that quickly created and deleted files need
 173  * not waste disk bandwidth being created and removed. To realize this,
 174  * we append vnodes to a "workitem" queue. When running with a soft
 175  * updates implementation, most pending metadata dependencies should
  176  * not wait for more than a few seconds. Thus, metadata on mounted block
  177  * devices is delayed only about half the time that file data is delayed.
 178  * Similarly, directory updates are more critical, so are only delayed
 179  * about a third the time that file data is delayed. Thus, there are
 180  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 181  * one each second (driven off the filesystem syncer process). The
 182  * syncer_delayno variable indicates the next queue that is to be processed.
 183  * Items that need to be processed soon are placed in this queue:
 184  *
 185  *      syncer_workitem_pending[syncer_delayno]
 186  *
 187  * A delay of fifteen seconds is done by placing the request fifteen
 188  * entries later in the queue:
 189  *
 190  *      syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 191  *
 192  */
 193
 194 /*
 195  * Add an item to the syncer work queue.
 196  *
 197  * WARNING: Cannot get vp->v_token here if not already held, we must
 198  *          depend on the syncer_token (which might already be held by
 199  *          the caller) to protect v_synclist and VONWORKLST.
 200  *
 201  * MPSAFE
 202  */
 203 void
 204 vn_syncer_add(struct vnode *vp, int delay)
 205 {
 206         struct syncer_ctx *ctx;
 207         int slot;
 208
 209         ctx = vn_get_syncer(vp);
 210
 211         lwkt_gettoken(&ctx->sc_token);
 212
 213         if (vp->v_flag & VONWORKLST)
 214                 LIST_REMOVE(vp, v_synclist);
 215         if (delay > syncer_maxdelay - 2)
 216                 delay = syncer_maxdelay - 2;
 217         slot = (ctx->syncer_delayno + delay) & ctx->syncer_mask;
 218
 219         LIST_INSERT_HEAD(&ctx->syncer_workitem_pending[slot], vp, v_synclist);
 220         vsetflags(vp, VONWORKLST);
 221
 222         lwkt_reltoken(&ctx->sc_token);
 223 }
 224
 225 /*
 226  * Removes the vnode from the syncer list.  Since we might block while
 227  * acquiring the syncer_token we have to recheck conditions.
 228  *
 229  * vp->v_token held on call
 230  */
 231 void
 232 vn_syncer_remove(struct vnode *vp)
 233 {
 234         struct syncer_ctx *ctx;
 235
 236         ctx = vn_get_syncer(vp);
 237
 238         lwkt_gettoken(&ctx->sc_token);
 239
 240         if ((vp->v_flag & VONWORKLST) && RB_EMPTY(&vp->v_rbdirty_tree)) {
 241                 vclrflags(vp, VONWORKLST);
 242                 LIST_REMOVE(vp, v_synclist);
 243         }
 244
 245         lwkt_reltoken(&ctx->sc_token);
 246 }
 247
 248 /*
 249  * Create per-filesystem syncer process
 250  */
 251 void
 252 vn_syncer_thr_create(struct mount *mp)
 253 {
 254         struct syncer_ctx *ctx;
 255         static int syncalloc = 0;
 256         int rc;
 257
 258         ctx = kmalloc(sizeof(struct syncer_ctx), M_TEMP, M_WAITOK);
 259
 260         syncer_ctx_init(ctx, mp);
 261         mp->mnt_syncer_ctx = ctx;
 262
 263         rc = kthread_create(syncer_thread, ctx, &ctx->sc_thread,
 264                             "syncer%d", ++syncalloc);
 265 }
 266
 267 void *
 268 vn_syncer_thr_getctx(struct mount *mp)
 269 {
 270         return (mp->mnt_syncer_ctx);
 271 }
 272
 273 /*
 274  * Stop per-filesystem syncer process
 275  */
 276 void
 277 vn_syncer_thr_stop(void *ctxp)
 278 {
 279         struct syncer_ctx *ctx;
 280
 281         ctx = ctxp;
 282
 283         lwkt_gettoken(&ctx->sc_token);
 284
 285         /* Signal the syncer process to exit */
 286         ctx->sc_flags |= SC_FLAG_EXIT;
 287         wakeup(ctx);
 288
 289         /* Wait till syncer process exits */
 290         while ((ctx->sc_flags & SC_FLAG_DONE) == 0)
 291                 tsleep(&ctx->sc_flags, 0, "syncexit", hz);
 292
 293         lwkt_reltoken(&ctx->sc_token);
 294
 295         kfree(ctx->syncer_workitem_pending, M_DEVBUF);
 296         kfree(ctx, M_TEMP);
 297 }
 298
 299 struct  thread *updatethread;
 300
 301 /*
 302  * System filesystem synchronizer daemon.
 303  */
 304 static void
 305 syncer_thread(void *_ctx)
 306 {
 307         struct thread *td = curthread;
 308         struct syncer_ctx *ctx = _ctx;
 309         struct synclist *slp;
 310         struct vnode *vp;
 311         long starttime;
 312         int *sc_flagsp;
 313         int sc_flags;
 314         int vnodes_synced = 0;
 315
 316         /*
 317          * syncer0 runs till system shutdown; per-filesystem syncers are
 318          * terminated on filesystem unmount
 319          */
 320         if (ctx == &syncer_ctx0)
 321                 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td,
 322                                       SHUTDOWN_PRI_LAST);
 323         for (;;) {
 324                 kproc_suspend_loop();
 325
 326                 starttime = time_second;
 327                 lwkt_gettoken(&ctx->sc_token);
 328
 329                 /*
 330                  * Push files whose dirty time has expired.  Be careful
 331                  * of interrupt race on slp queue.
 332                  */
 333                 slp = &ctx->syncer_workitem_pending[ctx->syncer_delayno];
 334                 ctx->syncer_delayno += 1;
 335                 if (ctx->syncer_delayno == syncer_maxdelay)
 336                         ctx->syncer_delayno = 0;
 337
 338                 while ((vp = LIST_FIRST(slp)) != NULL) {
 339                         if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
 340                                 VOP_FSYNC(vp, MNT_LAZY, 0);
 341                                 vput(vp);
 342                                 vnodes_synced++;
 343                         }
 344
 345                         /*
 346                          * vp is stale but can still be used if we can
 347                          * verify that it remains at the head of the list.
 348                          * Be careful not to try to get vp->v_token as
 349                          * vp can become stale if this blocks.
 350                          *
 351                          * If the vp is still at the head of the list were
 352                          * unable to completely flush it and move it to
 353                          * a later slot to give other vnodes a fair shot.
 354                          *
 355                          * Note that v_tag VT_VFS vnodes can remain on the
 356                          * worklist with no dirty blocks, but sync_fsync()
 357                          * moves it to a later slot so we will never see it
 358                          * here.
 359                          *
 360                          * It is possible to race a vnode with no dirty
 361                          * buffers being removed from the list.  If this
 362                          * occurs we will move the vnode in the synclist
 363                          * and then the other thread will remove it.  Do
 364                          * not try to remove it here.
 365                          */
 366                         if (LIST_FIRST(slp) == vp)
 367                                 vn_syncer_add(vp, syncdelay);
 368                 }
 369
 370                 sc_flags = ctx->sc_flags;
 371
 372                 /* Exit on unmount */
 373                 if (sc_flags & SC_FLAG_EXIT)
 374                         break;
 375
 376                 lwkt_reltoken(&ctx->sc_token);
 377
 378                 /*
 379                  * Do sync processing for each mount.
 380                  */
 381                 if (ctx->sc_mp || sc_flags & SC_FLAG_BIOOPS_ALL)
 382                         bio_ops_sync(ctx->sc_mp);
 383
 384                 /*
 385                  * The variable rushjob allows the kernel to speed up the
 386                  * processing of the filesystem syncer process. A rushjob
 387                  * value of N tells the filesystem syncer to process the next
 388                  * N seconds worth of work on its queue ASAP. Currently rushjob
 389                  * is used by the soft update code to speed up the filesystem
 390                  * syncer process when the incore state is getting so far
 391                  * ahead of the disk that the kernel memory pool is being
 392                  * threatened with exhaustion.
 393                  */
 394                 if (ctx == &syncer_ctx0 && rushjob > 0) {
 395                         atomic_subtract_int(&rushjob, 1);
 396                         continue;
 397                 }
 398                 /*
 399                  * If it has taken us less than a second to process the
 400                  * current work, then wait. Otherwise start right over
 401                  * again. We can still lose time if any single round
 402                  * takes more than two seconds, but it does not really
 403                  * matter as we are just trying to generally pace the
 404                  * filesystem activity.
 405                  */
 406                 if (time_second == starttime)
 407                         tsleep(ctx, 0, "syncer", hz);
 408         }
 409
 410         /*
 411          * Unmount/exit path for per-filesystem syncers; sc_token held
 412          */
 413         ctx->sc_flags |= SC_FLAG_DONE;
 414         sc_flagsp = &ctx->sc_flags;
 415         lwkt_reltoken(&ctx->sc_token);
 416         wakeup(sc_flagsp);
 417
 418         kthread_exit();
 419 }
 420
 421 static void
 422 syncer_thread_start(void) {
 423         syncer_thread(&syncer_ctx0);
 424 }
 425
 426 static struct kproc_desc up_kp = {
 427         "syncer0",
 428         syncer_thread_start,
 429         &updatethread
 430 };
 431 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
 432
 433 /*
 434  * Request the syncer daemon to speed up its work.
 435  * We never push it to speed up more than half of its
 436  * normal turn time, otherwise it could take over the cpu.
 437  */
 438 int
 439 speedup_syncer(void)
 440 {
 441         /*
 442          * Don't bother protecting the test.  unsleep_and_wakeup_thread()
 443          * will only do something real if the thread is in the right state.
 444          */
 445         wakeup(lbolt_syncer);
 446         if (rushjob < syncdelay / 2) {
 447                 atomic_add_int(&rushjob, 1);
 448                 stat_rush_requests += 1;
 449                 return (1);
 450         }
 451         return(0);
 452 }
 453
 454 /*
 455  * Routine to create and manage a filesystem syncer vnode.
 456  */
 457 static int sync_close(struct vop_close_args *);
 458 static int sync_fsync(struct vop_fsync_args *);
 459 static int sync_inactive(struct vop_inactive_args *);
 460 static int sync_reclaim (struct vop_reclaim_args *);
 461 static int sync_print(struct vop_print_args *);
 462
 463 static struct vop_ops sync_vnode_vops = {
 464         .vop_default =  vop_eopnotsupp,
 465         .vop_close =    sync_close,
 466         .vop_fsync =    sync_fsync,
 467         .vop_inactive = sync_inactive,
 468         .vop_reclaim =  sync_reclaim,
 469         .vop_print =    sync_print,
 470 };
 471
 472 static struct vop_ops *sync_vnode_vops_p = &sync_vnode_vops;
 473
 474 VNODEOP_SET(sync_vnode_vops);
 475
 476 /*
 477  * Create a new filesystem syncer vnode for the specified mount point.
 478  * This vnode is placed on the worklist and is responsible for sync'ing
 479  * the filesystem.
 480  *
 481  * NOTE: read-only mounts are also placed on the worklist.  The filesystem
 482  * sync code is also responsible for cleaning up vnodes.
 483  */
 484 int
 485 vfs_allocate_syncvnode(struct mount *mp)
 486 {
 487         struct vnode *vp;
 488         static long start, incr, next;
 489         int error;
 490
 491         /* Allocate a new vnode */
 492         error = getspecialvnode(VT_VFS, mp, &sync_vnode_vops_p, &vp, 0, 0);
 493         if (error) {
 494                 mp->mnt_syncer = NULL;
 495                 return (error);
 496         }
 497         vp->v_type = VNON;
 498         /*
 499          * Place the vnode onto the syncer worklist. We attempt to
 500          * scatter them about on the list so that they will go off
 501          * at evenly distributed times even if all the filesystems
 502          * are mounted at once.
 503          */
 504         next += incr;
 505         if (next == 0 || next > syncer_maxdelay) {
 506                 start /= 2;
 507                 incr /= 2;
 508                 if (start == 0) {
 509                         start = syncer_maxdelay / 2;
 510                         incr = syncer_maxdelay;
 511                 }
 512                 next = start;
 513         }
 514         vn_syncer_add(vp, syncdelay > 0 ? next % syncdelay : 0);
 515
 516         /*
 517          * The mnt_syncer field inherits the vnode reference, which is
 518          * held until later decomissioning.
 519          */
 520         mp->mnt_syncer = vp;
 521         vx_unlock(vp);
 522         return (0);
 523 }
 524
 525 static int
 526 sync_close(struct vop_close_args *ap)
 527 {
 528         return (0);
 529 }
 530
 531 /*
 532  * Do a lazy sync of the filesystem.
 533  *
 534  * sync_fsync { struct vnode *a_vp, int a_waitfor }
 535  */
 536 static int
 537 sync_fsync(struct vop_fsync_args *ap)
 538 {
 539         struct vnode *syncvp = ap->a_vp;
 540         struct mount *mp = syncvp->v_mount;
 541         int asyncflag;
 542
 543         /*
 544          * We only need to do something if this is a lazy evaluation.
 545          */
 546         if ((ap->a_waitfor & MNT_LAZY) == 0)
 547                 return (0);
 548
 549         /*
 550          * Move ourselves to the back of the sync list.
 551          */
 552         vn_syncer_add(syncvp, syncdelay);
 553
 554         /*
 555          * Walk the list of vnodes pushing all that are dirty and
 556          * not already on the sync list, and freeing vnodes which have
 557          * no refs and whos VM objects are empty.  vfs_msync() handles
 558          * the VM issues and must be called whether the mount is readonly
 559          * or not.
 560          */
 561         if (vfs_busy(mp, LK_NOWAIT) != 0)
 562                 return (0);
 563         if (mp->mnt_flag & MNT_RDONLY) {
 564                 vfs_msync(mp, MNT_NOWAIT);
 565         } else {
 566                 asyncflag = mp->mnt_flag & MNT_ASYNC;
 567                 mp->mnt_flag &= ~MNT_ASYNC;     /* ZZZ hack */
 568                 vfs_msync(mp, MNT_NOWAIT);
 569                 VFS_SYNC(mp, MNT_NOWAIT | MNT_LAZY);
 570                 if (asyncflag)
 571                         mp->mnt_flag |= MNT_ASYNC;
 572         }
 573         vfs_unbusy(mp);
 574         return (0);
 575 }
 576
 577 /*
 578  * The syncer vnode is no longer referenced.
 579  *
 580  * sync_inactive { struct vnode *a_vp, struct proc *a_p }
 581  */
 582 static int
 583 sync_inactive(struct vop_inactive_args *ap)
 584 {
 585         vgone_vxlocked(ap->a_vp);
 586         return (0);
 587 }
 588
 589 /*
 590  * The syncer vnode is no longer needed and is being decommissioned.
 591  * This can only occur when the last reference has been released on
 592  * mp->mnt_syncer, so mp->mnt_syncer had better be NULL.
 593  *
 594  * Modifications to the worklist must be protected with a critical
 595  * section.
 596  *
 597  *      sync_reclaim { struct vnode *a_vp }
 598  */
 599 static int
 600 sync_reclaim(struct vop_reclaim_args *ap)
 601 {
 602         struct vnode *vp = ap->a_vp;
 603         struct syncer_ctx *ctx;
 604
 605         ctx = vn_get_syncer(vp);
 606
 607         lwkt_gettoken(&ctx->sc_token);
 608         KKASSERT(vp->v_mount->mnt_syncer != vp);
 609         if (vp->v_flag & VONWORKLST) {
 610                 LIST_REMOVE(vp, v_synclist);
 611                 vclrflags(vp, VONWORKLST);
 612         }
 613         lwkt_reltoken(&ctx->sc_token);
 614
 615         return (0);
 616 }
 617
 618 /*
 619  * Print out a syncer vnode.
 620  *
 621  *      sync_print { struct vnode *a_vp }
 622  */
 623 static int
 624 sync_print(struct vop_print_args *ap)
 625 {
 626         struct vnode *vp = ap->a_vp;
 627
 628         kprintf("syncer vnode");
 629         lockmgr_printinfo(&vp->v_lock);
 630         kprintf("\n");
 631         return (0);
 632 }
 633