/*
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/vfs_lock.c,v 1.24 2006/09/05 00:55:45 dillon Exp $
 */
/*
 * External virtual filesystem routines
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm_object.h>

#include <sys/thread2.h>
static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");

static TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */

int freevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD,
	&freevnodes, 0, "");
static int wantfreevnodes = 25;
SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
	&wantfreevnodes, 0, "");
static int minvnodes;
SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
	&minvnodes, 0, "Minimum number of vnodes");
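
/*
 * Illustrative sketch, not part of this file: the knobs above are visible
 * from userland via sysctl(3).  A hypothetical monitoring snippet (userland
 * code, so it is shown inside this comment) could poll them like this:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int freecnt, wantcnt;
 *		size_t len = sizeof(int);
 *
 *		if (sysctlbyname("debug.freevnodes", &freecnt, &len, NULL, 0))
 *			return (1);
 *		if (sysctlbyname("debug.wantfreevnodes", &wantcnt, &len, NULL, 0))
 *			return (1);
 *		printf("free %d want %d\n", freecnt, wantcnt);
 *		return (0);
 *	}
 */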
/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	minvnodes = desiredvnodes / 4;
	TAILQ_INIT(&vnode_free_list);
}
/*
 * Inline helper functions.  vbusy() and vfree() must be called while in a
 * critical section.
 *
 * Warning: must be callable if the caller holds a read spinlock to something
 * else, meaning we can't use read spinlocks here.
 */
static __inline void
__vbusy(struct vnode *vp)
{
	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	freevnodes--;
	vp->v_flag &= ~(VFREE|VAGE);
}
static __inline void
__vfree(struct vnode *vp)
{
	if (vp->v_flag & (VAGE|VRECLAIMED))
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	freevnodes++;
	vp->v_flag |= VFREE;
}
/*
 * Return 1 if we can immediately place the vnode on the freelist.
 */
static __inline int
vshouldfree(struct vnode *vp, int usecount)
{
	if (vp->v_flag & VFREE)
		return (0);		/* already free */
	if (vp->v_holdcnt != 0 || vp->v_usecount != usecount)
		return (0);		/* other holders */
	if (vp->v_object &&
	    (vp->v_object->ref_count || vp->v_object->resident_page_count)) {
		return (0);		/* VM object still holds state */
	}
	return (1);
}
/*
 * Add another ref to a vnode.  The vnode must already have at least one
 * ref.
 *
 * NOTE: The vnode may continue to reside on the free list.
 */
void
vref(struct vnode *vp)
{
	KKASSERT(vp->v_usecount > 0 && (vp->v_flag & VINACTIVE) == 0);
	atomic_add_int(&vp->v_usecount, 1);
}
/*
 * Add a ref to a vnode which may not have any refs.  This routine is called
 * from the namecache and vx_get().  If requested, the vnode will be
 * reactivated.
 *
 * Removal of the vnode from the free list is optional.  Since most vnodes
 * are temporary in nature we opt not to do it.  This also means we don't have
 * to deal with lock ordering issues between the freelist and vnode
 * spinlocks.
 *
 * We must acquire the vnode's spinlock to interlock against vrele().
 *
 * vget(), cache_vget(), and cache_vref() reactivate vnodes.  vx_get() does
 * not.
 */
void
vref_initial(struct vnode *vp, int reactivate)
{
	spin_lock_wr(&vp->v_spinlock);
	atomic_add_int(&vp->v_usecount, 1);
	if (reactivate)
		vp->v_flag &= ~VINACTIVE;
	spin_unlock_wr(&vp->v_spinlock);
}
/*
 * Release a ref on the vnode.  Since 0->1 transitions can only be made
 * by vref_initial(), 1->0 transitions will be protected by the spinlock.
 *
 * When handling a 1->0 transition the vnode is guaranteed to not be locked
 * and we can set the exclusive lock atomically while interlocked with our
 * spinlock.  A panic will occur if the lock is held.
 */
void
vrele(struct vnode *vp)
{
	spin_lock_wr(&vp->v_spinlock);
	if (vp->v_usecount > 1) {
		atomic_subtract_int(&vp->v_usecount, 1);
		spin_unlock_wr(&vp->v_spinlock);
		return;
	}
	KKASSERT(vp->v_usecount == 1);

	/*
	 * This is roughly equivalent to obtaining an exclusive
	 * lock, but the spinlock is already held (and remains held
	 * on return) and the lock must be obtainable without
	 * blocking, which it is in a 1->0 transition.
	 */
	lockmgr_setexclusive_interlocked(&vp->v_lock);

	/*
	 * VINACTIVE is interlocked by the spinlock, so we have to re-check
	 * the bit if we release and reacquire the spinlock even though
	 * we are holding the exclusive lockmgr lock throughout.
	 *
	 * VOP_INACTIVE can race other VOPs even though we hold an exclusive
	 * lock.  This is ok.  The ref count of 1 must remain intact through
	 * the VOP_INACTIVE call to avoid a recursion.
	 */
	while ((vp->v_flag & VINACTIVE) == 0 && vp->v_usecount == 1) {
		vp->v_flag |= VINACTIVE;
		spin_unlock_wr(&vp->v_spinlock);
		VOP_INACTIVE(vp);
		spin_lock_wr(&vp->v_spinlock);
	}

	/*
	 * NOTE: v_usecount might no longer be 1
	 */
	atomic_subtract_int(&vp->v_usecount, 1);
	if (vshouldfree(vp, 0))
		__vfree(vp);
	lockmgr_clrexclusive_interlocked(&vp->v_lock);
	/* spinlock unlocked */
}
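
/*
 * Illustrative sketch, not from the original file: a typical 0->N->0
 * reference cycle as a caller such as the namecache might perform it.
 * example_ref_cycle() and its choice to reactivate are assumptions made
 * for the example only.
 */
static __unused void
example_ref_cycle(struct vnode *vp)
{
	vref_initial(vp, 1);	/* take a ref, clear VINACTIVE */
	/* ... use the vnode ... */
	vrele(vp);		/* possible 1->0, may deactivate/free */
}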
/*
 * Hold a vnode, preventing it from being recycled (unless it is already
 * undergoing recycling or has already been recycled).
 *
 * Opting not to remove a vnode from the freelist simply means that
 * allocvnode must do it for us if it finds an unsuitable vnode.
 */
void
vhold(struct vnode *vp)
{
	spin_lock_wr(&vp->v_spinlock);
	atomic_add_int(&vp->v_holdcnt, 1);
	spin_unlock_wr(&vp->v_spinlock);
}
/*
 * Like vrele(), we must atomically place the vnode on the free list if
 * it becomes suitable.  vhold()/vdrop() do not mess with VINACTIVE.
 */
void
vdrop(struct vnode *vp)
{
	KKASSERT(vp->v_holdcnt > 0);
	spin_lock_wr(&vp->v_spinlock);
	atomic_subtract_int(&vp->v_holdcnt, 1);
	if (vshouldfree(vp, 0))
		__vfree(vp);
	spin_unlock_wr(&vp->v_spinlock);
}
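
/*
 * Illustrative sketch, not from the original file: vhold() pins a vnode's
 * identity across a blocking operation without taking a usecount ref.
 * example_hold_across_block() and the elided blocking operation are
 * hypothetical.
 */
static __unused void
example_hold_across_block(struct vnode *vp)
{
	vhold(vp);		/* keep allocvnode() from recycling vp */
	/* ... block here; vp cannot be reused out from under us ... */
	vdrop(vp);		/* may place vp back on the freelist */
}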
/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  Only vp->v_lock, the top layer of the VFS, is locked.
 * You must be holding a normal reference in order to be able to safely
 * call vx_lock() and vx_unlock().
 *
 * vx_get() also differs from vget() in that it does not clear the
 * VINACTIVE bit on a vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

void
vx_get(struct vnode *vp)
{
	vref_initial(vp, 0);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	vref_initial(vp, 0);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error)
		vrele(vp);
	return (error);
}

void
vx_put(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}
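
/*
 * Illustrative sketch, not from the original file: using the VX interface
 * to inspect a vnode without reactivating it.  vx_get() refs and
 * exclusively locks vp but leaves VINACTIVE alone; vx_put() reverses both.
 * example_vx_inspect() is a hypothetical caller.
 */
static __unused int
example_vx_inspect(struct vnode *vp)
{
	int reclaimed;

	vx_get(vp);
	reclaimed = (vp->v_flag & VRECLAIMED) != 0;
	vx_put(vp);
	return (reclaimed);
}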
/****************************************************************
 *			VNODE ACQUISITION FUNCTIONS		*
 ****************************************************************
 *
 * vget() and vput() access a vnode for the intent of executing an
 * operation other than a reclamation or deactivation.  vget() will ref
 * and lock the vnode, vput() will unlock and deref the vnode.
 * The VOP_*() locking functions are used.
 *
 * CALLING VGET IS MANDATORY PRIOR TO ANY MODIFYING OPERATION ON A VNODE.
 * This is because vget handles the VINACTIVE interlock and is responsible
 * for clearing the bit.  If the bit is not cleared inode updates may not
 * occur.
 *
 * Special cases: If vget()'s locking operation fails the vrele() call may
 * cause the vnode to be deactivated (VOP_INACTIVE called).  However, this
 * never occurs if the vnode is in a reclaimed state.  Vnodes in reclaimed
 * states always return an error code of ENOENT.
 *
 * Special cases: vput() will unlock and, if it is the last reference,
 * deactivate the vnode.  The deactivation uses a separate non-layered
 * VX lock after the normal unlock.  XXX make it more efficient.
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	vref_initial(vp, 0);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags)) != 0) {
			vrele(vp);
		} else if (vp->v_flag & VRECLAIMED) {
			vput(vp);
			error = ENOENT;
		} else {
			vp->v_flag &= ~VINACTIVE;	/* XXX not MP safe */
			error = 0;
		}
	} else {
		panic("vget() called with no lock specified!");
		error = ENOENT;	/* not reached, compiler opt */
	}
	return (error);
}
void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}
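
/*
 * Illustrative sketch, not from the original file: the mandatory
 * vget()/vput() bracket around a modifying operation.  A reclaimed vnode
 * causes vget() to fail with ENOENT.  example_modify() and the elided VOP
 * calls are hypothetical.
 */
static __unused int
example_modify(struct vnode *vp)
{
	int error;

	if ((error = vget(vp, LK_EXCLUSIVE)) != 0)
		return (error);		/* e.g. ENOENT if reclaimed */
	/* ... issue modifying VOP operations here ... */
	vput(vp);			/* unlock and drop the ref */
	return (0);
}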
/*
 * Set and clear vnode flags.  The vnode spinlock, which interlocks v_flag
 * updates elsewhere in this file, is assumed to be the interlock here too.
 */
void
vsetflags(struct vnode *vp, int flags)
{
	spin_lock_wr(&vp->v_spinlock);
	vp->v_flag |= flags;
	spin_unlock_wr(&vp->v_spinlock);
}

void
vclrflags(struct vnode *vp, int flags)
{
	spin_lock_wr(&vp->v_spinlock);
	vp->v_flag &= ~flags;
	spin_unlock_wr(&vp->v_spinlock);
}
/*
 * Obtain a new vnode from the freelist, allocating more if necessary.
 * The returned vnode is VX locked & refd.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;
	int count;

	/*
	 * Try to reuse vnodes if we hit the max.  This situation only
	 * occurs in certain large-memory (2G+) situations.  We cannot
	 * attempt to directly reclaim vnodes due to nasty recursion
	 * problems.
	 */
	while (numvnodes - freevnodes > desiredvnodes)
		vnlru_proc_wait();	/* let the vnlru thread catch up */
	/*
	 * Attempt to reuse a vnode already on the free list, allocating
	 * a new vnode if we can't find one or if we have not reached a
	 * good minimum for good LRU performance.
	 */
	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
		for (count = 0; count < freevnodes; count++) {
			/*
			 * Pull the next vnode off the free list and do some
			 * sanity checks.  Note that regardless of how we
			 * block, if freevnodes is non-zero there had better
			 * be something on the list.
			 */
			vp = TAILQ_FIRST(&vnode_free_list);
			if (vp == NULL)
				panic("getnewvnode: free vnode isn't");
			KKASSERT(vp->v_flag & VFREE);
			/*
			 * Handle the case where the vnode was pulled off
			 * the free list while we were waiting for the
			 * spinlock.
			 */
			spin_lock_wr(&vp->v_spinlock);
			if ((vp->v_flag & VFREE) == 0) {
				spin_unlock_wr(&vp->v_spinlock);
				continue;
			}

			/*
			 * Lazy removal of the vnode from the freelist if
			 * the vnode has references.
			 */
			if (vp->v_usecount || vp->v_holdcnt) {
				__vbusy(vp);
				spin_unlock_wr(&vp->v_spinlock);
				continue;
			}
			/*
			 * vx_get() equivalent, but atomic with the
			 * spinlock held.  Since 0->1 transitions and the
			 * lockmgr are protected by the spinlock we must
			 * be able to get an exclusive lock without blocking
			 * here.
			 *
			 * Also take the vnode off of the free list and
			 * assert that it is inactive.
			 */
			atomic_add_int(&vp->v_usecount, 1);
			lockmgr_setexclusive_interlocked(&vp->v_lock);
			__vbusy(vp);
			KKASSERT(vp->v_flag & VINACTIVE);

			/*
			 * Reclaim the vnode.  VRECLAIMED will be set
			 * atomically before the spinlock is released
			 * by vgone_interlocked().
			 */
			if ((vp->v_flag & VRECLAIMED) == 0) {
				vgone_interlocked(vp);
				/* spinlock unlocked */
			} else {
				spin_unlock_wr(&vp->v_spinlock);
			}
			/*
			 * We reclaimed the vnode but other claimants may
			 * have referenced it while we were blocked.  We
			 * cannot reuse a vnode until all refs are gone and
			 * the vnode has completed reclamation.
			 */
			KKASSERT(vp->v_flag & VRECLAIMED);
			if (vp->v_usecount != 1 || vp->v_holdcnt) {
				vx_put(vp);
				continue;
			}

			/*
			 * There are no more structural references to the
			 * vnode, referenced or otherwise.  We have a vnode!
			 *
			 * The vnode may have been placed on the free list
			 * while we were blocked.
			 */
			if (vp->v_flag & VFREE)
				__vbusy(vp);
			KKASSERT(vp->v_flag & VINACTIVE);
			break;
		}
		if (count == freevnodes)
			vp = NULL;	/* scan found no suitable vnode */
	} else {
		vp = NULL;
	}
	/*
	 * If we have a vp it will be refd and VX locked.
	 */
	if (vp) {
		if (vp->v_usecount != 1 || vp->v_holdcnt)
			panic("cleaned vnode isn't");
		if (vp->v_track_read.bk_active + vp->v_track_write.bk_active)
			panic("Clean vnode has pending I/O's");
		KKASSERT(vp->v_mount == NULL);
		vp->v_writecount = 0;	/* XXX */
		lockreinit(&vp->v_lock, "vnode", lktimeout, lkflags);
		KKASSERT(TAILQ_FIRST(&vp->v_namecache) == NULL);
	} else {
		/*
		 * A brand-new vnode (we could use malloc() here I think) XXX
		 */
		vp = kmalloc(sizeof(struct vnode), M_VNODE, M_WAITOK|M_ZERO);
		lwkt_token_init(&vp->v_pollinfo.vpi_token);
		lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
		ccms_dataspace_init(&vp->v_ccms);
		TAILQ_INIT(&vp->v_namecache);

		/*
		 * short cut around vfreeing it and looping, just set it up
		 * as if we had pulled a reclaimed vnode off the freelist
		 * and reinitialized it.
		 */
		vp->v_usecount = 1;
		lockmgr(&vp->v_lock, LK_EXCLUSIVE);
		numvnodes++;
	}
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	vp->v_filesize = NOOFFSET;
	KKASSERT(vp->v_mount == NULL);
	return (vp);
}
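
/*
 * Illustrative sketch, not from the original file: how a filesystem might
 * obtain a fresh vnode and then release it.  example_alloc_release() and
 * the zero lktimeout/lkflags parameters are assumptions for the example.
 */
static __unused void
example_alloc_release(void)
{
	struct vnode *vp;

	vp = allocvnode(0, 0);	/* returned refd + VX locked */
	/* ... set up v_type, v_ops, v_data, attach to a mount ... */
	vx_put(vp);		/* unlock and drop the ref */
}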