sys/vfs/ufs/ffs_softdep.c

   1 /*
   2  * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
   3  *
   4  * The soft updates code is derived from the appendix of a University
   5  * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
   6  * "Soft Updates: A Solution to the Metadata Update Problem in File
   7  * Systems", CSE-TR-254-95, August 1995).
   8  *
   9  * Further information about soft updates can be obtained from:
  10  *
  11  *      Marshall Kirk McKusick          http://www.mckusick.com/softdep/
  12  *      1614 Oxford Street              mckusick@mckusick.com
  13  *      Berkeley, CA 94709-1608         +1-510-843-9542
  14  *      USA
  15  *
  16  * Redistribution and use in source and binary forms, with or without
  17  * modification, are permitted provided that the following conditions
  18  * are met:
  19  *
  20  * 1. Redistributions of source code must retain the above copyright
  21  *    notice, this list of conditions and the following disclaimer.
  22  * 2. Redistributions in binary form must reproduce the above copyright
  23  *    notice, this list of conditions and the following disclaimer in the
  24  *    documentation and/or other materials provided with the distribution.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
  27  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  28  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  29  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
  30  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  *
  38  *      from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
  39  * $FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.57.2.11 2002/02/05 18:46:53 dillon Exp $
  40  * $DragonFly: src/sys/vfs/ufs/ffs_softdep.c,v 1.4 2003/06/25 03:56:11 dillon Exp $
  41  */
  42
  43 /*
  44  * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
  45  */
  46 #ifndef DIAGNOSTIC
  47 #define DIAGNOSTIC
  48 #endif
  49 #ifndef DEBUG
  50 #define DEBUG
  51 #endif
  52
  53 #include <sys/param.h>
  54 #include <sys/kernel.h>
  55 #include <sys/systm.h>
  56 #include <sys/buf.h>
  57 #include <sys/malloc.h>
  58 #include <sys/mount.h>
  59 #include <sys/proc.h>
  60 #include <sys/syslog.h>
  61 #include <sys/vnode.h>
  62 #include <sys/conf.h>
  63 #include <sys/buf2.h>
  64 #include <ufs/ufs/dir.h>
  65 #include <ufs/ufs/quota.h>
  66 #include <ufs/ufs/inode.h>
  67 #include <ufs/ufs/ufsmount.h>
  68 #include <ufs/ffs/fs.h>
  69 #include <ufs/ffs/softdep.h>
  70 #include <ufs/ffs/ffs_extern.h>
  71 #include <ufs/ufs/ufs_extern.h>
  72
  73 /*
  74  * These definitions need to be adapted to the system to which
  75  * this file is being ported.
  76  */
  77 /*
  78  * malloc types defined for the softdep system.
  79  */
  80 MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
  81 MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
  82 MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
  83 MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
  84 MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
  85 MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
  86 MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
  87 MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
  88 MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
  89 MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
  90 MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
  91 MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
  92 MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
  93
  94 #define M_SOFTDEP_FLAGS         (M_WAITOK | M_USE_RESERVE)
  95
  96 #define D_PAGEDEP       0
  97 #define D_INODEDEP      1
  98 #define D_NEWBLK        2
  99 #define D_BMSAFEMAP     3
 100 #define D_ALLOCDIRECT   4
 101 #define D_INDIRDEP      5
 102 #define D_ALLOCINDIR    6
 103 #define D_FREEFRAG      7
 104 #define D_FREEBLKS      8
 105 #define D_FREEFILE      9
 106 #define D_DIRADD        10
 107 #define D_MKDIR         11
 108 #define D_DIRREM        12
 109 #define D_LAST          D_DIRREM
 110
 111 /*
 112  * translate from workitem type to memory type
 113  * MUST match the defines above, such that memtype[D_XXX] == M_XXX
 114  */
 115 static struct malloc_type *memtype[] = {
 116         M_PAGEDEP,
 117         M_INODEDEP,
 118         M_NEWBLK,
 119         M_BMSAFEMAP,
 120         M_ALLOCDIRECT,
 121         M_INDIRDEP,
 122         M_ALLOCINDIR,
 123         M_FREEFRAG,
 124         M_FREEBLKS,
 125         M_FREEFILE,
 126         M_DIRADD,
 127         M_MKDIR,
 128         M_DIRREM
 129 };
 130
 131 #define DtoM(type) (memtype[type])
 132
 133 /*
 134  * Names of malloc types.
 135  */
 136 #define TYPENAME(type)  \
 137         ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???")
 138 /*
 139  * End system adaptation definitions.
 140  */
 141
 142 /*
 143  * Internal function prototypes.
 144  */
 145 static  void softdep_error __P((char *, int));
 146 static  void drain_output __P((struct vnode *, int));
 147 static  int getdirtybuf __P((struct buf **, int));
 148 static  void clear_remove __P((struct thread *));
 149 static  void clear_inodedeps __P((struct thread *));
 150 static  int flush_pagedep_deps __P((struct vnode *, struct mount *,
 151             struct diraddhd *));
 152 static  int flush_inodedep_deps __P((struct fs *, ino_t));
 153 static  int handle_written_filepage __P((struct pagedep *, struct buf *));
 154 static  void diradd_inode_written __P((struct diradd *, struct inodedep *));
 155 static  int handle_written_inodeblock __P((struct inodedep *, struct buf *));
 156 static  void handle_allocdirect_partdone __P((struct allocdirect *));
 157 static  void handle_allocindir_partdone __P((struct allocindir *));
 158 static  void initiate_write_filepage __P((struct pagedep *, struct buf *));
 159 static  void handle_written_mkdir __P((struct mkdir *, int));
 160 static  void initiate_write_inodeblock __P((struct inodedep *, struct buf *));
 161 static  void handle_workitem_freefile __P((struct freefile *));
 162 static  void handle_workitem_remove __P((struct dirrem *));
 163 static  struct dirrem *newdirrem __P((struct buf *, struct inode *,
 164             struct inode *, int, struct dirrem **));
 165 static  void free_diradd __P((struct diradd *));
 166 static  void free_allocindir __P((struct allocindir *, struct inodedep *));
 167 static  int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t,
 168             long *));
 169 static  void deallocate_dependencies __P((struct buf *, struct inodedep *));
 170 static  void free_allocdirect __P((struct allocdirectlst *,
 171             struct allocdirect *, int));
 172 static  int check_inode_unwritten __P((struct inodedep *));
 173 static  int free_inodedep __P((struct inodedep *));
 174 static  void handle_workitem_freeblocks __P((struct freeblks *));
 175 static  void merge_inode_lists __P((struct inodedep *));
 176 static  void setup_allocindir_phase2 __P((struct buf *, struct inode *,
 177             struct allocindir *));
 178 static  struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t,
 179             ufs_daddr_t));
 180 static  void handle_workitem_freefrag __P((struct freefrag *));
 181 static  struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long));
 182 static  void allocdirect_merge __P((struct allocdirectlst *,
 183             struct allocdirect *, struct allocdirect *));
 184 static  struct bmsafemap *bmsafemap_lookup __P((struct buf *));
 185 static  int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
 186             struct newblk **));
 187 static  int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
 188 static  int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
 189             struct pagedep **));
 190 static  void pause_timer __P((void *));
 191 static  int request_cleanup __P((int, int));
 192 static  int process_worklist_item __P((struct mount *, int));
 193 static  void add_to_worklist __P((struct worklist *));
 194
 195 /*
 196  * Exported softdep operations.
 197  */
 198 static  void softdep_disk_io_initiation __P((struct buf *));
 199 static  void softdep_disk_write_complete __P((struct buf *));
 200 static  void softdep_deallocate_dependencies __P((struct buf *));
 201 static  int softdep_fsync __P((struct vnode *));
 202 static  int softdep_process_worklist __P((struct mount *));
 203 static  void softdep_move_dependencies __P((struct buf *, struct buf *));
 204 static  int softdep_count_dependencies __P((struct buf *bp, int));
 205
 206 struct bio_ops bioops = {
 207         softdep_disk_io_initiation,             /* io_start */
 208         softdep_disk_write_complete,            /* io_complete */
 209         softdep_deallocate_dependencies,        /* io_deallocate */
 210         softdep_fsync,                          /* io_fsync */
 211         softdep_process_worklist,               /* io_sync */
 212         softdep_move_dependencies,              /* io_movedeps */
 213         softdep_count_dependencies,             /* io_countdeps */
 214 };
 215
 216 /*
 217  * Locking primitives.
 218  *
 219  * For a uniprocessor, all we need to do is protect against disk
 220  * interrupts. For a multiprocessor, this lock would have to be
 221  * a mutex. A single mutex is used throughout this file, though
 222  * finer grain locking could be used if contention warranted it.
 223  *
 224  * For a multiprocessor, the sleep call would accept a lock and
 225  * release it after the sleep processing was complete. In a uniprocessor
 226  * implementation there is no such interlock, so we simply mark
 227  * the places where it needs to be done with the `interlocked' form
 228  * of the lock calls. Since the uniprocessor sleep already interlocks
 229  * the spl, there is nothing that really needs to be done.
 230  */
 231 #ifndef /* NOT */ DEBUG
 232 static struct lockit {
 233         int     lkt_spl;
 234 } lk = { 0 };
 235 #define ACQUIRE_LOCK(lk)                (lk)->lkt_spl = splbio()
 236 #define FREE_LOCK(lk)                   splx((lk)->lkt_spl)
 237
 238 #else /* DEBUG */
 239 #define NOHOLDER        ((struct thread *)-1)
 240 #define SPECIAL_FLAG    ((struct thread *)-2)
 241 static struct lockit {
 242         int     lkt_spl;
 243         struct thread *lkt_held;
 244 } lk = { 0, NOHOLDER };
 245 static int lockcnt;
 246
 247 static  void acquire_lock __P((struct lockit *));
 248 static  void free_lock __P((struct lockit *));
 249 void    softdep_panic __P((char *));
 250
 251 #define ACQUIRE_LOCK(lk)                acquire_lock(lk)
 252 #define FREE_LOCK(lk)                   free_lock(lk)
 253
 254 static void
 255 acquire_lock(lk)
 256         struct lockit *lk;
 257 {
 258         thread_t holder;
 259
 260         if (lk->lkt_held != NOHOLDER) {
 261                 holder = lk->lkt_held;
 262                 FREE_LOCK(lk);
 263                 if (holder == curthread)
 264                         panic("softdep_lock: locking against myself");
 265                 else
 266                         panic("softdep_lock: lock held by %p", holder);
 267         }
 268         lk->lkt_spl = splbio();
 269         lk->lkt_held = curthread;
 270         lockcnt++;
 271 }
 272
 273 static void
 274 free_lock(lk)
 275         struct lockit *lk;
 276 {
 277
 278         if (lk->lkt_held == NOHOLDER)
 279                 panic("softdep_unlock: lock not held");
 280         lk->lkt_held = NOHOLDER;
 281         splx(lk->lkt_spl);
 282 }
 283
 284 /*
 285  * Function to release soft updates lock and panic.
 286  */
 287 void
 288 softdep_panic(msg)
 289         char *msg;
 290 {
 291
 292         if (lk.lkt_held != NOHOLDER)
 293                 FREE_LOCK(&lk);
 294         panic(msg);
 295 }
 296 #endif /* DEBUG */
 297
 298 static  int interlocked_sleep __P((struct lockit *, int, void *, int,
 299             const char *, int));
 300
 301 /*
 302  * When going to sleep, we must save our SPL so that it does
 303  * not get lost if some other process uses the lock while we
 304  * are sleeping. We restore it after we have slept. This routine
 305  * wraps the interlocking with functions that sleep. The list
 306  * below enumerates the available set of operations.
 307  */
 308 #define UNKNOWN         0
 309 #define SLEEP           1
 310 #define LOCKBUF         2
 311
 312 static int
 313 interlocked_sleep(lk, op, ident, flags, wmesg, timo)
 314         struct lockit *lk;
 315         int op;
 316         void *ident;
 317         int flags;
 318         const char *wmesg;
 319         int timo;
 320 {
 321         thread_t holder;
 322         int s, retval;
 323
 324         s = lk->lkt_spl;
 325 #       ifdef DEBUG
 326         if (lk->lkt_held == NOHOLDER)
 327                 panic("interlocked_sleep: lock not held");
 328         lk->lkt_held = NOHOLDER;
 329 #       endif /* DEBUG */
 330         switch (op) {
 331         case SLEEP:
 332                 retval = tsleep(ident, flags, wmesg, timo);
 333                 break;
 334         case LOCKBUF:
 335                 retval = BUF_LOCK((struct buf *)ident, flags);
 336                 break;
 337         default:
 338                 panic("interlocked_sleep: unknown operation");
 339         }
 340 #       ifdef DEBUG
 341         if (lk->lkt_held != NOHOLDER) {
 342                 holder = lk->lkt_held;
 343                 FREE_LOCK(lk);
 344                 if (holder == curthread)
 345                         panic("interlocked_sleep: locking against self");
 346                 else
 347                         panic("interlocked_sleep: lock held by %p", holder);
 348         }
 349         lk->lkt_held = curthread;
 350         lockcnt++;
 351 #       endif /* DEBUG */
 352         lk->lkt_spl = s;
 353         return (retval);
 354 }
 355
 356 /*
 357  * Place holder for real semaphores.
 358  */
 359 struct sema {
 360         int     value;
 361         thread_t holder;
 362         char    *name;
 363         int     prio;
 364         int     timo;
 365 };
 366 static  void sema_init __P((struct sema *, char *, int, int));
 367 static  int sema_get __P((struct sema *, struct lockit *));
 368 static  void sema_release __P((struct sema *));
 369
 370 static void
 371 sema_init(semap, name, prio, timo)
 372         struct sema *semap;
 373         char *name;
 374         int prio, timo;
 375 {
 376
 377         semap->holder = NOHOLDER;
 378         semap->value = 0;
 379         semap->name = name;
 380         semap->prio = prio;
 381         semap->timo = timo;
 382 }
 383
 384 static int
 385 sema_get(semap, interlock)
 386         struct sema *semap;
 387         struct lockit *interlock;
 388 {
 389
 390         if (semap->value++ > 0) {
 391                 if (interlock != NULL) {
 392                         interlocked_sleep(interlock, SLEEP, (caddr_t)semap,
 393                             semap->prio, semap->name, semap->timo);
 394                         FREE_LOCK(interlock);
 395                 } else {
 396                         tsleep((caddr_t)semap, semap->prio, semap->name,
 397                             semap->timo);
 398                 }
 399                 return (0);
 400         }
 401         semap->holder = curthread;
 402         if (interlock != NULL)
 403                 FREE_LOCK(interlock);
 404         return (1);
 405 }
 406
 407 static void
 408 sema_release(semap)
 409         struct sema *semap;
 410 {
 411
 412         if (semap->value <= 0 || semap->holder != curthread) {
 413                 if (lk.lkt_held != NOHOLDER)
 414                         FREE_LOCK(&lk);
 415                 panic("sema_release: not held");
 416         }
 417         if (--semap->value > 0) {
 418                 semap->value = 0;
 419                 wakeup(semap);
 420         }
 421         semap->holder = NOHOLDER;
 422 }
 423
 424 /*
 425  * Worklist queue management.
 426  * These routines require that the lock be held.
 427  */
 428 #ifndef /* NOT */ DEBUG
 429 #define WORKLIST_INSERT(head, item) do {        \
 430         (item)->wk_state |= ONWORKLIST;         \
 431         LIST_INSERT_HEAD(head, item, wk_list);  \
 432 } while (0)
 433 #define WORKLIST_REMOVE(item) do {              \
 434         (item)->wk_state &= ~ONWORKLIST;        \
 435         LIST_REMOVE(item, wk_list);             \
 436 } while (0)
 437 #define WORKITEM_FREE(item, type) FREE(item, DtoM(type))
 438
 439 #else /* DEBUG */
 440 static  void worklist_insert __P((struct workhead *, struct worklist *));
 441 static  void worklist_remove __P((struct worklist *));
 442 static  void workitem_free __P((struct worklist *, int));
 443
 444 #define WORKLIST_INSERT(head, item) worklist_insert(head, item)
 445 #define WORKLIST_REMOVE(item) worklist_remove(item)
 446 #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
 447
 448 static void
 449 worklist_insert(head, item)
 450         struct workhead *head;
 451         struct worklist *item;
 452 {
 453
 454         if (lk.lkt_held == NOHOLDER)
 455                 panic("worklist_insert: lock not held");
 456         if (item->wk_state & ONWORKLIST) {
 457                 FREE_LOCK(&lk);
 458                 panic("worklist_insert: already on list");
 459         }
 460         item->wk_state |= ONWORKLIST;
 461         LIST_INSERT_HEAD(head, item, wk_list);
 462 }
 463
 464 static void
 465 worklist_remove(item)
 466         struct worklist *item;
 467 {
 468
 469         if (lk.lkt_held == NOHOLDER)
 470                 panic("worklist_remove: lock not held");
 471         if ((item->wk_state & ONWORKLIST) == 0) {
 472                 FREE_LOCK(&lk);
 473                 panic("worklist_remove: not on list");
 474         }
 475         item->wk_state &= ~ONWORKLIST;
 476         LIST_REMOVE(item, wk_list);
 477 }
 478
 479 static void
 480 workitem_free(item, type)
 481         struct worklist *item;
 482         int type;
 483 {
 484
 485         if (item->wk_state & ONWORKLIST) {
 486                 if (lk.lkt_held != NOHOLDER)
 487                         FREE_LOCK(&lk);
 488                 panic("workitem_free: still on list");
 489         }
 490         if (item->wk_type != type) {
 491                 if (lk.lkt_held != NOHOLDER)
 492                         FREE_LOCK(&lk);
 493                 panic("workitem_free: type mismatch");
 494         }
 495         FREE(item, DtoM(type));
 496 }
 497 #endif /* DEBUG */
 498
 499 /*
 500  * Workitem queue management
 501  */
 502 static struct workhead softdep_workitem_pending;
 503 static int num_on_worklist;     /* number of worklist items to be processed */
 504 static int softdep_worklist_busy; /* 1 => trying to do unmount */
 505 static int softdep_worklist_req; /* serialized waiters */
 506 static int max_softdeps;        /* maximum number of structs before slowdown */
 507 static int tickdelay = 2;       /* number of ticks to pause during slowdown */
 508 static int *stat_countp;        /* statistic to count in proc_waiting timeout */
 509 static int proc_waiting;        /* tracks whether we have a timeout posted */
 510 static struct callout_handle handle; /* handle on posted proc_waiting timeout */
 511 static struct thread *filesys_syncer; /* proc of filesystem syncer process */
 512 static int req_clear_inodedeps; /* syncer process flush some inodedeps */
 513 #define FLUSH_INODES    1
 514 static int req_clear_remove;    /* syncer process flush some freeblks */
 515 #define FLUSH_REMOVE    2
 516 /*
 517  * runtime statistics
 518  */
 519 static int stat_worklist_push;  /* number of worklist cleanups */
 520 static int stat_blk_limit_push; /* number of times block limit neared */
 521 static int stat_ino_limit_push; /* number of times inode limit neared */
 522 static int stat_blk_limit_hit;  /* number of times block slowdown imposed */
 523 static int stat_ino_limit_hit;  /* number of times inode slowdown imposed */
 524 static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
 525 static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
 526 static int stat_inode_bitmap;   /* bufs redirtied as inode bitmap not written */
 527 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
 528 static int stat_dir_entry;      /* bufs redirtied as dir entry cannot write */
 529 #ifdef DEBUG
 530 #include <vm/vm.h>
 531 #include <sys/sysctl.h>
 532 SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
 533 SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
 534 SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
 535 SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
 536 SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
 537 SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
 538 SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
 539 SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
 540 SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
 541 SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
 542 SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
 543 SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
 544 #endif /* DEBUG */
 545
 546 /*
 547  * Add an item to the end of the work queue.
 548  * This routine requires that the lock be held.
 549  * This is the only routine that adds items to the list.
 550  * The following routine is the only one that removes items
 551  * and does so in order from first to last.
 552  */
 553 static void
 554 add_to_worklist(wk)
 555         struct worklist *wk;
 556 {
 557         static struct worklist *worklist_tail;
 558
 559         if (wk->wk_state & ONWORKLIST) {
 560                 if (lk.lkt_held != NOHOLDER)
 561                         FREE_LOCK(&lk);
 562                 panic("add_to_worklist: already on list");
 563         }
 564         wk->wk_state |= ONWORKLIST;
 565         if (LIST_FIRST(&softdep_workitem_pending) == NULL)
 566                 LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
 567         else
 568                 LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
 569         worklist_tail = wk;
 570         num_on_worklist += 1;
 571 }
 572
 573 /*
 574  * Process that runs once per second to handle items in the background queue.
 575  *
 576  * Note that we ensure that everything is done in the order in which they
 577  * appear in the queue. The code below depends on this property to ensure
 578  * that blocks of a file are freed before the inode itself is freed. This
 579  * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
 580  * until all the old ones have been purged from the dependency lists.
 581  */
 582 static int
 583 softdep_process_worklist(matchmnt)
 584         struct mount *matchmnt;
 585 {
 586         thread_t td = curthread;
 587         int matchcnt, loopcount;
 588         long starttime;
 589
 590         /*
 591          * Record the process identifier of our caller so that we can give
 592          * this process preferential treatment in request_cleanup below.
 593          */
 594         filesys_syncer = td;
 595         matchcnt = 0;
 596
 597         /*
 598          * There is no danger of having multiple processes run this
 599          * code, but we have to single-thread it when softdep_flushfiles()
 600          * is in operation to get an accurate count of the number of items
 601          * related to its mount point that are in the list.
 602          */
 603         if (matchmnt == NULL) {
 604                 if (softdep_worklist_busy < 0)
 605                         return(-1);
 606                 softdep_worklist_busy += 1;
 607         }
 608
 609         /*
 610          * If requested, try removing inode or removal dependencies.
 611          */
 612         if (req_clear_inodedeps) {
 613                 clear_inodedeps(td);
 614                 req_clear_inodedeps -= 1;
 615                 wakeup_one(&proc_waiting);
 616         }
 617         if (req_clear_remove) {
 618                 clear_remove(td);
 619                 req_clear_remove -= 1;
 620                 wakeup_one(&proc_waiting);
 621         }
 622         loopcount = 1;
 623         starttime = time_second;
 624         while (num_on_worklist > 0) {
 625                 matchcnt += process_worklist_item(matchmnt, 0);
 626
 627                 /*
 628                  * If a umount operation wants to run the worklist
 629                  * accurately, abort.
 630                  */
 631                 if (softdep_worklist_req && matchmnt == NULL) {
 632                         matchcnt = -1;
 633                         break;
 634                 }
 635
 636                 /*
 637                  * If requested, try removing inode or removal dependencies.
 638                  */
 639                 if (req_clear_inodedeps) {
 640                         clear_inodedeps(td);
 641                         req_clear_inodedeps -= 1;
 642                         wakeup_one(&proc_waiting);
 643                 }
 644                 if (req_clear_remove) {
 645                         clear_remove(td);
 646                         req_clear_remove -= 1;
 647                         wakeup_one(&proc_waiting);
 648                 }
 649                 /*
 650                  * We do not generally want to stop for buffer space, but if
 651                  * we are really being a buffer hog, we will stop and wait.
 652                  */
 653                 if (loopcount++ % 128 == 0)
 654                         bwillwrite();
 655                 /*
 656                  * Never allow processing to run for more than one
 657                  * second. Otherwise the other syncer tasks may get
 658                  * excessively backlogged.
 659                  */
 660                 if (starttime != time_second && matchmnt == NULL) {
 661                         matchcnt = -1;
 662                         break;
 663                 }
 664         }
 665         if (matchmnt == NULL) {
 666                 --softdep_worklist_busy;
 667                 if (softdep_worklist_req && softdep_worklist_busy == 0)
 668                         wakeup(&softdep_worklist_req);
 669         }
 670         return (matchcnt);
 671 }
 672
 673 /*
 674  * Process one item on the worklist.
 675  */
 676 static int
 677 process_worklist_item(matchmnt, flags)
 678         struct mount *matchmnt;
 679         int flags;
 680 {
 681         struct worklist *wk;
 682         struct dirrem *dirrem;
 683         struct fs *matchfs;
 684         struct vnode *vp;
 685         int matchcnt = 0;
 686
 687         matchfs = NULL;
 688         if (matchmnt != NULL)
 689                 matchfs = VFSTOUFS(matchmnt)->um_fs;
 690         ACQUIRE_LOCK(&lk);
 691         /*
 692          * Normally we just process each item on the worklist in order.
 693          * However, if we are in a situation where we cannot lock any
 694          * inodes, we have to skip over any dirrem requests whose
 695          * vnodes are resident and locked.
 696          */
 697         LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
 698                 if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
 699                         break;
 700                 dirrem = WK_DIRREM(wk);
 701                 vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev,
 702                     dirrem->dm_oldinum);
 703                 if (vp == NULL || !VOP_ISLOCKED(vp, curthread))
 704                         break;
 705         }
 706         if (wk == 0) {
 707                 FREE_LOCK(&lk);
 708                 return (0);
 709         }
 710         WORKLIST_REMOVE(wk);
 711         num_on_worklist -= 1;
 712         FREE_LOCK(&lk);
 713         switch (wk->wk_type) {
 714
 715         case D_DIRREM:
 716                 /* removal of a directory entry */
 717                 if (WK_DIRREM(wk)->dm_mnt == matchmnt)
 718                         matchcnt += 1;
 719                 handle_workitem_remove(WK_DIRREM(wk));
 720                 break;
 721
 722         case D_FREEBLKS:
 723                 /* releasing blocks and/or fragments from a file */
 724                 if (WK_FREEBLKS(wk)->fb_fs == matchfs)
 725                         matchcnt += 1;
 726                 handle_workitem_freeblocks(WK_FREEBLKS(wk));
 727                 break;
 728
 729         case D_FREEFRAG:
 730                 /* releasing a fragment when replaced as a file grows */
 731                 if (WK_FREEFRAG(wk)->ff_fs == matchfs)
 732                         matchcnt += 1;
 733                 handle_workitem_freefrag(WK_FREEFRAG(wk));
 734                 break;
 735
 736         case D_FREEFILE:
 737                 /* releasing an inode when its link count drops to 0 */
 738                 if (WK_FREEFILE(wk)->fx_fs == matchfs)
 739                         matchcnt += 1;
 740                 handle_workitem_freefile(WK_FREEFILE(wk));
 741                 break;
 742
 743         default:
 744                 panic("%s_process_worklist: Unknown type %s",
 745                     "softdep", TYPENAME(wk->wk_type));
 746                 /* NOTREACHED */
 747         }
 748         return (matchcnt);
 749 }
 750
 751 /*
 752  * Move dependencies from one buffer to another.
 753  */
 754 static void
 755 softdep_move_dependencies(oldbp, newbp)
 756         struct buf *oldbp;
 757         struct buf *newbp;
 758 {
 759         struct worklist *wk, *wktail;
 760
 761         if (LIST_FIRST(&newbp->b_dep) != NULL)
 762                 panic("softdep_move_dependencies: need merge code");
 763         wktail = 0;
 764         ACQUIRE_LOCK(&lk);
 765         while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
 766                 LIST_REMOVE(wk, wk_list);
 767                 if (wktail == 0)
 768                         LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
 769                 else
 770                         LIST_INSERT_AFTER(wktail, wk, wk_list);
 771                 wktail = wk;
 772         }
 773         FREE_LOCK(&lk);
 774 }
 775
 776 /*
 777  * Purge the work list of all items associated with a particular mount point.
 778  */
 779 int
 780 softdep_flushfiles(struct mount *oldmnt, int flags, struct thread *td)
 781 {
 782         struct vnode *devvp;
 783         struct ucred *cred;
 784         int error, loopcnt;
 785
 786         KKASSERT(td->td_proc);
 787         cred = td->td_proc->p_ucred;
 788
 789         /*
 790          * Await our turn to clear out the queue, then serialize access.
 791          */
 792         while (softdep_worklist_busy != 0) {
 793                 softdep_worklist_req += 1;
 794                 tsleep(&softdep_worklist_req, PRIBIO, "softflush", 0);
 795                 softdep_worklist_req -= 1;
 796         }
 797         softdep_worklist_busy = -1;
 798
 799         if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0) {
 800                 softdep_worklist_busy = 0;
 801                 if (softdep_worklist_req)
 802                         wakeup(&softdep_worklist_req);
 803                 return (error);
 804         }
 805         /*
 806          * Alternately flush the block device associated with the mount
 807          * point and process any dependencies that the flushing
 808          * creates. In theory, this loop can happen at most twice,
 809          * but we give it a few extra just to be sure.
 810          */
 811         devvp = VFSTOUFS(oldmnt)->um_devvp;
 812         for (loopcnt = 10; loopcnt > 0; ) {
 813                 if (softdep_process_worklist(oldmnt) == 0) {
 814                         loopcnt--;
 815                         /*
 816                          * Do another flush in case any vnodes were brought in
 817                          * as part of the cleanup operations.
 818                          */
 819                         if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
 820                                 break;
 821                         /*
 822                          * If we still found nothing to do, we are really done.
 823                          */
 824                         if (softdep_process_worklist(oldmnt) == 0)
 825                                 break;
 826                 }
 827                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
 828                 error = VOP_FSYNC(devvp, cred, MNT_WAIT, td);
 829                 VOP_UNLOCK(devvp, 0, td);
 830                 if (error)
 831                         break;
 832         }
 833         softdep_worklist_busy = 0;
 834         if (softdep_worklist_req)
 835                 wakeup(&softdep_worklist_req);
 836
 837         /*
 838          * If we are unmounting then it is an error to fail. If we
 839          * are simply trying to downgrade to read-only, then filesystem
 840          * activity can keep us busy forever, so we just fail with EBUSY.
 841          */
 842         if (loopcnt == 0) {
 843                 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
 844                         panic("softdep_flushfiles: looping");
 845                 error = EBUSY;
 846         }
 847         return (error);
 848 }
 849
 850 /*
 851  * Structure hashing.
 852  *
 853  * There are three types of structures that can be looked up:
 854  *      1) pagedep structures identified by mount point, inode number,
 855  *         and logical block.
 856  *      2) inodedep structures identified by mount point and inode number.
 857  *      3) newblk structures identified by mount point and
 858  *         physical block number.
 859  *
 860  * The "pagedep" and "inodedep" dependency structures are hashed
 861  * separately from the file blocks and inodes to which they correspond.
 862  * This separation helps when the in-memory copy of an inode or
 863  * file block must be replaced. It also obviates the need to access
 864  * an inode or file page when simply updating (or de-allocating)
 865  * dependency structures. Lookup of newblk structures is needed to
 866  * find newly allocated blocks when trying to associate them with
 867  * their allocdirect or allocindir structure.
 868  *
 869  * The lookup routines optionally create and hash a new instance when
 870  * an existing entry is not found.
 871  */
 872 #define DEPALLOC        0x0001  /* allocate structure if lookup fails */
 873 #define NODELAY         0x0002  /* cannot do background work */
 874
 875 /*
 876  * Structures and routines associated with pagedep caching.
 877  */
 878 LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
 879 u_long  pagedep_hash;           /* size of hash table - 1 */
 880 #define PAGEDEP_HASH(mp, inum, lbn) \
 881         (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
 882             pagedep_hash])
 883 static struct sema pagedep_in_progress;
 884
 885 /*
 886  * Look up a pagedep. Return 1 if found, 0 if not found.
 887  * If not found, allocate if DEPALLOC flag is passed.
 888  * Found or allocated entry is returned in pagedeppp.
 889  * This routine must be called with splbio interrupts blocked.
 890  */
 891 static int
 892 pagedep_lookup(ip, lbn, flags, pagedeppp)
 893         struct inode *ip;
 894         ufs_lbn_t lbn;
 895         int flags;
 896         struct pagedep **pagedeppp;
 897 {
 898         struct pagedep *pagedep;
 899         struct pagedep_hashhead *pagedephd;
 900         struct mount *mp;
 901         int i;
 902
 903 #ifdef DEBUG
 904         if (lk.lkt_held == NOHOLDER)
 905                 panic("pagedep_lookup: lock not held");
 906 #endif
 907         mp = ITOV(ip)->v_mount;
 908         pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
 909 top:
 910         LIST_FOREACH(pagedep, pagedephd, pd_hash)
 911                 if (ip->i_number == pagedep->pd_ino &&
 912                     lbn == pagedep->pd_lbn &&
 913                     mp == pagedep->pd_mnt)
 914                         break;
 915         if (pagedep) {
 916                 *pagedeppp = pagedep;
 917                 return (1);
 918         }
 919         if ((flags & DEPALLOC) == 0) {
 920                 *pagedeppp = NULL;
 921                 return (0);
 922         }
 923         if (sema_get(&pagedep_in_progress, &lk) == 0) {
 924                 ACQUIRE_LOCK(&lk);
 925                 goto top;
 926         }
 927         MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
 928                 M_SOFTDEP_FLAGS);
 929         bzero(pagedep, sizeof(struct pagedep));
 930         pagedep->pd_list.wk_type = D_PAGEDEP;
 931         pagedep->pd_mnt = mp;
 932         pagedep->pd_ino = ip->i_number;
 933         pagedep->pd_lbn = lbn;
 934         LIST_INIT(&pagedep->pd_dirremhd);
 935         LIST_INIT(&pagedep->pd_pendinghd);
 936         for (i = 0; i < DAHASHSZ; i++)
 937                 LIST_INIT(&pagedep->pd_diraddhd[i]);
 938         ACQUIRE_LOCK(&lk);
 939         LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
 940         sema_release(&pagedep_in_progress);
 941         *pagedeppp = pagedep;
 942         return (0);
 943 }
 944
 945 /*
 946  * Structures and routines associated with inodedep caching.
 947  */
 948 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
 949 static u_long   inodedep_hash;  /* size of hash table - 1 */
 950 static long     num_inodedep;   /* number of inodedep allocated */
 951 #define INODEDEP_HASH(fs, inum) \
 952       (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
 953 static struct sema inodedep_in_progress;
 954
 955 /*
 956  * Look up a inodedep. Return 1 if found, 0 if not found.
 957  * If not found, allocate if DEPALLOC flag is passed.
 958  * Found or allocated entry is returned in inodedeppp.
 959  * This routine must be called with splbio interrupts blocked.
 960  */
 961 static int
 962 inodedep_lookup(fs, inum, flags, inodedeppp)
 963         struct fs *fs;
 964         ino_t inum;
 965         int flags;
 966         struct inodedep **inodedeppp;
 967 {
 968         struct inodedep *inodedep;
 969         struct inodedep_hashhead *inodedephd;
 970         int firsttry;
 971
 972 #ifdef DEBUG
 973         if (lk.lkt_held == NOHOLDER)
 974                 panic("inodedep_lookup: lock not held");
 975 #endif
 976         firsttry = 1;
 977         inodedephd = INODEDEP_HASH(fs, inum);
 978 top:
 979         LIST_FOREACH(inodedep, inodedephd, id_hash)
 980                 if (inum == inodedep->id_ino && fs == inodedep->id_fs)
 981                         break;
 982         if (inodedep) {
 983                 *inodedeppp = inodedep;
 984                 return (1);
 985         }
 986         if ((flags & DEPALLOC) == 0) {
 987                 *inodedeppp = NULL;
 988                 return (0);
 989         }
 990         /*
 991          * If we are over our limit, try to improve the situation.
 992          */
 993         if (num_inodedep > max_softdeps && firsttry &&
 994             speedup_syncer() == 0 && (flags & NODELAY) == 0 &&
 995             request_cleanup(FLUSH_INODES, 1)) {
 996                 firsttry = 0;
 997                 goto top;
 998         }
 999         if (sema_get(&inodedep_in_progress, &lk) == 0) {
1000                 ACQUIRE_LOCK(&lk);
1001                 goto top;
1002         }
1003         num_inodedep += 1;
1004         MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
1005                 M_INODEDEP, M_SOFTDEP_FLAGS);
1006         inodedep->id_list.wk_type = D_INODEDEP;
1007         inodedep->id_fs = fs;
1008         inodedep->id_ino = inum;
1009         inodedep->id_state = ALLCOMPLETE;
1010         inodedep->id_nlinkdelta = 0;
1011         inodedep->id_savedino = NULL;
1012         inodedep->id_savedsize = -1;
1013         inodedep->id_buf = NULL;
1014         LIST_INIT(&inodedep->id_pendinghd);
1015         LIST_INIT(&inodedep->id_inowait);
1016         LIST_INIT(&inodedep->id_bufwait);
1017         TAILQ_INIT(&inodedep->id_inoupdt);
1018         TAILQ_INIT(&inodedep->id_newinoupdt);
1019         ACQUIRE_LOCK(&lk);
1020         LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
1021         sema_release(&inodedep_in_progress);
1022         *inodedeppp = inodedep;
1023         return (0);
1024 }
1025
1026 /*
1027  * Structures and routines associated with newblk caching.
1028  */
1029 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1030 u_long  newblk_hash;            /* size of hash table - 1 */
1031 #define NEWBLK_HASH(fs, inum) \
1032         (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1033 static struct sema newblk_in_progress;
1034
1035 /*
1036  * Look up a newblk. Return 1 if found, 0 if not found.
1037  * If not found, allocate if DEPALLOC flag is passed.
1038  * Found or allocated entry is returned in newblkpp.
1039  */
1040 static int
1041 newblk_lookup(fs, newblkno, flags, newblkpp)
1042         struct fs *fs;
1043         ufs_daddr_t newblkno;
1044         int flags;
1045         struct newblk **newblkpp;
1046 {
1047         struct newblk *newblk;
1048         struct newblk_hashhead *newblkhd;
1049
1050         newblkhd = NEWBLK_HASH(fs, newblkno);
1051 top:
1052         LIST_FOREACH(newblk, newblkhd, nb_hash)
1053                 if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
1054                         break;
1055         if (newblk) {
1056                 *newblkpp = newblk;
1057                 return (1);
1058         }
1059         if ((flags & DEPALLOC) == 0) {
1060                 *newblkpp = NULL;
1061                 return (0);
1062         }
1063         if (sema_get(&newblk_in_progress, 0) == 0)
1064                 goto top;
1065         MALLOC(newblk, struct newblk *, sizeof(struct newblk),
1066                 M_NEWBLK, M_SOFTDEP_FLAGS);
1067         newblk->nb_state = 0;
1068         newblk->nb_fs = fs;
1069         newblk->nb_newblkno = newblkno;
1070         LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1071         sema_release(&newblk_in_progress);
1072         *newblkpp = newblk;
1073         return (0);
1074 }
1075
1076 /*
1077  * Executed during filesystem system initialization before
1078  * mounting any file systems.
1079  */
1080 void
1081 softdep_initialize()
1082 {
1083
1084         LIST_INIT(&mkdirlisthd);
1085         LIST_INIT(&softdep_workitem_pending);
1086         max_softdeps = min(desiredvnodes * 8,
1087                 M_INODEDEP->ks_limit / (2 * sizeof(struct inodedep)));
1088         pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
1089             &pagedep_hash);
1090         sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
1091         inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
1092         sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
1093         newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
1094         sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
1095 }
1096
1097 /*
1098  * Called at mount time to notify the dependency code that a
1099  * filesystem wishes to use it.
1100  */
1101 int
1102 softdep_mount(devvp, mp, fs, cred)
1103         struct vnode *devvp;
1104         struct mount *mp;
1105         struct fs *fs;
1106         struct ucred *cred;
1107 {
1108         struct csum cstotal;
1109         struct cg *cgp;
1110         struct buf *bp;
1111         int error, cyl;
1112
1113         mp->mnt_flag &= ~MNT_ASYNC;
1114         mp->mnt_flag |= MNT_SOFTDEP;
1115         /*
1116          * When doing soft updates, the counters in the
1117          * superblock may have gotten out of sync, so we have
1118          * to scan the cylinder groups and recalculate them.
1119          */
1120         if (fs->fs_clean != 0)
1121                 return (0);
1122         bzero(&cstotal, sizeof cstotal);
1123         for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
1124                 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
1125                     fs->fs_cgsize, cred, &bp)) != 0) {
1126                         brelse(bp);
1127                         return (error);
1128                 }
1129                 cgp = (struct cg *)bp->b_data;
1130                 cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
1131                 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
1132                 cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
1133                 cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
1134                 fs->fs_cs(fs, cyl) = cgp->cg_cs;
1135                 brelse(bp);
1136         }
1137 #ifdef DEBUG
1138         if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
1139                 printf("ffs_mountfs: superblock updated for soft updates\n");
1140 #endif
1141         bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
1142         return (0);
1143 }
1144
1145 /*
1146  * Protecting the freemaps (or bitmaps).
1147  *
1148  * To eliminate the need to execute fsck before mounting a file system
1149  * after a power failure, one must (conservatively) guarantee that the
1150  * on-disk copy of the bitmaps never indicates that a live inode or block is
1151  * free.  So, when a block or inode is allocated, the bitmap should be
1152  * updated (on disk) before any new pointers.  When a block or inode is
1153  * freed, the bitmap should not be updated until all pointers have been
1154  * reset.  The latter dependency is handled by the delayed de-allocation
1155  * approach described below for block and inode de-allocation.  The former
1156  * dependency is handled by calling the following procedure when a block or
1157  * inode is allocated. When an inode is allocated an "inodedep" is created
1158  * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
1159  * Each "inodedep" is also inserted into the hash indexing structure so
1160  * that any additional link additions can be made dependent on the inode
1161  * allocation.
1162  *
1163  * The ufs file system maintains a number of free block counts (e.g., per
1164  * cylinder group, per cylinder and per <cylinder, rotational position> pair)
1165  * in addition to the bitmaps.  These counts are used to improve efficiency
1166  * during allocation and therefore must be consistent with the bitmaps.
1167  * There is no convenient way to guarantee post-crash consistency of these
1168  * counts with simple update ordering, for two main reasons: (1) The counts
1169  * and bitmaps for a single cylinder group block are not in the same disk
1170  * sector.  If a disk write is interrupted (e.g., by power failure), one may
1171  * be written and the other not.  (2) Some of the counts are located in the
1172  * superblock rather than the cylinder group block. So, we focus our soft
1173  * updates implementation on protecting the bitmaps. When mounting a
1174  * filesystem, we recompute the auxiliary counts from the bitmaps.
1175  */
1176
1177 /*
1178  * Called just after updating the cylinder group block to allocate an inode.
1179  */
1180 void
1181 softdep_setup_inomapdep(bp, ip, newinum)
1182         struct buf *bp;         /* buffer for cylgroup block with inode map */
1183         struct inode *ip;       /* inode related to allocation */
1184         ino_t newinum;          /* new inode number being allocated */
1185 {
1186         struct inodedep *inodedep;
1187         struct bmsafemap *bmsafemap;
1188
1189         /*
1190          * Create a dependency for the newly allocated inode.
1191          * Panic if it already exists as something is seriously wrong.
1192          * Otherwise add it to the dependency list for the buffer holding
1193          * the cylinder group map from which it was allocated.
1194          */
1195         ACQUIRE_LOCK(&lk);
1196         if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC|NODELAY, &inodedep))) {
1197                 FREE_LOCK(&lk);
1198                 panic("softdep_setup_inomapdep: found inode");
1199         }
1200         inodedep->id_buf = bp;
1201         inodedep->id_state &= ~DEPCOMPLETE;
1202         bmsafemap = bmsafemap_lookup(bp);
1203         LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1204         FREE_LOCK(&lk);
1205 }
1206
1207 /*
1208  * Called just after updating the cylinder group block to
1209  * allocate block or fragment.
1210  */
1211 void
1212 softdep_setup_blkmapdep(bp, fs, newblkno)
1213         struct buf *bp;         /* buffer for cylgroup block with block map */
1214         struct fs *fs;          /* filesystem doing allocation */
1215         ufs_daddr_t newblkno;   /* number of newly allocated block */
1216 {
1217         struct newblk *newblk;
1218         struct bmsafemap *bmsafemap;
1219
1220         /*
1221          * Create a dependency for the newly allocated block.
1222          * Add it to the dependency list for the buffer holding
1223          * the cylinder group map from which it was allocated.
1224          */
1225         if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1226                 panic("softdep_setup_blkmapdep: found block");
1227         ACQUIRE_LOCK(&lk);
1228         newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
1229         LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1230         FREE_LOCK(&lk);
1231 }
1232
1233 /*
1234  * Find the bmsafemap associated with a cylinder group buffer.
1235  * If none exists, create one. The buffer must be locked when
1236  * this routine is called and this routine must be called with
1237  * splbio interrupts blocked.
1238  */
1239 static struct bmsafemap *
1240 bmsafemap_lookup(bp)
1241         struct buf *bp;
1242 {
1243         struct bmsafemap *bmsafemap;
1244         struct worklist *wk;
1245
1246 #ifdef DEBUG
1247         if (lk.lkt_held == NOHOLDER)
1248                 panic("bmsafemap_lookup: lock not held");
1249 #endif
1250         LIST_FOREACH(wk, &bp->b_dep, wk_list)
1251                 if (wk->wk_type == D_BMSAFEMAP)
1252                         return (WK_BMSAFEMAP(wk));
1253         FREE_LOCK(&lk);
1254         MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
1255                 M_BMSAFEMAP, M_SOFTDEP_FLAGS);
1256         bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
1257         bmsafemap->sm_list.wk_state = 0;
1258         bmsafemap->sm_buf = bp;
1259         LIST_INIT(&bmsafemap->sm_allocdirecthd);
1260         LIST_INIT(&bmsafemap->sm_allocindirhd);
1261         LIST_INIT(&bmsafemap->sm_inodedephd);
1262         LIST_INIT(&bmsafemap->sm_newblkhd);
1263         ACQUIRE_LOCK(&lk);
1264         WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
1265         return (bmsafemap);
1266 }
1267
1268 /*
1269  * Direct block allocation dependencies.
1270  *
1271  * When a new block is allocated, the corresponding disk locations must be
1272  * initialized (with zeros or new data) before the on-disk inode points to
1273  * them.  Also, the freemap from which the block was allocated must be
1274  * updated (on disk) before the inode's pointer. These two dependencies are
1275  * independent of each other and are needed for all file blocks and indirect
1276  * blocks that are pointed to directly by the inode.  Just before the
1277  * "in-core" version of the inode is updated with a newly allocated block
1278  * number, a procedure (below) is called to setup allocation dependency
1279  * structures.  These structures are removed when the corresponding
1280  * dependencies are satisfied or when the block allocation becomes obsolete
1281  * (i.e., the file is deleted, the block is de-allocated, or the block is a
1282  * fragment that gets upgraded).  All of these cases are handled in
1283  * procedures described later.
1284  *
1285  * When a file extension causes a fragment to be upgraded, either to a larger
1286  * fragment or to a full block, the on-disk location may change (if the
1287  * previous fragment could not simply be extended). In this case, the old
1288  * fragment must be de-allocated, but not until after the inode's pointer has
1289  * been updated. In most cases, this is handled by later procedures, which
1290  * will construct a "freefrag" structure to be added to the workitem queue
1291  * when the inode update is complete (or obsolete).  The main exception to
1292  * this is when an allocation occurs while a pending allocation dependency
1293  * (for the same block pointer) remains.  This case is handled in the main
1294  * allocation dependency setup procedure by immediately freeing the
1295  * unreferenced fragments.
1296  */
1297 void
1298 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1299         struct inode *ip;       /* inode to which block is being added */
1300         ufs_lbn_t lbn;          /* block pointer within inode */
1301         ufs_daddr_t newblkno;   /* disk block number being added */
1302         ufs_daddr_t oldblkno;   /* previous block number, 0 unless frag */
1303         long newsize;           /* size of new block */
1304         long oldsize;           /* size of new block */
1305         struct buf *bp;         /* bp for allocated block */
1306 {
1307         struct allocdirect *adp, *oldadp;
1308         struct allocdirectlst *adphead;
1309         struct bmsafemap *bmsafemap;
1310         struct inodedep *inodedep;
1311         struct pagedep *pagedep;
1312         struct newblk *newblk;
1313
1314         MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1315                 M_ALLOCDIRECT, M_SOFTDEP_FLAGS);
1316         bzero(adp, sizeof(struct allocdirect));
1317         adp->ad_list.wk_type = D_ALLOCDIRECT;
1318         adp->ad_lbn = lbn;
1319         adp->ad_newblkno = newblkno;
1320         adp->ad_oldblkno = oldblkno;
1321         adp->ad_newsize = newsize;
1322         adp->ad_oldsize = oldsize;
1323         adp->ad_state = ATTACHED;
1324         if (newblkno == oldblkno)
1325                 adp->ad_freefrag = NULL;
1326         else
1327                 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1328
1329         if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1330                 panic("softdep_setup_allocdirect: lost block");
1331
1332         ACQUIRE_LOCK(&lk);
1333         inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1334         adp->ad_inodedep = inodedep;
1335
1336         if (newblk->nb_state == DEPCOMPLETE) {
1337                 adp->ad_state |= DEPCOMPLETE;
1338                 adp->ad_buf = NULL;
1339         } else {
1340                 bmsafemap = newblk->nb_bmsafemap;
1341                 adp->ad_buf = bmsafemap->sm_buf;
1342                 LIST_REMOVE(newblk, nb_deps);
1343                 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1344         }
1345         LIST_REMOVE(newblk, nb_hash);
1346         FREE(newblk, M_NEWBLK);
1347
1348         WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1349         if (lbn >= NDADDR) {
1350                 /* allocating an indirect block */
1351                 if (oldblkno != 0) {
1352                         FREE_LOCK(&lk);
1353                         panic("softdep_setup_allocdirect: non-zero indir");
1354                 }
1355         } else {
1356                 /*
1357                  * Allocating a direct block.
1358                  *
1359                  * If we are allocating a directory block, then we must
1360                  * allocate an associated pagedep to track additions and
1361                  * deletions.
1362                  */
1363                 if ((ip->i_mode & IFMT) == IFDIR &&
1364                     pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1365                         WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
1366         }
1367         /*
1368          * The list of allocdirects must be kept in sorted and ascending
1369          * order so that the rollback routines can quickly determine the
1370          * first uncommitted block (the size of the file stored on disk
1371          * ends at the end of the lowest committed fragment, or if there
1372          * are no fragments, at the end of the highest committed block).
1373          * Since files generally grow, the typical case is that the new
1374          * block is to be added at the end of the list. We speed this
1375          * special case by checking against the last allocdirect in the
1376          * list before laboriously traversing the list looking for the
1377          * insertion point.
1378          */
1379         adphead = &inodedep->id_newinoupdt;
1380         oldadp = TAILQ_LAST(adphead, allocdirectlst);
1381         if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1382                 /* insert at end of list */
1383                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1384                 if (oldadp != NULL && oldadp->ad_lbn == lbn)
1385                         allocdirect_merge(adphead, adp, oldadp);
1386                 FREE_LOCK(&lk);
1387                 return;
1388         }
1389         TAILQ_FOREACH(oldadp, adphead, ad_next) {
1390                 if (oldadp->ad_lbn >= lbn)
1391                         break;
1392         }
1393         if (oldadp == NULL) {
1394                 FREE_LOCK(&lk);
1395                 panic("softdep_setup_allocdirect: lost entry");
1396         }
1397         /* insert in middle of list */
1398         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1399         if (oldadp->ad_lbn == lbn)
1400                 allocdirect_merge(adphead, adp, oldadp);
1401         FREE_LOCK(&lk);
1402 }
1403
1404 /*
1405  * Replace an old allocdirect dependency with a newer one.
1406  * This routine must be called with splbio interrupts blocked.
1407  */
1408 static void
1409 allocdirect_merge(adphead, newadp, oldadp)
1410         struct allocdirectlst *adphead; /* head of list holding allocdirects */
1411         struct allocdirect *newadp;     /* allocdirect being added */
1412         struct allocdirect *oldadp;     /* existing allocdirect being checked */
1413 {
1414         struct freefrag *freefrag;
1415
1416 #ifdef DEBUG
1417         if (lk.lkt_held == NOHOLDER)
1418                 panic("allocdirect_merge: lock not held");
1419 #endif
1420         if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1421             newadp->ad_oldsize != oldadp->ad_newsize ||
1422             newadp->ad_lbn >= NDADDR) {
1423                 FREE_LOCK(&lk);
1424                 panic("allocdirect_check: old %d != new %d || lbn %ld >= %d",
1425                     newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
1426                     NDADDR);
1427         }
1428         newadp->ad_oldblkno = oldadp->ad_oldblkno;
1429         newadp->ad_oldsize = oldadp->ad_oldsize;
1430         /*
1431          * If the old dependency had a fragment to free or had never
1432          * previously had a block allocated, then the new dependency
1433          * can immediately post its freefrag and adopt the old freefrag.
1434          * This action is done by swapping the freefrag dependencies.
1435          * The new dependency gains the old one's freefrag, and the
1436          * old one gets the new one and then immediately puts it on
1437          * the worklist when it is freed by free_allocdirect. It is
1438          * not possible to do this swap when the old dependency had a
1439          * non-zero size but no previous fragment to free. This condition
1440          * arises when the new block is an extension of the old block.
1441          * Here, the first part of the fragment allocated to the new
1442          * dependency is part of the block currently claimed on disk by
1443          * the old dependency, so cannot legitimately be freed until the
1444          * conditions for the new dependency are fulfilled.
1445          */
1446         if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1447                 freefrag = newadp->ad_freefrag;
1448                 newadp->ad_freefrag = oldadp->ad_freefrag;
1449                 oldadp->ad_freefrag = freefrag;
1450         }
1451         free_allocdirect(adphead, oldadp, 0);
1452 }
1453
1454 /*
1455  * Allocate a new freefrag structure if needed.
1456  */
1457 static struct freefrag *
1458 newfreefrag(ip, blkno, size)
1459         struct inode *ip;
1460         ufs_daddr_t blkno;
1461         long size;
1462 {
1463         struct freefrag *freefrag;
1464         struct fs *fs;
1465
1466         if (blkno == 0)
1467                 return (NULL);
1468         fs = ip->i_fs;
1469         if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1470                 panic("newfreefrag: frag size");
1471         MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
1472                 M_FREEFRAG, M_SOFTDEP_FLAGS);
1473         freefrag->ff_list.wk_type = D_FREEFRAG;
1474         freefrag->ff_state = ip->i_uid & ~ONWORKLIST;   /* XXX - used below */
1475         freefrag->ff_inum = ip->i_number;
1476         freefrag->ff_fs = fs;
1477         freefrag->ff_devvp = ip->i_devvp;
1478         freefrag->ff_blkno = blkno;
1479         freefrag->ff_fragsize = size;
1480         return (freefrag);
1481 }
1482
1483 /*
1484  * This workitem de-allocates fragments that were replaced during
1485  * file block allocation.
1486  */
1487 static void
1488 handle_workitem_freefrag(freefrag)
1489         struct freefrag *freefrag;
1490 {
1491         struct inode tip;
1492
1493         tip.i_fs = freefrag->ff_fs;
1494         tip.i_devvp = freefrag->ff_devvp;
1495         tip.i_dev = freefrag->ff_devvp->v_rdev;
1496         tip.i_number = freefrag->ff_inum;
1497         tip.i_uid = freefrag->ff_state & ~ONWORKLIST;   /* XXX - set above */
1498         ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
1499         FREE(freefrag, M_FREEFRAG);
1500 }
1501
1502 /*
1503  * Indirect block allocation dependencies.
1504  *
1505  * The same dependencies that exist for a direct block also exist when
1506  * a new block is allocated and pointed to by an entry in a block of
1507  * indirect pointers. The undo/redo states described above are also
1508  * used here. Because an indirect block contains many pointers that
1509  * may have dependencies, a second copy of the entire in-memory indirect
1510  * block is kept. The buffer cache copy is always completely up-to-date.
1511  * The second copy, which is used only as a source for disk writes,
1512  * contains only the safe pointers (i.e., those that have no remaining
1513  * update dependencies). The second copy is freed when all pointers
1514  * are safe. The cache is not allowed to replace indirect blocks with
1515  * pending update dependencies. If a buffer containing an indirect
1516  * block with dependencies is written, these routines will mark it
1517  * dirty again. It can only be successfully written once all the
1518  * dependencies are removed. The ffs_fsync routine and
1519  * softdep_sync_metadata work together to get all the dependencies
1520  * removed so that a file can be successfully written to disk. Three
1521  * procedures are used when setting up indirect block pointer
1522  * dependencies. The division is necessary because of the organization
1523  * of the "balloc" routine and because of the distinction between file
1524  * pages and file metadata blocks.
1525  */
1526
1527 /*
1528  * Allocate a new allocindir structure.
1529  */
1530 static struct allocindir *
1531 newallocindir(ip, ptrno, newblkno, oldblkno)
1532         struct inode *ip;       /* inode for file being extended */
1533         int ptrno;              /* offset of pointer in indirect block */
1534         ufs_daddr_t newblkno;   /* disk block number being added */
1535         ufs_daddr_t oldblkno;   /* previous block number, 0 if none */
1536 {
1537         struct allocindir *aip;
1538
1539         MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
1540                 M_ALLOCINDIR, M_SOFTDEP_FLAGS);
1541         bzero(aip, sizeof(struct allocindir));
1542         aip->ai_list.wk_type = D_ALLOCINDIR;
1543         aip->ai_state = ATTACHED;
1544         aip->ai_offset = ptrno;
1545         aip->ai_newblkno = newblkno;
1546         aip->ai_oldblkno = oldblkno;
1547         aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1548         return (aip);
1549 }
1550
1551 /*
1552  * Called just before setting an indirect block pointer
1553  * to a newly allocated file page.
1554  */
1555 void
1556 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
1557         struct inode *ip;       /* inode for file being extended */
1558         ufs_lbn_t lbn;          /* allocated block number within file */
1559         struct buf *bp;         /* buffer with indirect blk referencing page */
1560         int ptrno;              /* offset of pointer in indirect block */
1561         ufs_daddr_t newblkno;   /* disk block number being added */
1562         ufs_daddr_t oldblkno;   /* previous block number, 0 if none */
1563         struct buf *nbp;        /* buffer holding allocated page */
1564 {
1565         struct allocindir *aip;
1566         struct pagedep *pagedep;
1567
1568         aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1569         ACQUIRE_LOCK(&lk);
1570         /*
1571          * If we are allocating a directory page, then we must
1572          * allocate an associated pagedep to track additions and
1573          * deletions.
1574          */
1575         if ((ip->i_mode & IFMT) == IFDIR &&
1576             pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1577                 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
1578         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1579         FREE_LOCK(&lk);
1580         setup_allocindir_phase2(bp, ip, aip);
1581 }
1582
1583 /*
1584  * Called just before setting an indirect block pointer to a
1585  * newly allocated indirect block.
1586  */
1587 void
1588 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
1589         struct buf *nbp;        /* newly allocated indirect block */
1590         struct inode *ip;       /* inode for file being extended */
1591         struct buf *bp;         /* indirect block referencing allocated block */
1592         int ptrno;              /* offset of pointer in indirect block */
1593         ufs_daddr_t newblkno;   /* disk block number being added */
1594 {
1595         struct allocindir *aip;
1596
1597         aip = newallocindir(ip, ptrno, newblkno, 0);
1598         ACQUIRE_LOCK(&lk);
1599         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1600         FREE_LOCK(&lk);
1601         setup_allocindir_phase2(bp, ip, aip);
1602 }
1603
1604 /*
1605  * Called to finish the allocation of the "aip" allocated
1606  * by one of the two routines above.
1607  */
1608 static void
1609 setup_allocindir_phase2(bp, ip, aip)
1610         struct buf *bp;         /* in-memory copy of the indirect block */
1611         struct inode *ip;       /* inode for file being extended */
1612         struct allocindir *aip; /* allocindir allocated by the above routines */
1613 {
1614         struct worklist *wk;
1615         struct indirdep *indirdep, *newindirdep;
1616         struct bmsafemap *bmsafemap;
1617         struct allocindir *oldaip;
1618         struct freefrag *freefrag;
1619         struct newblk *newblk;
1620
1621         if (bp->b_lblkno >= 0)
1622                 panic("setup_allocindir_phase2: not indir blk");
1623         for (indirdep = NULL, newindirdep = NULL; ; ) {
1624                 ACQUIRE_LOCK(&lk);
1625                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
1626                         if (wk->wk_type != D_INDIRDEP)
1627                                 continue;
1628                         indirdep = WK_INDIRDEP(wk);
1629                         break;
1630                 }
1631                 if (indirdep == NULL && newindirdep) {
1632                         indirdep = newindirdep;
1633                         WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
1634                         newindirdep = NULL;
1635                 }
1636                 FREE_LOCK(&lk);
1637                 if (indirdep) {
1638                         if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
1639                             &newblk) == 0)
1640                                 panic("setup_allocindir: lost block");
1641                         ACQUIRE_LOCK(&lk);
1642                         if (newblk->nb_state == DEPCOMPLETE) {
1643                                 aip->ai_state |= DEPCOMPLETE;
1644                                 aip->ai_buf = NULL;
1645                         } else {
1646                                 bmsafemap = newblk->nb_bmsafemap;
1647                                 aip->ai_buf = bmsafemap->sm_buf;
1648                                 LIST_REMOVE(newblk, nb_deps);
1649                                 LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
1650                                     aip, ai_deps);
1651                         }
1652                         LIST_REMOVE(newblk, nb_hash);
1653                         FREE(newblk, M_NEWBLK);
1654                         aip->ai_indirdep = indirdep;
1655                         /*
1656                          * Check to see if there is an existing dependency
1657                          * for this block. If there is, merge the old
1658                          * dependency into the new one.
1659                          */
1660                         if (aip->ai_oldblkno == 0)
1661                                 oldaip = NULL;
1662                         else
1663
1664                                 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
1665                                         if (oldaip->ai_offset == aip->ai_offset)
1666                                                 break;
1667                         if (oldaip != NULL) {
1668                                 if (oldaip->ai_newblkno != aip->ai_oldblkno) {
1669                                         FREE_LOCK(&lk);
1670                                         panic("setup_allocindir_phase2: blkno");
1671                                 }
1672                                 aip->ai_oldblkno = oldaip->ai_oldblkno;
1673                                 freefrag = oldaip->ai_freefrag;
1674                                 oldaip->ai_freefrag = aip->ai_freefrag;
1675                                 aip->ai_freefrag = freefrag;
1676                                 free_allocindir(oldaip, NULL);
1677                         }
1678                         LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
1679                         ((ufs_daddr_t *)indirdep->ir_savebp->b_data)
1680                             [aip->ai_offset] = aip->ai_oldblkno;
1681                         FREE_LOCK(&lk);
1682                 }
1683                 if (newindirdep) {
1684                         if (indirdep->ir_savebp != NULL)
1685                                 brelse(newindirdep->ir_savebp);
1686                         WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
1687                 }
1688                 if (indirdep)
1689                         break;
1690                 MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
1691                         M_INDIRDEP, M_SOFTDEP_FLAGS);
1692                 newindirdep->ir_list.wk_type = D_INDIRDEP;
1693                 newindirdep->ir_state = ATTACHED;
1694                 LIST_INIT(&newindirdep->ir_deplisthd);
1695                 LIST_INIT(&newindirdep->ir_donehd);
1696                 if (bp->b_blkno == bp->b_lblkno) {
1697                         VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
1698                                 NULL, NULL);
1699                 }
1700                 newindirdep->ir_savebp =
1701                     getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
1702                 BUF_KERNPROC(newindirdep->ir_savebp);
1703                 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
1704         }
1705 }
1706
1707 /*
1708  * Block de-allocation dependencies.
1709  *
1710  * When blocks are de-allocated, the on-disk pointers must be nullified before
1711  * the blocks are made available for use by other files.  (The true
1712  * requirement is that old pointers must be nullified before new on-disk
1713  * pointers are set.  We chose this slightly more stringent requirement to
1714  * reduce complexity.) Our implementation handles this dependency by updating
1715  * the inode (or indirect block) appropriately but delaying the actual block
1716  * de-allocation (i.e., freemap and free space count manipulation) until
1717  * after the updated versions reach stable storage.  After the disk is
1718  * updated, the blocks can be safely de-allocated whenever it is convenient.
1719  * This implementation handles only the common case of reducing a file's
1720  * length to zero. Other cases are handled by the conventional synchronous
1721  * write approach.
1722  *
1723  * The ffs implementation with which we worked double-checks
1724  * the state of the block pointers and file size as it reduces
1725  * a file's length.  Some of this code is replicated here in our
1726  * soft updates implementation.  The freeblks->fb_chkcnt field is
1727  * used to transfer a part of this information to the procedure
1728  * that eventually de-allocates the blocks.
1729  *
1730  * This routine should be called from the routine that shortens
1731  * a file's length, before the inode's size or block pointers
1732  * are modified. It will save the block pointer information for
1733  * later release and zero the inode so that the calling routine
1734  * can release it.
1735  */
1736 void
1737 softdep_setup_freeblocks(ip, length)
1738         struct inode *ip;       /* The inode whose length is to be reduced */
1739         off_t length;           /* The new length for the file */
1740 {
1741         struct freeblks *freeblks;
1742         struct inodedep *inodedep;
1743         struct allocdirect *adp;
1744         struct vnode *vp;
1745         struct buf *bp;
1746         struct fs *fs;
1747         int i, error, delay;
1748
1749         fs = ip->i_fs;
1750         if (length != 0)
1751                 panic("softde_setup_freeblocks: non-zero length");
1752         MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
1753                 M_FREEBLKS, M_SOFTDEP_FLAGS);
1754         bzero(freeblks, sizeof(struct freeblks));
1755         freeblks->fb_list.wk_type = D_FREEBLKS;
1756         freeblks->fb_uid = ip->i_uid;
1757         freeblks->fb_previousinum = ip->i_number;
1758         freeblks->fb_devvp = ip->i_devvp;
1759         freeblks->fb_fs = fs;
1760         freeblks->fb_oldsize = ip->i_size;
1761         freeblks->fb_newsize = length;
1762         freeblks->fb_chkcnt = ip->i_blocks;
1763         for (i = 0; i < NDADDR; i++) {
1764                 freeblks->fb_dblks[i] = ip->i_db[i];
1765                 ip->i_db[i] = 0;
1766         }
1767         for (i = 0; i < NIADDR; i++) {
1768                 freeblks->fb_iblks[i] = ip->i_ib[i];
1769                 ip->i_ib[i] = 0;
1770         }
1771         ip->i_blocks = 0;
1772         ip->i_size = 0;
1773         /*
1774          * Push the zero'ed inode to to its disk buffer so that we are free
1775          * to delete its dependencies below. Once the dependencies are gone
1776          * the buffer can be safely released.
1777          */
1778         if ((error = bread(ip->i_devvp,
1779             fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
1780             (int)fs->fs_bsize, NOCRED, &bp)) != 0)
1781                 softdep_error("softdep_setup_freeblocks", error);
1782         *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
1783             ip->i_din;
1784         /*
1785          * Find and eliminate any inode dependencies.
1786          */
1787         ACQUIRE_LOCK(&lk);
1788         (void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
1789         if ((inodedep->id_state & IOSTARTED) != 0) {
1790                 FREE_LOCK(&lk);
1791                 panic("softdep_setup_freeblocks: inode busy");
1792         }
1793         /*
1794          * Add the freeblks structure to the list of operations that
1795          * must await the zero'ed inode being written to disk. If we
1796          * still have a bitmap dependency (delay == 0), then the inode
1797          * has never been written to disk, so we can process the
1798          * freeblks below once we have deleted the dependencies.
1799          */
1800         delay = (inodedep->id_state & DEPCOMPLETE);
1801         if (delay)
1802                 WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
1803         /*
1804          * Because the file length has been truncated to zero, any
1805          * pending block allocation dependency structures associated
1806          * with this inode are obsolete and can simply be de-allocated.
1807          * We must first merge the two dependency lists to get rid of
1808          * any duplicate freefrag structures, then purge the merged list.
1809          */
1810         merge_inode_lists(inodedep);
1811         while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
1812                 free_allocdirect(&inodedep->id_inoupdt, adp, 1);
1813         FREE_LOCK(&lk);
1814         bdwrite(bp);
1815         /*
1816          * We must wait for any I/O in progress to finish so that
1817          * all potential buffers on the dirty list will be visible.
1818          * Once they are all there, walk the list and get rid of
1819          * any dependencies.
1820          */
1821         vp = ITOV(ip);
1822         ACQUIRE_LOCK(&lk);
1823         drain_output(vp, 1);
1824         while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) {
1825                 bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
1826                 (void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
1827                 deallocate_dependencies(bp, inodedep);
1828                 bp->b_flags |= B_INVAL | B_NOCACHE;
1829                 FREE_LOCK(&lk);
1830                 brelse(bp);
1831                 ACQUIRE_LOCK(&lk);
1832         }
1833         if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
1834                 (void)free_inodedep(inodedep);
1835         FREE_LOCK(&lk);
1836         /*
1837          * If the inode has never been written to disk (delay == 0),
1838          * then we can process the freeblks now that we have deleted
1839          * the dependencies.
1840          */
1841         if (!delay)
1842                 handle_workitem_freeblocks(freeblks);
1843 }
1844
1845 /*
1846  * Reclaim any dependency structures from a buffer that is about to
1847  * be reallocated to a new vnode. The buffer must be locked, thus,
1848  * no I/O completion operations can occur while we are manipulating
1849  * its associated dependencies. The mutex is held so that other I/O's
1850  * associated with related dependencies do not occur.
1851  */
1852 static void
1853 deallocate_dependencies(bp, inodedep)
1854         struct buf *bp;
1855         struct inodedep *inodedep;
1856 {
1857         struct worklist *wk;
1858         struct indirdep *indirdep;
1859         struct allocindir *aip;
1860         struct pagedep *pagedep;
1861         struct dirrem *dirrem;
1862         struct diradd *dap;
1863         int i;
1864
1865         while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
1866                 switch (wk->wk_type) {
1867
1868                 case D_INDIRDEP:
1869                         indirdep = WK_INDIRDEP(wk);
1870                         /*
1871                          * None of the indirect pointers will ever be visible,
1872                          * so they can simply be tossed. GOINGAWAY ensures
1873                          * that allocated pointers will be saved in the buffer
1874                          * cache until they are freed. Note that they will
1875                          * only be able to be found by their physical address
1876                          * since the inode mapping the logical address will
1877                          * be gone. The save buffer used for the safe copy
1878                          * was allocated in setup_allocindir_phase2 using
1879                          * the physical address so it could be used for this
1880                          * purpose. Hence we swap the safe copy with the real
1881                          * copy, allowing the safe copy to be freed and holding
1882                          * on to the real copy for later use in indir_trunc.
1883                          */
1884                         if (indirdep->ir_state & GOINGAWAY) {
1885                                 FREE_LOCK(&lk);
1886                                 panic("deallocate_dependencies: already gone");
1887                         }
1888                         indirdep->ir_state |= GOINGAWAY;
1889                         while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
1890                                 free_allocindir(aip, inodedep);
1891                         if (bp->b_lblkno >= 0 ||
1892                             bp->b_blkno != indirdep->ir_savebp->b_lblkno) {
1893                                 FREE_LOCK(&lk);
1894                                 panic("deallocate_dependencies: not indir");
1895                         }
1896                         bcopy(bp->b_data, indirdep->ir_savebp->b_data,
1897                             bp->b_bcount);
1898                         WORKLIST_REMOVE(wk);
1899                         WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
1900                         continue;
1901
1902                 case D_PAGEDEP:
1903                         pagedep = WK_PAGEDEP(wk);
1904                         /*
1905                          * None of the directory additions will ever be
1906                          * visible, so they can simply be tossed.
1907                          */
1908                         for (i = 0; i < DAHASHSZ; i++)
1909                                 while ((dap =
1910                                     LIST_FIRST(&pagedep->pd_diraddhd[i])))
1911                                         free_diradd(dap);
1912                         while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
1913                                 free_diradd(dap);
1914                         /*
1915                          * Copy any directory remove dependencies to the list
1916                          * to be processed after the zero'ed inode is written.
1917                          * If the inode has already been written, then they
1918                          * can be dumped directly onto the work list.
1919                          */
1920                         LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
1921                                 LIST_REMOVE(dirrem, dm_next);
1922                                 dirrem->dm_dirinum = pagedep->pd_ino;
1923                                 if (inodedep == NULL ||
1924                                     (inodedep->id_state & ALLCOMPLETE) ==
1925                                      ALLCOMPLETE)
1926                                         add_to_worklist(&dirrem->dm_list);
1927                                 else
1928                                         WORKLIST_INSERT(&inodedep->id_bufwait,
1929                                             &dirrem->dm_list);
1930                         }
1931                         WORKLIST_REMOVE(&pagedep->pd_list);
1932                         LIST_REMOVE(pagedep, pd_hash);
1933                         WORKITEM_FREE(pagedep, D_PAGEDEP);
1934                         continue;
1935
1936                 case D_ALLOCINDIR:
1937                         free_allocindir(WK_ALLOCINDIR(wk), inodedep);
1938                         continue;
1939
1940                 case D_ALLOCDIRECT:
1941                 case D_INODEDEP:
1942                         FREE_LOCK(&lk);
1943                         panic("deallocate_dependencies: Unexpected type %s",
1944                             TYPENAME(wk->wk_type));
1945                         /* NOTREACHED */
1946
1947                 default:
1948                         FREE_LOCK(&lk);
1949                         panic("deallocate_dependencies: Unknown type %s",
1950                             TYPENAME(wk->wk_type));
1951                         /* NOTREACHED */
1952                 }
1953         }
1954 }
1955
1956 /*
1957  * Free an allocdirect. Generate a new freefrag work request if appropriate.
1958  * This routine must be called with splbio interrupts blocked.
1959  */
1960 static void
1961 free_allocdirect(adphead, adp, delay)
1962         struct allocdirectlst *adphead;
1963         struct allocdirect *adp;
1964         int delay;
1965 {
1966
1967 #ifdef DEBUG
1968         if (lk.lkt_held == NOHOLDER)
1969                 panic("free_allocdirect: lock not held");
1970 #endif
1971         if ((adp->ad_state & DEPCOMPLETE) == 0)
1972                 LIST_REMOVE(adp, ad_deps);
1973         TAILQ_REMOVE(adphead, adp, ad_next);
1974         if ((adp->ad_state & COMPLETE) == 0)
1975                 WORKLIST_REMOVE(&adp->ad_list);
1976         if (adp->ad_freefrag != NULL) {
1977                 if (delay)
1978                         WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
1979                             &adp->ad_freefrag->ff_list);
1980                 else
1981                         add_to_worklist(&adp->ad_freefrag->ff_list);
1982         }
1983         WORKITEM_FREE(adp, D_ALLOCDIRECT);
1984 }
1985
1986 /*
1987  * Prepare an inode to be freed. The actual free operation is not
1988  * done until the zero'ed inode has been written to disk.
1989  */
1990 void
1991 softdep_freefile(pvp, ino, mode)
1992                 struct vnode *pvp;
1993                 ino_t ino;
1994                 int mode;
1995 {
1996         struct inode *ip = VTOI(pvp);
1997         struct inodedep *inodedep;
1998         struct freefile *freefile;
1999
2000         /*
2001          * This sets up the inode de-allocation dependency.
2002          */
2003         MALLOC(freefile, struct freefile *, sizeof(struct freefile),
2004                 M_FREEFILE, M_SOFTDEP_FLAGS);
2005         freefile->fx_list.wk_type = D_FREEFILE;
2006         freefile->fx_list.wk_state = 0;
2007         freefile->fx_mode = mode;
2008         freefile->fx_oldinum = ino;
2009         freefile->fx_devvp = ip->i_devvp;
2010         freefile->fx_fs = ip->i_fs;
2011
2012         /*
2013          * If the inodedep does not exist, then the zero'ed inode has
2014          * been written to disk. If the allocated inode has never been
2015          * written to disk, then the on-disk inode is zero'ed. In either
2016          * case we can free the file immediately.
2017          */
2018         ACQUIRE_LOCK(&lk);
2019         if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 ||
2020             check_inode_unwritten(inodedep)) {
2021                 FREE_LOCK(&lk);
2022                 handle_workitem_freefile(freefile);
2023                 return;
2024         }
2025         WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
2026         FREE_LOCK(&lk);
2027 }
2028
2029 /*
2030  * Check to see if an inode has never been written to disk. If
2031  * so free the inodedep and return success, otherwise return failure.
2032  * This routine must be called with splbio interrupts blocked.
2033  *
2034  * If we still have a bitmap dependency, then the inode has never
2035  * been written to disk. Drop the dependency as it is no longer
2036  * necessary since the inode is being deallocated. We set the
2037  * ALLCOMPLETE flags since the bitmap now properly shows that the
2038  * inode is not allocated. Even if the inode is actively being
2039  * written, it has been rolled back to its zero'ed state, so we
2040  * are ensured that a zero inode is what is on the disk. For short
2041  * lived files, this change will usually result in removing all the
2042  * dependencies from the inode so that it can be freed immediately.
2043  */
2044 static int
2045 check_inode_unwritten(inodedep)
2046         struct inodedep *inodedep;
2047 {
2048
2049         if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
2050             LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2051             LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2052             LIST_FIRST(&inodedep->id_inowait) != NULL ||
2053             TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2054             TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2055             inodedep->id_nlinkdelta != 0)
2056                 return (0);
2057         inodedep->id_state |= ALLCOMPLETE;
2058         LIST_REMOVE(inodedep, id_deps);
2059         inodedep->id_buf = NULL;
2060         if (inodedep->id_state & ONWORKLIST)
2061                 WORKLIST_REMOVE(&inodedep->id_list);
2062         if (inodedep->id_savedino != NULL) {
2063                 FREE(inodedep->id_savedino, M_INODEDEP);
2064                 inodedep->id_savedino = NULL;
2065         }
2066         if (free_inodedep(inodedep) == 0) {
2067                 FREE_LOCK(&lk);
2068                 panic("check_inode_unwritten: busy inode");
2069         }
2070         return (1);
2071 }
2072
2073 /*
2074  * Try to free an inodedep structure. Return 1 if it could be freed.
2075  */
2076 static int
2077 free_inodedep(inodedep)
2078         struct inodedep *inodedep;
2079 {
2080
2081         if ((inodedep->id_state & ONWORKLIST) != 0 ||
2082             (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
2083             LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2084             LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2085             LIST_FIRST(&inodedep->id_inowait) != NULL ||
2086             TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2087             TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2088             inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL)
2089                 return (0);
2090         LIST_REMOVE(inodedep, id_hash);
2091         WORKITEM_FREE(inodedep, D_INODEDEP);
2092         num_inodedep -= 1;
2093         return (1);
2094 }
2095
2096 /*
2097  * This workitem routine performs the block de-allocation.
2098  * The workitem is added to the pending list after the updated
2099  * inode block has been written to disk.  As mentioned above,
2100  * checks regarding the number of blocks de-allocated (compared
2101  * to the number of blocks allocated for the file) are also
2102  * performed in this function.
2103  */
2104 static void
2105 handle_workitem_freeblocks(freeblks)
2106         struct freeblks *freeblks;
2107 {
2108         struct inode tip;
2109         ufs_daddr_t bn;
2110         struct fs *fs;
2111         int i, level, bsize;
2112         long nblocks, blocksreleased = 0;
2113         int error, allerror = 0;
2114         ufs_lbn_t baselbns[NIADDR], tmpval;
2115
2116         tip.i_number = freeblks->fb_previousinum;
2117         tip.i_devvp = freeblks->fb_devvp;
2118         tip.i_dev = freeblks->fb_devvp->v_rdev;
2119         tip.i_fs = freeblks->fb_fs;
2120         tip.i_size = freeblks->fb_oldsize;
2121         tip.i_uid = freeblks->fb_uid;
2122         fs = freeblks->fb_fs;
2123         tmpval = 1;
2124         baselbns[0] = NDADDR;
2125         for (i = 1; i < NIADDR; i++) {
2126                 tmpval *= NINDIR(fs);
2127                 baselbns[i] = baselbns[i - 1] + tmpval;
2128         }
2129         nblocks = btodb(fs->fs_bsize);
2130         blocksreleased = 0;
2131         /*
2132          * Indirect blocks first.
2133          */
2134         for (level = (NIADDR - 1); level >= 0; level--) {
2135                 if ((bn = freeblks->fb_iblks[level]) == 0)
2136                         continue;
2137                 if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
2138                     baselbns[level], &blocksreleased)) == 0)
2139                         allerror = error;
2140                 ffs_blkfree(&tip, bn, fs->fs_bsize);
2141                 blocksreleased += nblocks;
2142         }
2143         /*
2144          * All direct blocks or frags.
2145          */
2146         for (i = (NDADDR - 1); i >= 0; i--) {
2147                 if ((bn = freeblks->fb_dblks[i]) == 0)
2148                         continue;
2149                 bsize = blksize(fs, &tip, i);
2150                 ffs_blkfree(&tip, bn, bsize);
2151                 blocksreleased += btodb(bsize);
2152         }
2153
2154 #ifdef DIAGNOSTIC
2155         if (freeblks->fb_chkcnt != blocksreleased)
2156                 printf("handle_workitem_freeblocks: block count\n");
2157         if (allerror)
2158                 softdep_error("handle_workitem_freeblks", allerror);
2159 #endif /* DIAGNOSTIC */
2160         WORKITEM_FREE(freeblks, D_FREEBLKS);
2161 }
2162
2163 /*
2164  * Release blocks associated with the inode ip and stored in the indirect
2165  * block dbn. If level is greater than SINGLE, the block is an indirect block
2166  * and recursive calls to indirtrunc must be used to cleanse other indirect
2167  * blocks.
2168  */
2169 static int
2170 indir_trunc(ip, dbn, level, lbn, countp)
2171         struct inode *ip;
2172         ufs_daddr_t dbn;
2173         int level;
2174         ufs_lbn_t lbn;
2175         long *countp;
2176 {
2177         struct buf *bp;
2178         ufs_daddr_t *bap;
2179         ufs_daddr_t nb;
2180         struct fs *fs;
2181         struct worklist *wk;
2182         struct indirdep *indirdep;
2183         int i, lbnadd, nblocks;
2184         int error, allerror = 0;
2185
2186         fs = ip->i_fs;
2187         lbnadd = 1;
2188         for (i = level; i > 0; i--)
2189                 lbnadd *= NINDIR(fs);
2190         /*
2191          * Get buffer of block pointers to be freed. This routine is not
2192          * called until the zero'ed inode has been written, so it is safe
2193          * to free blocks as they are encountered. Because the inode has
2194          * been zero'ed, calls to bmap on these blocks will fail. So, we
2195          * have to use the on-disk address and the block device for the
2196          * filesystem to look them up. If the file was deleted before its
2197          * indirect blocks were all written to disk, the routine that set
2198          * us up (deallocate_dependencies) will have arranged to leave
2199          * a complete copy of the indirect block in memory for our use.
2200          * Otherwise we have to read the blocks in from the disk.
2201          */
2202         ACQUIRE_LOCK(&lk);
2203         if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
2204             (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2205                 if (wk->wk_type != D_INDIRDEP ||
2206                     (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2207                     (indirdep->ir_state & GOINGAWAY) == 0) {
2208                         FREE_LOCK(&lk);
2209                         panic("indir_trunc: lost indirdep");
2210                 }
2211                 WORKLIST_REMOVE(wk);
2212                 WORKITEM_FREE(indirdep, D_INDIRDEP);
2213                 if (LIST_FIRST(&bp->b_dep) != NULL) {
2214                         FREE_LOCK(&lk);
2215                         panic("indir_trunc: dangling dep");
2216                 }
2217                 FREE_LOCK(&lk);
2218         } else {
2219                 FREE_LOCK(&lk);
2220                 error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
2221                 if (error)
2222                         return (error);
2223         }
2224         /*
2225          * Recursively free indirect blocks.
2226          */
2227         bap = (ufs_daddr_t *)bp->b_data;
2228         nblocks = btodb(fs->fs_bsize);
2229         for (i = NINDIR(fs) - 1; i >= 0; i--) {
2230                 if ((nb = bap[i]) == 0)
2231                         continue;
2232                 if (level != 0) {
2233                         if ((error = indir_trunc(ip, fsbtodb(fs, nb),
2234                              level - 1, lbn + (i * lbnadd), countp)) != 0)
2235                                 allerror = error;
2236                 }
2237                 ffs_blkfree(ip, nb, fs->fs_bsize);
2238                 *countp += nblocks;
2239         }
2240         bp->b_flags |= B_INVAL | B_NOCACHE;
2241         brelse(bp);
2242         return (allerror);
2243 }
2244
2245 /*
2246  * Free an allocindir.
2247  * This routine must be called with splbio interrupts blocked.
2248  */
2249 static void
2250 free_allocindir(aip, inodedep)
2251         struct allocindir *aip;
2252         struct inodedep *inodedep;
2253 {
2254         struct freefrag *freefrag;
2255
2256 #ifdef DEBUG
2257         if (lk.lkt_held == NOHOLDER)
2258                 panic("free_allocindir: lock not held");
2259 #endif
2260         if ((aip->ai_state & DEPCOMPLETE) == 0)
2261                 LIST_REMOVE(aip, ai_deps);
2262         if (aip->ai_state & ONWORKLIST)
2263                 WORKLIST_REMOVE(&aip->ai_list);
2264         LIST_REMOVE(aip, ai_next);
2265         if ((freefrag = aip->ai_freefrag) != NULL) {
2266                 if (inodedep == NULL)
2267                         add_to_worklist(&freefrag->ff_list);
2268                 else
2269                         WORKLIST_INSERT(&inodedep->id_bufwait,
2270                             &freefrag->ff_list);
2271         }
2272         WORKITEM_FREE(aip, D_ALLOCINDIR);
2273 }
2274
2275 /*
2276  * Directory entry addition dependencies.
2277  *
2278  * When adding a new directory entry, the inode (with its incremented link
2279  * count) must be written to disk before the directory entry's pointer to it.
2280  * Also, if the inode is newly allocated, the corresponding freemap must be
2281  * updated (on disk) before the directory entry's pointer. These requirements
2282  * are met via undo/redo on the directory entry's pointer, which consists
2283  * simply of the inode number.
2284  *
2285  * As directory entries are added and deleted, the free space within a
2286  * directory block can become fragmented.  The ufs file system will compact
2287  * a fragmented directory block to make space for a new entry. When this
2288  * occurs, the offsets of previously added entries change. Any "diradd"
2289  * dependency structures corresponding to these entries must be updated with
2290  * the new offsets.
2291  */
2292
2293 /*
2294  * This routine is called after the in-memory inode's link
2295  * count has been incremented, but before the directory entry's
2296  * pointer to the inode has been set.
2297  */
2298 void
2299 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
2300         struct buf *bp;         /* buffer containing directory block */
2301         struct inode *dp;       /* inode for directory */
2302         off_t diroffset;        /* offset of new entry in directory */
2303         long newinum;           /* inode referenced by new directory entry */
2304         struct buf *newdirbp;   /* non-NULL => contents of new mkdir */
2305 {
2306         int offset;             /* offset of new entry within directory block */
2307         ufs_lbn_t lbn;          /* block in directory containing new entry */
2308         struct fs *fs;
2309         struct diradd *dap;
2310         struct pagedep *pagedep;
2311         struct inodedep *inodedep;
2312         struct mkdir *mkdir1, *mkdir2;
2313
2314         /*
2315          * Whiteouts have no dependencies.
2316          */
2317         if (newinum == WINO) {
2318                 if (newdirbp != NULL)
2319                         bdwrite(newdirbp);
2320                 return;
2321         }
2322
2323         fs = dp->i_fs;
2324         lbn = lblkno(fs, diroffset);
2325         offset = blkoff(fs, diroffset);
2326         MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
2327             M_SOFTDEP_FLAGS);
2328         bzero(dap, sizeof(struct diradd));
2329         dap->da_list.wk_type = D_DIRADD;
2330         dap->da_offset = offset;
2331         dap->da_newinum = newinum;
2332         dap->da_state = ATTACHED;
2333         if (newdirbp == NULL) {
2334                 dap->da_state |= DEPCOMPLETE;
2335                 ACQUIRE_LOCK(&lk);
2336         } else {
2337                 dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
2338                 MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2339                     M_SOFTDEP_FLAGS);
2340                 mkdir1->md_list.wk_type = D_MKDIR;
2341                 mkdir1->md_state = MKDIR_BODY;
2342                 mkdir1->md_diradd = dap;
2343                 MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2344                     M_SOFTDEP_FLAGS);
2345                 mkdir2->md_list.wk_type = D_MKDIR;
2346                 mkdir2->md_state = MKDIR_PARENT;
2347                 mkdir2->md_diradd = dap;
2348                 /*
2349                  * Dependency on "." and ".." being written to disk.
2350                  */
2351                 mkdir1->md_buf = newdirbp;
2352                 ACQUIRE_LOCK(&lk);
2353                 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2354                 WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
2355                 FREE_LOCK(&lk);
2356                 bdwrite(newdirbp);
2357                 /*
2358                  * Dependency on link count increase for parent directory
2359                  */
2360                 ACQUIRE_LOCK(&lk);
2361                 if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
2362                     || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2363                         dap->da_state &= ~MKDIR_PARENT;
2364                         WORKITEM_FREE(mkdir2, D_MKDIR);
2365                 } else {
2366                         LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2367                         WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2368                 }
2369         }
2370         /*
2371          * Link into parent directory pagedep to await its being written.
2372          */
2373         if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2374                 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2375         dap->da_pagedep = pagedep;
2376         LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2377             da_pdlist);
2378         /*
2379          * Link into its inodedep. Put it on the id_bufwait list if the inode
2380          * is not yet written. If it is written, do the post-inode write
2381          * processing to put it on the id_pendinghd list.
2382          */
2383         (void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2384         if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2385                 diradd_inode_written(dap, inodedep);
2386         else
2387                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2388         FREE_LOCK(&lk);
2389 }
2390
2391 /*
2392  * This procedure is called to change the offset of a directory
2393  * entry when compacting a directory block which must be owned
2394  * exclusively by the caller. Note that the actual entry movement
2395  * must be done in this procedure to ensure that no I/O completions
2396  * occur while the move is in progress.
2397  */
2398 void
2399 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
2400         struct inode *dp;       /* inode for directory */
2401         caddr_t base;           /* address of dp->i_offset */
2402         caddr_t oldloc;         /* address of old directory location */
2403         caddr_t newloc;         /* address of new directory location */
2404         int entrysize;          /* size of directory entry */
2405 {
2406         int offset, oldoffset, newoffset;
2407         struct pagedep *pagedep;
2408         struct diradd *dap;
2409         ufs_lbn_t lbn;
2410
2411         ACQUIRE_LOCK(&lk);
2412         lbn = lblkno(dp->i_fs, dp->i_offset);
2413         offset = blkoff(dp->i_fs, dp->i_offset);
2414         if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2415                 goto done;
2416         oldoffset = offset + (oldloc - base);
2417         newoffset = offset + (newloc - base);
2418
2419         LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
2420                 if (dap->da_offset != oldoffset)
2421                         continue;
2422                 dap->da_offset = newoffset;
2423                 if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2424                         break;
2425                 LIST_REMOVE(dap, da_pdlist);
2426                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2427                     dap, da_pdlist);
2428                 break;
2429         }
2430         if (dap == NULL) {
2431
2432                 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
2433                         if (dap->da_offset == oldoffset) {
2434                                 dap->da_offset = newoffset;
2435                                 break;
2436                         }
2437                 }
2438         }
2439 done:
2440         bcopy(oldloc, newloc, entrysize);
2441         FREE_LOCK(&lk);
2442 }
2443
2444 /*
2445  * Free a diradd dependency structure. This routine must be called
2446  * with splbio interrupts blocked.
2447  */
2448 static void
2449 free_diradd(dap)
2450         struct diradd *dap;
2451 {
2452         struct dirrem *dirrem;
2453         struct pagedep *pagedep;
2454         struct inodedep *inodedep;
2455         struct mkdir *mkdir, *nextmd;
2456
2457 #ifdef DEBUG
2458         if (lk.lkt_held == NOHOLDER)
2459                 panic("free_diradd: lock not held");
2460 #endif
2461         WORKLIST_REMOVE(&dap->da_list);
2462         LIST_REMOVE(dap, da_pdlist);
2463         if ((dap->da_state & DIRCHG) == 0) {
2464                 pagedep = dap->da_pagedep;
2465         } else {
2466                 dirrem = dap->da_previous;
2467                 pagedep = dirrem->dm_pagedep;
2468                 dirrem->dm_dirinum = pagedep->pd_ino;
2469                 add_to_worklist(&dirrem->dm_list);
2470         }
2471         if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2472             0, &inodedep) != 0)
2473                 (void) free_inodedep(inodedep);
2474         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2475                 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2476                         nextmd = LIST_NEXT(mkdir, md_mkdirs);
2477                         if (mkdir->md_diradd != dap)
2478                                 continue;
2479                         dap->da_state &= ~mkdir->md_state;
2480                         WORKLIST_REMOVE(&mkdir->md_list);
2481                         LIST_REMOVE(mkdir, md_mkdirs);
2482                         WORKITEM_FREE(mkdir, D_MKDIR);
2483                 }
2484                 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2485                         FREE_LOCK(&lk);
2486                         panic("free_diradd: unfound ref");
2487                 }
2488         }
2489         WORKITEM_FREE(dap, D_DIRADD);
2490 }
2491
2492 /*
2493  * Directory entry removal dependencies.
2494  *
2495  * When removing a directory entry, the entry's inode pointer must be
2496  * zero'ed on disk before the corresponding inode's link count is decremented
2497  * (possibly freeing the inode for re-use). This dependency is handled by
2498  * updating the directory entry but delaying the inode count reduction until
2499  * after the directory block has been written to disk. After this point, the
2500  * inode count can be decremented whenever it is convenient.
2501  */
2502
2503 /*
2504  * This routine should be called immediately after removing
2505  * a directory entry.  The inode's link count should not be
2506  * decremented by the calling procedure -- the soft updates
2507  * code will do this task when it is safe.
2508  */
2509 void
2510 softdep_setup_remove(bp, dp, ip, isrmdir)
2511         struct buf *bp;         /* buffer containing directory block */
2512         struct inode *dp;       /* inode for the directory being modified */
2513         struct inode *ip;       /* inode for directory entry being removed */
2514         int isrmdir;            /* indicates if doing RMDIR */
2515 {
2516         struct dirrem *dirrem, *prevdirrem;
2517
2518         /*
2519          * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2520          */
2521         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2522
2523         /*
2524          * If the COMPLETE flag is clear, then there were no active
2525          * entries and we want to roll back to a zeroed entry until
2526          * the new inode is committed to disk. If the COMPLETE flag is
2527          * set then we have deleted an entry that never made it to
2528          * disk. If the entry we deleted resulted from a name change,
2529          * then the old name still resides on disk. We cannot delete
2530          * its inode (returned to us in prevdirrem) until the zeroed
2531          * directory entry gets to disk. The new inode has never been
2532          * referenced on the disk, so can be deleted immediately.
2533          */
2534         if ((dirrem->dm_state & COMPLETE) == 0) {
2535                 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2536                     dm_next);
2537                 FREE_LOCK(&lk);
2538         } else {
2539                 if (prevdirrem != NULL)
2540                         LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
2541                             prevdirrem, dm_next);
2542                 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2543                 FREE_LOCK(&lk);
2544                 handle_workitem_remove(dirrem);
2545         }
2546 }
2547
2548 /*
2549  * Allocate a new dirrem if appropriate and return it along with
2550  * its associated pagedep. Called without a lock, returns with lock.
2551  */
2552 static long num_dirrem;         /* number of dirrem allocated */
2553 static struct dirrem *
2554 newdirrem(bp, dp, ip, isrmdir, prevdirremp)
2555         struct buf *bp;         /* buffer containing directory block */
2556         struct inode *dp;       /* inode for the directory being modified */
2557         struct inode *ip;       /* inode for directory entry being removed */
2558         int isrmdir;            /* indicates if doing RMDIR */
2559         struct dirrem **prevdirremp; /* previously referenced inode, if any */
2560 {
2561         int offset;
2562         ufs_lbn_t lbn;
2563         struct diradd *dap;
2564         struct dirrem *dirrem;
2565         struct pagedep *pagedep;
2566
2567         /*
2568          * Whiteouts have no deletion dependencies.
2569          */
2570         if (ip == NULL)
2571                 panic("newdirrem: whiteout");
2572         /*
2573          * If we are over our limit, try to improve the situation.
2574          * Limiting the number of dirrem structures will also limit
2575          * the number of freefile and freeblks structures.
2576          */
2577         if (num_dirrem > max_softdeps / 2 && speedup_syncer() == 0)
2578                 (void) request_cleanup(FLUSH_REMOVE, 0);
2579         num_dirrem += 1;
2580         MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
2581                 M_DIRREM, M_SOFTDEP_FLAGS);
2582         bzero(dirrem, sizeof(struct dirrem));
2583         dirrem->dm_list.wk_type = D_DIRREM;
2584         dirrem->dm_state = isrmdir ? RMDIR : 0;
2585         dirrem->dm_mnt = ITOV(ip)->v_mount;
2586         dirrem->dm_oldinum = ip->i_number;
2587         *prevdirremp = NULL;
2588
2589         ACQUIRE_LOCK(&lk);
2590         lbn = lblkno(dp->i_fs, dp->i_offset);
2591         offset = blkoff(dp->i_fs, dp->i_offset);
2592         if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2593                 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2594         dirrem->dm_pagedep = pagedep;
2595         /*
2596          * Check for a diradd dependency for the same directory entry.
2597          * If present, then both dependencies become obsolete and can
2598          * be de-allocated. Check for an entry on both the pd_dirraddhd
2599          * list and the pd_pendinghd list.
2600          */
2601
2602         LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
2603                 if (dap->da_offset == offset)
2604                         break;
2605         if (dap == NULL) {
2606
2607                 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
2608                         if (dap->da_offset == offset)
2609                                 break;
2610                 if (dap == NULL)
2611                         return (dirrem);
2612         }
2613         /*
2614          * Must be ATTACHED at this point.
2615          */
2616         if ((dap->da_state & ATTACHED) == 0) {
2617                 FREE_LOCK(&lk);
2618                 panic("newdirrem: not ATTACHED");
2619         }
2620         if (dap->da_newinum != ip->i_number) {
2621                 FREE_LOCK(&lk);
2622                 panic("newdirrem: inum %d should be %d",
2623                     ip->i_number, dap->da_newinum);
2624         }
2625         /*
2626          * If we are deleting a changed name that never made it to disk,
2627          * then return the dirrem describing the previous inode (which
2628          * represents the inode currently referenced from this entry on disk).
2629          */
2630         if ((dap->da_state & DIRCHG) != 0) {
2631                 *prevdirremp = dap->da_previous;
2632                 dap->da_state &= ~DIRCHG;
2633                 dap->da_pagedep = pagedep;
2634         }
2635         /*
2636          * We are deleting an entry that never made it to disk.
2637          * Mark it COMPLETE so we can delete its inode immediately.
2638          */
2639         dirrem->dm_state |= COMPLETE;
2640         free_diradd(dap);
2641         return (dirrem);
2642 }
2643
2644 /*
2645  * Directory entry change dependencies.
2646  *
2647  * Changing an existing directory entry requires that an add operation
2648  * be completed first followed by a deletion. The semantics for the addition
2649  * are identical to the description of adding a new entry above except
2650  * that the rollback is to the old inode number rather than zero. Once
2651  * the addition dependency is completed, the removal is done as described
2652  * in the removal routine above.
2653  */
2654
2655 /*
2656  * This routine should be called immediately after changing
2657  * a directory entry.  The inode's link count should not be
2658  * decremented by the calling procedure -- the soft updates
2659  * code will perform this task when it is safe.
2660  */
2661 void
2662 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
2663         struct buf *bp;         /* buffer containing directory block */
2664         struct inode *dp;       /* inode for the directory being modified */
2665         struct inode *ip;       /* inode for directory entry being removed */
2666         long newinum;           /* new inode number for changed entry */
2667         int isrmdir;            /* indicates if doing RMDIR */
2668 {
2669         int offset;
2670         struct diradd *dap = NULL;
2671         struct dirrem *dirrem, *prevdirrem;
2672         struct pagedep *pagedep;
2673         struct inodedep *inodedep;
2674
2675         offset = blkoff(dp->i_fs, dp->i_offset);
2676
2677         /*
2678          * Whiteouts do not need diradd dependencies.
2679          */
2680         if (newinum != WINO) {
2681                 MALLOC(dap, struct diradd *, sizeof(struct diradd),
2682                     M_DIRADD, M_SOFTDEP_FLAGS);
2683                 bzero(dap, sizeof(struct diradd));
2684                 dap->da_list.wk_type = D_DIRADD;
2685                 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
2686                 dap->da_offset = offset;
2687                 dap->da_newinum = newinum;
2688         }
2689
2690         /*
2691          * Allocate a new dirrem and ACQUIRE_LOCK.
2692          */
2693         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2694         pagedep = dirrem->dm_pagedep;
2695         /*
2696          * The possible values for isrmdir:
2697          *      0 - non-directory file rename
2698          *      1 - directory rename within same directory
2699          *   inum - directory rename to new directory of given inode number
2700          * When renaming to a new directory, we are both deleting and
2701          * creating a new directory entry, so the link count on the new
2702          * directory should not change. Thus we do not need the followup
2703          * dirrem which is usually done in handle_workitem_remove. We set
2704          * the DIRCHG flag to tell handle_workitem_remove to skip the
2705          * followup dirrem.
2706          */
2707         if (isrmdir > 1)
2708                 dirrem->dm_state |= DIRCHG;
2709
2710         /*
2711          * Whiteouts have no additional dependencies,
2712          * so just put the dirrem on the correct list.
2713          */
2714         if (newinum == WINO) {
2715                 if ((dirrem->dm_state & COMPLETE) == 0) {
2716                         LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
2717                             dm_next);
2718                 } else {
2719                         dirrem->dm_dirinum = pagedep->pd_ino;
2720                         add_to_worklist(&dirrem->dm_list);
2721                 }
2722                 FREE_LOCK(&lk);
2723                 return;
2724         }
2725
2726         /*
2727          * If the COMPLETE flag is clear, then there were no active
2728          * entries and we want to roll back to the previous inode until
2729          * the new inode is committed to disk. If the COMPLETE flag is
2730          * set, then we have deleted an entry that never made it to disk.
2731          * If the entry we deleted resulted from a name change, then the old
2732          * inode reference still resides on disk. Any rollback that we do
2733          * needs to be to that old inode (returned to us in prevdirrem). If
2734          * the entry we deleted resulted from a create, then there is
2735          * no entry on the disk, so we want to roll back to zero rather
2736          * than the uncommitted inode. In either of the COMPLETE cases we
2737          * want to immediately free the unwritten and unreferenced inode.
2738          */
2739         if ((dirrem->dm_state & COMPLETE) == 0) {
2740                 dap->da_previous = dirrem;
2741         } else {
2742                 if (prevdirrem != NULL) {
2743                         dap->da_previous = prevdirrem;
2744                 } else {
2745                         dap->da_state &= ~DIRCHG;
2746                         dap->da_pagedep = pagedep;
2747                 }
2748                 dirrem->dm_dirinum = pagedep->pd_ino;
2749                 add_to_worklist(&dirrem->dm_list);
2750         }
2751         /*
2752          * Link into its inodedep. Put it on the id_bufwait list if the inode
2753          * is not yet written. If it is written, do the post-inode write
2754          * processing to put it on the id_pendinghd list.
2755          */
2756         if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
2757             (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2758                 dap->da_state |= COMPLETE;
2759                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
2760                 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
2761         } else {
2762                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
2763                     dap, da_pdlist);
2764                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2765         }
2766         FREE_LOCK(&lk);
2767 }
2768
2769 /*
2770  * Called whenever the link count on an inode is changed.
2771  * It creates an inode dependency so that the new reference(s)
2772  * to the inode cannot be committed to disk until the updated
2773  * inode has been written.
2774  */
2775 void
2776 softdep_change_linkcnt(ip)
2777         struct inode *ip;       /* the inode with the increased link count */
2778 {
2779         struct inodedep *inodedep;
2780
2781         ACQUIRE_LOCK(&lk);
2782         (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
2783         if (ip->i_nlink < ip->i_effnlink) {
2784                 FREE_LOCK(&lk);
2785                 panic("softdep_change_linkcnt: bad delta");
2786         }
2787         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2788         FREE_LOCK(&lk);
2789 }
2790
2791 /*
2792  * This workitem decrements the inode's link count.
2793  * If the link count reaches zero, the file is removed.
2794  */
2795 static void
2796 handle_workitem_remove(dirrem)
2797         struct dirrem *dirrem;
2798 {
2799         struct thread *td = curthread;  /* XXX */
2800         struct ucred *cred;
2801         struct inodedep *inodedep;
2802         struct vnode *vp;
2803         struct inode *ip;
2804         ino_t oldinum;
2805         int error;
2806
2807         KKASSERT(td->td_proc);
2808         cred = td->td_proc->p_ucred;
2809         if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
2810                 softdep_error("handle_workitem_remove: vget", error);
2811                 return;
2812         }
2813         ip = VTOI(vp);
2814         ACQUIRE_LOCK(&lk);
2815         if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0){
2816                 FREE_LOCK(&lk);
2817                 panic("handle_workitem_remove: lost inodedep");
2818         }
2819         /*
2820          * Normal file deletion.
2821          */
2822         if ((dirrem->dm_state & RMDIR) == 0) {
2823                 ip->i_nlink--;
2824                 ip->i_flag |= IN_CHANGE;
2825                 if (ip->i_nlink < ip->i_effnlink) {
2826                         FREE_LOCK(&lk);
2827                         panic("handle_workitem_remove: bad file delta");
2828                 }
2829                 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2830                 FREE_LOCK(&lk);
2831                 vput(vp);
2832                 num_dirrem -= 1;
2833                 WORKITEM_FREE(dirrem, D_DIRREM);
2834                 return;
2835         }
2836         /*
2837          * Directory deletion. Decrement reference count for both the
2838          * just deleted parent directory entry and the reference for ".".
2839          * Next truncate the directory to length zero. When the
2840          * truncation completes, arrange to have the reference count on
2841          * the parent decremented to account for the loss of "..".
2842          */
2843         ip->i_nlink -= 2;
2844         ip->i_flag |= IN_CHANGE;
2845         if (ip->i_nlink < ip->i_effnlink) {
2846                 FREE_LOCK(&lk);
2847                 panic("handle_workitem_remove: bad dir delta");
2848         }
2849         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2850         FREE_LOCK(&lk);
2851         if ((error = UFS_TRUNCATE(vp, (off_t)0, 0,cred, td)) != 0)
2852                 softdep_error("handle_workitem_remove: truncate", error);
2853         /*
2854          * Rename a directory to a new parent. Since, we are both deleting
2855          * and creating a new directory entry, the link count on the new
2856          * directory should not change. Thus we skip the followup dirrem.
2857          */
2858         if (dirrem->dm_state & DIRCHG) {
2859                 vput(vp);
2860                 num_dirrem -= 1;
2861                 WORKITEM_FREE(dirrem, D_DIRREM);
2862                 return;
2863         }
2864         /*
2865          * If the inodedep does not exist, then the zero'ed inode has
2866          * been written to disk. If the allocated inode has never been
2867          * written to disk, then the on-disk inode is zero'ed. In either
2868          * case we can remove the file immediately.
2869          */
2870         ACQUIRE_LOCK(&lk);
2871         dirrem->dm_state = 0;
2872         oldinum = dirrem->dm_oldinum;
2873         dirrem->dm_oldinum = dirrem->dm_dirinum;
2874         if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
2875             check_inode_unwritten(inodedep)) {
2876                 FREE_LOCK(&lk);
2877                 vput(vp);
2878                 handle_workitem_remove(dirrem);
2879                 return;
2880         }
2881         WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
2882         FREE_LOCK(&lk);
2883         vput(vp);
2884 }
2885
2886 /*
2887  * Inode de-allocation dependencies.
2888  *
2889  * When an inode's link count is reduced to zero, it can be de-allocated. We
2890  * found it convenient to postpone de-allocation until after the inode is
2891  * written to disk with its new link count (zero).  At this point, all of the
2892  * on-disk inode's block pointers are nullified and, with careful dependency
2893  * list ordering, all dependencies related to the inode will be satisfied and
2894  * the corresponding dependency structures de-allocated.  So, if/when the
2895  * inode is reused, there will be no mixing of old dependencies with new
2896  * ones.  This artificial dependency is set up by the block de-allocation
2897  * procedure above (softdep_setup_freeblocks) and completed by the
2898  * following procedure.
2899  */
2900 static void
2901 handle_workitem_freefile(freefile)
2902         struct freefile *freefile;
2903 {
2904         struct vnode vp;
2905         struct inode tip;
2906         struct inodedep *idp;
2907         int error;
2908
2909 #ifdef DEBUG
2910         ACQUIRE_LOCK(&lk);
2911         error = inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp);
2912         FREE_LOCK(&lk);
2913         if (error)
2914                 panic("handle_workitem_freefile: inodedep survived");
2915 #endif
2916         tip.i_devvp = freefile->fx_devvp;
2917         tip.i_dev = freefile->fx_devvp->v_rdev;
2918         tip.i_fs = freefile->fx_fs;
2919         vp.v_data = &tip;
2920         if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
2921                 softdep_error("handle_workitem_freefile", error);
2922         WORKITEM_FREE(freefile, D_FREEFILE);
2923 }
2924
2925 /*
2926  * Disk writes.
2927  *
2928  * The dependency structures constructed above are most actively used when file
2929  * system blocks are written to disk.  No constraints are placed on when a
2930  * block can be written, but unsatisfied update dependencies are made safe by
2931  * modifying (or replacing) the source memory for the duration of the disk
2932  * write.  When the disk write completes, the memory block is again brought
2933  * up-to-date.
2934  *
2935  * In-core inode structure reclamation.
2936  *
2937  * Because there are a finite number of "in-core" inode structures, they are
2938  * reused regularly.  By transferring all inode-related dependencies to the
2939  * in-memory inode block and indexing them separately (via "inodedep"s), we
2940  * can allow "in-core" inode structures to be reused at any time and avoid
2941  * any increase in contention.
2942  *
2943  * Called just before entering the device driver to initiate a new disk I/O.
2944  * The buffer must be locked, thus, no I/O completion operations can occur
2945  * while we are manipulating its associated dependencies.
2946  */
2947 static void
2948 softdep_disk_io_initiation(bp)
2949         struct buf *bp;         /* structure describing disk write to occur */
2950 {
2951         struct worklist *wk, *nextwk;
2952         struct indirdep *indirdep;
2953
2954         /*
2955          * We only care about write operations. There should never
2956          * be dependencies for reads.
2957          */
2958         if (bp->b_flags & B_READ)
2959                 panic("softdep_disk_io_initiation: read");
2960         /*
2961          * Do any necessary pre-I/O processing.
2962          */
2963         for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
2964                 nextwk = LIST_NEXT(wk, wk_list);
2965                 switch (wk->wk_type) {
2966
2967                 case D_PAGEDEP:
2968                         initiate_write_filepage(WK_PAGEDEP(wk), bp);
2969                         continue;
2970
2971                 case D_INODEDEP:
2972                         initiate_write_inodeblock(WK_INODEDEP(wk), bp);
2973                         continue;
2974
2975                 case D_INDIRDEP:
2976                         indirdep = WK_INDIRDEP(wk);
2977                         if (indirdep->ir_state & GOINGAWAY)
2978                                 panic("disk_io_initiation: indirdep gone");
2979                         /*
2980                          * If there are no remaining dependencies, this
2981                          * will be writing the real pointers, so the
2982                          * dependency can be freed.
2983                          */
2984                         if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
2985                                 indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
2986                                 brelse(indirdep->ir_savebp);
2987                                 /* inline expand WORKLIST_REMOVE(wk); */
2988                                 wk->wk_state &= ~ONWORKLIST;
2989                                 LIST_REMOVE(wk, wk_list);
2990                                 WORKITEM_FREE(indirdep, D_INDIRDEP);
2991                                 continue;
2992                         }
2993                         /*
2994                          * Replace up-to-date version with safe version.
2995                          */
2996                         MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
2997                             M_INDIRDEP, M_SOFTDEP_FLAGS);
2998                         ACQUIRE_LOCK(&lk);
2999                         indirdep->ir_state &= ~ATTACHED;
3000                         indirdep->ir_state |= UNDONE;
3001                         bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
3002                         bcopy(indirdep->ir_savebp->b_data, bp->b_data,
3003                             bp->b_bcount);
3004                         FREE_LOCK(&lk);
3005                         continue;
3006
3007                 case D_MKDIR:
3008                 case D_BMSAFEMAP:
3009                 case D_ALLOCDIRECT:
3010                 case D_ALLOCINDIR:
3011                         continue;
3012
3013                 default:
3014                         panic("handle_disk_io_initiation: Unexpected type %s",
3015                             TYPENAME(wk->wk_type));
3016                         /* NOTREACHED */
3017                 }
3018         }
3019 }
3020
3021 /*
3022  * Called from within the procedure above to deal with unsatisfied
3023  * allocation dependencies in a directory. The buffer must be locked,
3024  * thus, no I/O completion operations can occur while we are
3025  * manipulating its associated dependencies.
3026  */
3027 static void
3028 initiate_write_filepage(pagedep, bp)
3029         struct pagedep *pagedep;
3030         struct buf *bp;
3031 {
3032         struct diradd *dap;
3033         struct direct *ep;
3034         int i;
3035
3036         if (pagedep->pd_state & IOSTARTED) {
3037                 /*
3038                  * This can only happen if there is a driver that does not
3039                  * understand chaining. Here biodone will reissue the call
3040                  * to strategy for the incomplete buffers.
3041                  */
3042                 printf("initiate_write_filepage: already started\n");
3043                 return;
3044         }
3045         pagedep->pd_state |= IOSTARTED;
3046         ACQUIRE_LOCK(&lk);
3047         for (i = 0; i < DAHASHSZ; i++) {
3048                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
3049                         ep = (struct direct *)
3050                             ((char *)bp->b_data + dap->da_offset);
3051                         if (ep->d_ino != dap->da_newinum) {
3052                                 FREE_LOCK(&lk);
3053                                 panic("%s: dir inum %d != new %d",
3054                                     "initiate_write_filepage",
3055                                     ep->d_ino, dap->da_newinum);
3056                         }
3057                         if (dap->da_state & DIRCHG)
3058                                 ep->d_ino = dap->da_previous->dm_oldinum;
3059                         else
3060                                 ep->d_ino = 0;
3061                         dap->da_state &= ~ATTACHED;
3062                         dap->da_state |= UNDONE;
3063                 }
3064         }
3065         FREE_LOCK(&lk);
3066 }
3067
3068 /*
3069  * Called from within the procedure above to deal with unsatisfied
3070  * allocation dependencies in an inodeblock. The buffer must be
3071  * locked, thus, no I/O completion operations can occur while we
3072  * are manipulating its associated dependencies.
3073  */
3074 static void
3075 initiate_write_inodeblock(inodedep, bp)
3076         struct inodedep *inodedep;
3077         struct buf *bp;                 /* The inode block */
3078 {
3079         struct allocdirect *adp, *lastadp;
3080         struct dinode *dp;
3081         struct fs *fs;
3082         ufs_lbn_t prevlbn = 0;
3083         int i, deplist;
3084
3085         if (inodedep->id_state & IOSTARTED)
3086                 panic("initiate_write_inodeblock: already started");
3087         inodedep->id_state |= IOSTARTED;
3088         fs = inodedep->id_fs;
3089         dp = (struct dinode *)bp->b_data +
3090             ino_to_fsbo(fs, inodedep->id_ino);
3091         /*
3092          * If the bitmap is not yet written, then the allocated
3093          * inode cannot be written to disk.
3094          */
3095         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3096                 if (inodedep->id_savedino != NULL)
3097                         panic("initiate_write_inodeblock: already doing I/O");
3098                 MALLOC(inodedep->id_savedino, struct dinode *,
3099                     sizeof(struct dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
3100                 *inodedep->id_savedino = *dp;
3101                 bzero((caddr_t)dp, sizeof(struct dinode));
3102                 return;
3103         }
3104         /*
3105          * If no dependencies, then there is nothing to roll back.
3106          */
3107         inodedep->id_savedsize = dp->di_size;
3108         if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3109                 return;
3110         /*
3111          * Set the dependencies to busy.
3112          */
3113         ACQUIRE_LOCK(&lk);
3114         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3115              adp = TAILQ_NEXT(adp, ad_next)) {
3116 #ifdef DIAGNOSTIC
3117                 if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3118                         FREE_LOCK(&lk);
3119                         panic("softdep_write_inodeblock: lbn order");
3120                 }
3121                 prevlbn = adp->ad_lbn;
3122                 if (adp->ad_lbn < NDADDR &&
3123                     dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
3124                         FREE_LOCK(&lk);
3125                         panic("%s: direct pointer #%ld mismatch %d != %d",
3126                             "softdep_write_inodeblock", adp->ad_lbn,
3127                             dp->di_db[adp->ad_lbn], adp->ad_newblkno);
3128                 }
3129                 if (adp->ad_lbn >= NDADDR &&
3130                     dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
3131                         FREE_LOCK(&lk);
3132                         panic("%s: indirect pointer #%ld mismatch %d != %d",
3133                             "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
3134                             dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
3135                 }
3136                 deplist |= 1 << adp->ad_lbn;
3137                 if ((adp->ad_state & ATTACHED) == 0) {
3138                         FREE_LOCK(&lk);
3139                         panic("softdep_write_inodeblock: Unknown state 0x%x",
3140                             adp->ad_state);
3141                 }
3142 #endif /* DIAGNOSTIC */
3143                 adp->ad_state &= ~ATTACHED;
3144                 adp->ad_state |= UNDONE;
3145         }
3146         /*
3147          * The on-disk inode cannot claim to be any larger than the last
3148          * fragment that has been written. Otherwise, the on-disk inode
3149          * might have fragments that were not the last block in the file
3150          * which would corrupt the filesystem.
3151          */
3152         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3153              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3154                 if (adp->ad_lbn >= NDADDR)
3155                         break;
3156                 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3157                 /* keep going until hitting a rollback to a frag */
3158                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3159                         continue;
3160                 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3161                 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3162 #ifdef DIAGNOSTIC
3163                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
3164                                 FREE_LOCK(&lk);
3165                                 panic("softdep_write_inodeblock: lost dep1");
3166                         }
3167 #endif /* DIAGNOSTIC */
3168                         dp->di_db[i] = 0;
3169                 }
3170                 for (i = 0; i < NIADDR; i++) {
3171 #ifdef DIAGNOSTIC
3172                         if (dp->di_ib[i] != 0 &&
3173                             (deplist & ((1 << NDADDR) << i)) == 0) {
3174                                 FREE_LOCK(&lk);
3175                                 panic("softdep_write_inodeblock: lost dep2");
3176                         }
3177 #endif /* DIAGNOSTIC */
3178                         dp->di_ib[i] = 0;
3179                 }
3180                 FREE_LOCK(&lk);
3181                 return;
3182         }
3183         /*
3184          * If we have zero'ed out the last allocated block of the file,
3185          * roll back the size to the last currently allocated block.
3186          * We know that this last allocated block is a full-sized as
3187          * we already checked for fragments in the loop above.
3188          */
3189         if (lastadp != NULL &&
3190             dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3191                 for (i = lastadp->ad_lbn; i >= 0; i--)
3192                         if (dp->di_db[i] != 0)
3193                                 break;
3194                 dp->di_size = (i + 1) * fs->fs_bsize;
3195         }
3196         /*
3197          * The only dependencies are for indirect blocks.
3198          *
3199          * The file size for indirect block additions is not guaranteed.
3200          * Such a guarantee would be non-trivial to achieve. The conventional
3201          * synchronous write implementation also does not make this guarantee.
3202          * Fsck should catch and fix discrepancies. Arguably, the file size
3203          * can be over-estimated without destroying integrity when the file
3204          * moves into the indirect blocks (i.e., is large). If we want to
3205          * postpone fsck, we are stuck with this argument.
3206          */
3207         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3208                 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3209         FREE_LOCK(&lk);
3210 }
3211
3212 /*
3213  * This routine is called during the completion interrupt
3214  * service routine for a disk write (from the procedure called
3215  * by the device driver to inform the file system caches of
3216  * a request completion).  It should be called early in this
3217  * procedure, before the block is made available to other
3218  * processes or other routines are called.
3219  */
3220 static void
3221 softdep_disk_write_complete(bp)
3222         struct buf *bp;         /* describes the completed disk write */
3223 {
3224         struct worklist *wk;
3225         struct workhead reattach;
3226         struct newblk *newblk;
3227         struct allocindir *aip;
3228         struct allocdirect *adp;
3229         struct indirdep *indirdep;
3230         struct inodedep *inodedep;
3231         struct bmsafemap *bmsafemap;
3232
3233 #ifdef DEBUG
3234         if (lk.lkt_held != NOHOLDER)
3235                 panic("softdep_disk_write_complete: lock is held");
3236         lk.lkt_held = SPECIAL_FLAG;
3237 #endif
3238         LIST_INIT(&reattach);
3239         while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
3240                 WORKLIST_REMOVE(wk);
3241                 switch (wk->wk_type) {
3242
3243                 case D_PAGEDEP:
3244                         if (handle_written_filepage(WK_PAGEDEP(wk), bp))
3245                                 WORKLIST_INSERT(&reattach, wk);
3246                         continue;
3247
3248                 case D_INODEDEP:
3249                         if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
3250                                 WORKLIST_INSERT(&reattach, wk);
3251                         continue;
3252
3253                 case D_BMSAFEMAP:
3254                         bmsafemap = WK_BMSAFEMAP(wk);
3255                         while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
3256                                 newblk->nb_state |= DEPCOMPLETE;
3257                                 newblk->nb_bmsafemap = NULL;
3258                                 LIST_REMOVE(newblk, nb_deps);
3259                         }
3260                         while ((adp =
3261                            LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
3262                                 adp->ad_state |= DEPCOMPLETE;
3263                                 adp->ad_buf = NULL;
3264                                 LIST_REMOVE(adp, ad_deps);
3265                                 handle_allocdirect_partdone(adp);
3266                         }
3267                         while ((aip =
3268                             LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
3269                                 aip->ai_state |= DEPCOMPLETE;
3270                                 aip->ai_buf = NULL;
3271                                 LIST_REMOVE(aip, ai_deps);
3272                                 handle_allocindir_partdone(aip);
3273                         }
3274                         while ((inodedep =
3275                              LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
3276                                 inodedep->id_state |= DEPCOMPLETE;
3277                                 LIST_REMOVE(inodedep, id_deps);
3278                                 inodedep->id_buf = NULL;
3279                         }
3280                         WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
3281                         continue;
3282
3283                 case D_MKDIR:
3284                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
3285                         continue;
3286
3287                 case D_ALLOCDIRECT:
3288                         adp = WK_ALLOCDIRECT(wk);
3289                         adp->ad_state |= COMPLETE;
3290                         handle_allocdirect_partdone(adp);
3291                         continue;
3292
3293                 case D_ALLOCINDIR:
3294                         aip = WK_ALLOCINDIR(wk);
3295                         aip->ai_state |= COMPLETE;
3296                         handle_allocindir_partdone(aip);
3297                         continue;
3298
3299                 case D_INDIRDEP:
3300                         indirdep = WK_INDIRDEP(wk);
3301                         if (indirdep->ir_state & GOINGAWAY) {
3302                                 lk.lkt_held = NOHOLDER;
3303                                 panic("disk_write_complete: indirdep gone");
3304                         }
3305                         bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
3306                         FREE(indirdep->ir_saveddata, M_INDIRDEP);
3307                         indirdep->ir_saveddata = 0;
3308                         indirdep->ir_state &= ~UNDONE;
3309                         indirdep->ir_state |= ATTACHED;
3310                         while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
3311                                 handle_allocindir_partdone(aip);
3312                                 if (aip == LIST_FIRST(&indirdep->ir_donehd)) {
3313                                         lk.lkt_held = NOHOLDER;
3314                                         panic("disk_write_complete: not gone");
3315                                 }
3316                         }
3317                         WORKLIST_INSERT(&reattach, wk);
3318                         if ((bp->b_flags & B_DELWRI) == 0)
3319                                 stat_indir_blk_ptrs++;
3320                         bdirty(bp);
3321                         continue;
3322
3323                 default:
3324                         lk.lkt_held = NOHOLDER;
3325                         panic("handle_disk_write_complete: Unknown type %s",
3326                             TYPENAME(wk->wk_type));
3327                         /* NOTREACHED */
3328                 }
3329         }
3330         /*
3331          * Reattach any requests that must be redone.
3332          */
3333         while ((wk = LIST_FIRST(&reattach)) != NULL) {
3334                 WORKLIST_REMOVE(wk);
3335                 WORKLIST_INSERT(&bp->b_dep, wk);
3336         }
3337 #ifdef DEBUG
3338         if (lk.lkt_held != SPECIAL_FLAG)
3339                 panic("softdep_disk_write_complete: lock lost");
3340         lk.lkt_held = NOHOLDER;
3341 #endif
3342 }
3343
3344 /*
3345  * Called from within softdep_disk_write_complete above. Note that
3346  * this routine is always called from interrupt level with further
3347  * splbio interrupts blocked.
3348  */
3349 static void
3350 handle_allocdirect_partdone(adp)
3351         struct allocdirect *adp;        /* the completed allocdirect */
3352 {
3353         struct allocdirect *listadp;
3354         struct inodedep *inodedep;
3355         long bsize;
3356
3357         if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3358                 return;
3359         if (adp->ad_buf != NULL) {
3360                 lk.lkt_held = NOHOLDER;
3361                 panic("handle_allocdirect_partdone: dangling dep");
3362         }
3363         /*
3364          * The on-disk inode cannot claim to be any larger than the last
3365          * fragment that has been written. Otherwise, the on-disk inode
3366          * might have fragments that were not the last block in the file
3367          * which would corrupt the filesystem. Thus, we cannot free any
3368          * allocdirects after one whose ad_oldblkno claims a fragment as
3369          * these blocks must be rolled back to zero before writing the inode.
3370          * We check the currently active set of allocdirects in id_inoupdt.
3371          */
3372         inodedep = adp->ad_inodedep;
3373         bsize = inodedep->id_fs->fs_bsize;
3374         TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) {
3375                 /* found our block */
3376                 if (listadp == adp)
3377                         break;
3378                 /* continue if ad_oldlbn is not a fragment */
3379                 if (listadp->ad_oldsize == 0 ||
3380                     listadp->ad_oldsize == bsize)
3381                         continue;
3382                 /* hit a fragment */
3383                 return;
3384         }
3385         /*
3386          * If we have reached the end of the current list without
3387          * finding the just finished dependency, then it must be
3388          * on the future dependency list. Future dependencies cannot
3389          * be freed until they are moved to the current list.
3390          */
3391         if (listadp == NULL) {
3392 #ifdef DEBUG
3393                 TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next)
3394                         /* found our block */
3395                         if (listadp == adp)
3396                                 break;
3397                 if (listadp == NULL) {
3398                         lk.lkt_held = NOHOLDER;
3399                         panic("handle_allocdirect_partdone: lost dep");
3400                 }
3401 #endif /* DEBUG */
3402                 return;
3403         }
3404         /*
3405          * If we have found the just finished dependency, then free
3406          * it along with anything that follows it that is complete.
3407          */
3408         for (; adp; adp = listadp) {
3409                 listadp = TAILQ_NEXT(adp, ad_next);
3410                 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3411                         return;
3412                 free_allocdirect(&inodedep->id_inoupdt, adp, 1);
3413         }
3414 }
3415
3416 /*
3417  * Called from within softdep_disk_write_complete above. Note that
3418  * this routine is always called from interrupt level with further
3419  * splbio interrupts blocked.
3420  */
3421 static void
3422 handle_allocindir_partdone(aip)
3423         struct allocindir *aip;         /* the completed allocindir */
3424 {
3425         struct indirdep *indirdep;
3426
3427         if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
3428                 return;
3429         if (aip->ai_buf != NULL) {
3430                 lk.lkt_held = NOHOLDER;
3431                 panic("handle_allocindir_partdone: dangling dependency");
3432         }
3433         indirdep = aip->ai_indirdep;
3434         if (indirdep->ir_state & UNDONE) {
3435                 LIST_REMOVE(aip, ai_next);
3436                 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
3437                 return;
3438         }
3439         ((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
3440             aip->ai_newblkno;
3441         LIST_REMOVE(aip, ai_next);
3442         if (aip->ai_freefrag != NULL)
3443                 add_to_worklist(&aip->ai_freefrag->ff_list);
3444         WORKITEM_FREE(aip, D_ALLOCINDIR);
3445 }
3446
3447 /*
3448  * Called from within softdep_disk_write_complete above to restore
3449  * in-memory inode block contents to their most up-to-date state. Note
3450  * that this routine is always called from interrupt level with further
3451  * splbio interrupts blocked.
3452  */
3453 static int
3454 handle_written_inodeblock(inodedep, bp)
3455         struct inodedep *inodedep;
3456         struct buf *bp;         /* buffer containing the inode block */
3457 {
3458         struct worklist *wk, *filefree;
3459         struct allocdirect *adp, *nextadp;
3460         struct dinode *dp;
3461         int hadchanges;
3462
3463         if ((inodedep->id_state & IOSTARTED) == 0) {
3464                 lk.lkt_held = NOHOLDER;
3465                 panic("handle_written_inodeblock: not started");
3466         }
3467         inodedep->id_state &= ~IOSTARTED;
3468         inodedep->id_state |= COMPLETE;
3469         dp = (struct dinode *)bp->b_data +
3470             ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
3471         /*
3472          * If we had to rollback the inode allocation because of
3473          * bitmaps being incomplete, then simply restore it.
3474          * Keep the block dirty so that it will not be reclaimed until
3475          * all associated dependencies have been cleared and the
3476          * corresponding updates written to disk.
3477          */
3478         if (inodedep->id_savedino != NULL) {
3479                 *dp = *inodedep->id_savedino;
3480                 FREE(inodedep->id_savedino, M_INODEDEP);
3481                 inodedep->id_savedino = NULL;
3482                 if ((bp->b_flags & B_DELWRI) == 0)
3483                         stat_inode_bitmap++;
3484                 bdirty(bp);
3485                 return (1);
3486         }
3487         /*
3488          * Roll forward anything that had to be rolled back before
3489          * the inode could be updated.
3490          */
3491         hadchanges = 0;
3492         for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
3493                 nextadp = TAILQ_NEXT(adp, ad_next);
3494                 if (adp->ad_state & ATTACHED) {
3495                         lk.lkt_held = NOHOLDER;
3496                         panic("handle_written_inodeblock: new entry");
3497                 }
3498                 if (adp->ad_lbn < NDADDR) {
3499                         if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno) {
3500                                 lk.lkt_held = NOHOLDER;
3501                                 panic("%s: %s #%ld mismatch %d != %d",
3502                                     "handle_written_inodeblock",
3503                                     "direct pointer", adp->ad_lbn,
3504                                     dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
3505                         }
3506                         dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
3507                 } else {
3508                         if (dp->di_ib[adp->ad_lbn - NDADDR] != 0) {
3509                                 lk.lkt_held = NOHOLDER;
3510                                 panic("%s: %s #%ld allocated as %d",
3511                                     "handle_written_inodeblock",
3512                                     "indirect pointer", adp->ad_lbn - NDADDR,
3513                                     dp->di_ib[adp->ad_lbn - NDADDR]);
3514                         }
3515                         dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
3516                 }
3517                 adp->ad_state &= ~UNDONE;
3518                 adp->ad_state |= ATTACHED;
3519                 hadchanges = 1;
3520         }
3521         if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
3522                 stat_direct_blk_ptrs++;
3523         /*
3524          * Reset the file size to its most up-to-date value.
3525          */
3526         if (inodedep->id_savedsize == -1) {
3527                 lk.lkt_held = NOHOLDER;
3528                 panic("handle_written_inodeblock: bad size");
3529         }
3530         if (dp->di_size != inodedep->id_savedsize) {
3531                 dp->di_size = inodedep->id_savedsize;
3532                 hadchanges = 1;
3533         }
3534         inodedep->id_savedsize = -1;
3535         /*
3536          * If there were any rollbacks in the inode block, then it must be
3537          * marked dirty so that its will eventually get written back in
3538          * its correct form.
3539          */
3540         if (hadchanges)
3541                 bdirty(bp);
3542         /*
3543          * Process any allocdirects that completed during the update.
3544          */
3545         if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
3546                 handle_allocdirect_partdone(adp);
3547         /*
3548          * Process deallocations that were held pending until the
3549          * inode had been written to disk. Freeing of the inode
3550          * is delayed until after all blocks have been freed to
3551          * avoid creation of new <vfsid, inum, lbn> triples
3552          * before the old ones have been deleted.
3553          */
3554         filefree = NULL;
3555         while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
3556                 WORKLIST_REMOVE(wk);
3557                 switch (wk->wk_type) {
3558
3559                 case D_FREEFILE:
3560                         /*
3561                          * We defer adding filefree to the worklist until
3562                          * all other additions have been made to ensure
3563                          * that it will be done after all the old blocks
3564                          * have been freed.
3565                          */
3566                         if (filefree != NULL) {
3567                                 lk.lkt_held = NOHOLDER;
3568                                 panic("handle_written_inodeblock: filefree");
3569                         }
3570                         filefree = wk;
3571                         continue;
3572
3573                 case D_MKDIR:
3574                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
3575                         continue;
3576
3577                 case D_DIRADD:
3578                         diradd_inode_written(WK_DIRADD(wk), inodedep);
3579                         continue;
3580
3581                 case D_FREEBLKS:
3582                 case D_FREEFRAG:
3583                 case D_DIRREM:
3584                         add_to_worklist(wk);
3585                         continue;
3586
3587                 default:
3588                         lk.lkt_held = NOHOLDER;
3589                         panic("handle_written_inodeblock: Unknown type %s",
3590                             TYPENAME(wk->wk_type));
3591                         /* NOTREACHED */
3592                 }
3593         }
3594         if (filefree != NULL) {
3595                 if (free_inodedep(inodedep) == 0) {
3596                         lk.lkt_held = NOHOLDER;
3597                         panic("handle_written_inodeblock: live inodedep");
3598                 }
3599                 add_to_worklist(filefree);
3600                 return (0);
3601         }
3602
3603         /*
3604          * If no outstanding dependencies, free it.
3605          */
3606         if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
3607                 return (0);
3608         return (hadchanges);
3609 }
3610
3611 /*
3612  * Process a diradd entry after its dependent inode has been written.
3613  * This routine must be called with splbio interrupts blocked.
3614  */
3615 static void
3616 diradd_inode_written(dap, inodedep)
3617         struct diradd *dap;
3618         struct inodedep *inodedep;
3619 {
3620         struct pagedep *pagedep;
3621
3622         dap->da_state |= COMPLETE;
3623         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3624                 if (dap->da_state & DIRCHG)
3625                         pagedep = dap->da_previous->dm_pagedep;
3626                 else
3627                         pagedep = dap->da_pagedep;
3628                 LIST_REMOVE(dap, da_pdlist);
3629                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3630         }
3631         WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3632 }
3633
3634 /*
3635  * Handle the completion of a mkdir dependency.
3636  */
3637 static void
3638 handle_written_mkdir(mkdir, type)
3639         struct mkdir *mkdir;
3640         int type;
3641 {
3642         struct diradd *dap;
3643         struct pagedep *pagedep;
3644
3645         if (mkdir->md_state != type) {
3646                 lk.lkt_held = NOHOLDER;
3647                 panic("handle_written_mkdir: bad type");
3648         }
3649         dap = mkdir->md_diradd;
3650         dap->da_state &= ~type;
3651         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
3652                 dap->da_state |= DEPCOMPLETE;
3653         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3654                 if (dap->da_state & DIRCHG)
3655                         pagedep = dap->da_previous->dm_pagedep;
3656                 else
3657                         pagedep = dap->da_pagedep;
3658                 LIST_REMOVE(dap, da_pdlist);
3659                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3660         }
3661         LIST_REMOVE(mkdir, md_mkdirs);
3662         WORKITEM_FREE(mkdir, D_MKDIR);
3663 }
3664
3665 /*
3666  * Called from within softdep_disk_write_complete above.
3667  * A write operation was just completed. Removed inodes can
3668  * now be freed and associated block pointers may be committed.
3669  * Note that this routine is always called from interrupt level
3670  * with further splbio interrupts blocked.
3671  */
3672 static int
3673 handle_written_filepage(pagedep, bp)
3674         struct pagedep *pagedep;
3675         struct buf *bp;         /* buffer containing the written page */
3676 {
3677         struct dirrem *dirrem;
3678         struct diradd *dap, *nextdap;
3679         struct direct *ep;
3680         int i, chgs;
3681
3682         if ((pagedep->pd_state & IOSTARTED) == 0) {
3683                 lk.lkt_held = NOHOLDER;
3684                 panic("handle_written_filepage: not started");
3685         }
3686         pagedep->pd_state &= ~IOSTARTED;
3687         /*
3688          * Process any directory removals that have been committed.
3689          */
3690         while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
3691                 LIST_REMOVE(dirrem, dm_next);
3692                 dirrem->dm_dirinum = pagedep->pd_ino;
3693                 add_to_worklist(&dirrem->dm_list);
3694         }
3695         /*
3696          * Free any directory additions that have been committed.
3697          */
3698         while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
3699                 free_diradd(dap);
3700         /*
3701          * Uncommitted directory entries must be restored.
3702          */
3703         for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
3704                 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
3705                      dap = nextdap) {
3706                         nextdap = LIST_NEXT(dap, da_pdlist);
3707                         if (dap->da_state & ATTACHED) {
3708                                 lk.lkt_held = NOHOLDER;
3709                                 panic("handle_written_filepage: attached");
3710                         }
3711                         ep = (struct direct *)
3712                             ((char *)bp->b_data + dap->da_offset);
3713                         ep->d_ino = dap->da_newinum;
3714                         dap->da_state &= ~UNDONE;
3715                         dap->da_state |= ATTACHED;
3716                         chgs = 1;
3717                         /*
3718                          * If the inode referenced by the directory has
3719                          * been written out, then the dependency can be
3720                          * moved to the pending list.
3721                          */
3722                         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3723                                 LIST_REMOVE(dap, da_pdlist);
3724                                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
3725                                     da_pdlist);
3726                         }
3727                 }
3728         }
3729         /*
3730          * If there were any rollbacks in the directory, then it must be
3731          * marked dirty so that its will eventually get written back in
3732          * its correct form.
3733          */
3734         if (chgs) {
3735                 if ((bp->b_flags & B_DELWRI) == 0)
3736                         stat_dir_entry++;
3737                 bdirty(bp);
3738         }
3739         /*
3740          * If no dependencies remain, the pagedep will be freed.
3741          * Otherwise it will remain to update the page before it
3742          * is written back to disk.
3743          */
3744         if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
3745                 for (i = 0; i < DAHASHSZ; i++)
3746                         if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
3747                                 break;
3748                 if (i == DAHASHSZ) {
3749                         LIST_REMOVE(pagedep, pd_hash);
3750                         WORKITEM_FREE(pagedep, D_PAGEDEP);
3751                         return (0);
3752                 }
3753         }
3754         return (1);
3755 }
3756
3757 /*
3758  * Writing back in-core inode structures.
3759  *
3760  * The file system only accesses an inode's contents when it occupies an
3761  * "in-core" inode structure.  These "in-core" structures are separate from
3762  * the page frames used to cache inode blocks.  Only the latter are
3763  * transferred to/from the disk.  So, when the updated contents of the
3764  * "in-core" inode structure are copied to the corresponding in-memory inode
3765  * block, the dependencies are also transferred.  The following procedure is
3766  * called when copying a dirty "in-core" inode to a cached inode block.
3767  */
3768
3769 /*
3770  * Called when an inode is loaded from disk. If the effective link count
3771  * differed from the actual link count when it was last flushed, then we
3772  * need to ensure that the correct effective link count is put back.
3773  */
3774 void
3775 softdep_load_inodeblock(ip)
3776         struct inode *ip;       /* the "in_core" copy of the inode */
3777 {
3778         struct inodedep *inodedep;
3779
3780         /*
3781          * Check for alternate nlink count.
3782          */
3783         ip->i_effnlink = ip->i_nlink;
3784         ACQUIRE_LOCK(&lk);
3785         if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3786                 FREE_LOCK(&lk);
3787                 return;
3788         }
3789         ip->i_effnlink -= inodedep->id_nlinkdelta;
3790         FREE_LOCK(&lk);
3791 }
3792
3793 /*
3794  * This routine is called just before the "in-core" inode
3795  * information is to be copied to the in-memory inode block.
3796  * Recall that an inode block contains several inodes. If
3797  * the force flag is set, then the dependencies will be
3798  * cleared so that the update can always be made. Note that
3799  * the buffer is locked when this routine is called, so we
3800  * will never be in the middle of writing the inode block
3801  * to disk.
3802  */
3803 void
3804 softdep_update_inodeblock(ip, bp, waitfor)
3805         struct inode *ip;       /* the "in_core" copy of the inode */
3806         struct buf *bp;         /* the buffer containing the inode block */
3807         int waitfor;            /* nonzero => update must be allowed */
3808 {
3809         struct inodedep *inodedep;
3810         struct worklist *wk;
3811         int error, gotit;
3812
3813         /*
3814          * If the effective link count is not equal to the actual link
3815          * count, then we must track the difference in an inodedep while
3816          * the inode is (potentially) tossed out of the cache. Otherwise,
3817          * if there is no existing inodedep, then there are no dependencies
3818          * to track.
3819          */
3820         ACQUIRE_LOCK(&lk);
3821         if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3822                 FREE_LOCK(&lk);
3823                 if (ip->i_effnlink != ip->i_nlink)
3824                         panic("softdep_update_inodeblock: bad link count");
3825                 return;
3826         }
3827         if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) {
3828                 FREE_LOCK(&lk);
3829                 panic("softdep_update_inodeblock: bad delta");
3830         }
3831         /*
3832          * Changes have been initiated. Anything depending on these
3833          * changes cannot occur until this inode has been written.
3834          */
3835         inodedep->id_state &= ~COMPLETE;
3836         if ((inodedep->id_state & ONWORKLIST) == 0)
3837                 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
3838         /*
3839          * Any new dependencies associated with the incore inode must
3840          * now be moved to the list associated with the buffer holding
3841          * the in-memory copy of the inode. Once merged process any
3842          * allocdirects that are completed by the merger.
3843          */
3844         merge_inode_lists(inodedep);
3845         if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
3846                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
3847         /*
3848          * Now that the inode has been pushed into the buffer, the
3849          * operations dependent on the inode being written to disk
3850          * can be moved to the id_bufwait so that they will be
3851          * processed when the buffer I/O completes.
3852          */
3853         while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
3854                 WORKLIST_REMOVE(wk);
3855                 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
3856         }
3857         /*
3858          * Newly allocated inodes cannot be written until the bitmap
3859          * that allocates them have been written (indicated by
3860          * DEPCOMPLETE being set in id_state). If we are doing a
3861          * forced sync (e.g., an fsync on a file), we force the bitmap
3862          * to be written so that the update can be done.
3863          */
3864         if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
3865                 FREE_LOCK(&lk);
3866                 return;
3867         }
3868         gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
3869         FREE_LOCK(&lk);
3870         if (gotit &&
3871             (error = VOP_BWRITE(inodedep->id_buf->b_vp, inodedep->id_buf)) != 0)
3872                 softdep_error("softdep_update_inodeblock: bwrite", error);
3873         if ((inodedep->id_state & DEPCOMPLETE) == 0)
3874                 panic("softdep_update_inodeblock: update failed");
3875 }
3876
3877 /*
3878  * Merge the new inode dependency list (id_newinoupdt) into the old
3879  * inode dependency list (id_inoupdt). This routine must be called
3880  * with splbio interrupts blocked.
3881  */
3882 static void
3883 merge_inode_lists(inodedep)
3884         struct inodedep *inodedep;
3885 {
3886         struct allocdirect *listadp, *newadp;
3887
3888         newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3889         for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
3890                 if (listadp->ad_lbn < newadp->ad_lbn) {
3891                         listadp = TAILQ_NEXT(listadp, ad_next);
3892                         continue;
3893                 }
3894                 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3895                 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
3896                 if (listadp->ad_lbn == newadp->ad_lbn) {
3897                         allocdirect_merge(&inodedep->id_inoupdt, newadp,
3898                             listadp);
3899                         listadp = newadp;
3900                 }
3901                 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3902         }
3903         while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
3904                 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3905                 TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
3906         }
3907 }
3908
3909 /*
3910  * If we are doing an fsync, then we must ensure that any directory
3911  * entries for the inode have been written after the inode gets to disk.
3912  */
3913 static int
3914 softdep_fsync(vp)
3915         struct vnode *vp;       /* the "in_core" copy of the inode */
3916 {
3917         struct inodedep *inodedep;
3918         struct pagedep *pagedep;
3919         struct worklist *wk;
3920         struct diradd *dap;
3921         struct mount *mnt;
3922         struct vnode *pvp;
3923         struct inode *ip;
3924         struct buf *bp;
3925         struct fs *fs;
3926         struct thread *td = curthread;          /* XXX */
3927         struct proc *p = td->td_proc;
3928         int error, flushparent;
3929         ino_t parentino;
3930         ufs_lbn_t lbn;
3931
3932         KKASSERT(p);
3933
3934         ip = VTOI(vp);
3935         fs = ip->i_fs;
3936         ACQUIRE_LOCK(&lk);
3937         if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
3938                 FREE_LOCK(&lk);
3939                 return (0);
3940         }
3941         if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
3942             LIST_FIRST(&inodedep->id_bufwait) != NULL ||
3943             TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
3944             TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
3945                 FREE_LOCK(&lk);
3946                 panic("softdep_fsync: pending ops");
3947         }
3948         for (error = 0, flushparent = 0; ; ) {
3949                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
3950                         break;
3951                 if (wk->wk_type != D_DIRADD) {
3952                         FREE_LOCK(&lk);
3953                         panic("softdep_fsync: Unexpected type %s",
3954                             TYPENAME(wk->wk_type));
3955                 }
3956                 dap = WK_DIRADD(wk);
3957                 /*
3958                  * Flush our parent if this directory entry
3959                  * has a MKDIR_PARENT dependency.
3960                  */
3961                 if (dap->da_state & DIRCHG)
3962                         pagedep = dap->da_previous->dm_pagedep;
3963                 else
3964                         pagedep = dap->da_pagedep;
3965                 mnt = pagedep->pd_mnt;
3966                 parentino = pagedep->pd_ino;
3967                 lbn = pagedep->pd_lbn;
3968                 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) {
3969                         FREE_LOCK(&lk);
3970                         panic("softdep_fsync: dirty");
3971                 }
3972                 flushparent = dap->da_state & MKDIR_PARENT;
3973                 /*
3974                  * If we are being fsync'ed as part of vgone'ing this vnode,
3975                  * then we will not be able to release and recover the
3976                  * vnode below, so we just have to give up on writing its
3977                  * directory entry out. It will eventually be written, just
3978                  * not now, but then the user was not asking to have it
3979                  * written, so we are not breaking any promises.
3980                  */
3981                 if (vp->v_flag & VXLOCK)
3982                         break;
3983                 /*
3984                  * We prevent deadlock by always fetching inodes from the
3985                  * root, moving down the directory tree. Thus, when fetching
3986                  * our parent directory, we must unlock ourselves before
3987                  * requesting the lock on our parent. See the comment in
3988                  * ufs_lookup for details on possible races.
3989                  */
3990                 FREE_LOCK(&lk);
3991                 VOP_UNLOCK(vp, 0, td);
3992                 error = VFS_VGET(mnt, parentino, &pvp);
3993                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
3994                 if (error != 0)
3995                         return (error);
3996                 if (flushparent) {
3997                         if ((error = UFS_UPDATE(pvp, 1)) != 0) {
3998                                 vput(pvp);
3999                                 return (error);
4000                         }
4001                 }
4002                 /*
4003                  * Flush directory page containing the inode's name.
4004                  */
4005                 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
4006                     &bp);
4007                 if (error == 0)
4008                         error = VOP_BWRITE(bp->b_vp, bp);
4009                 vput(pvp);
4010                 if (error != 0)
4011                         return (error);
4012                 ACQUIRE_LOCK(&lk);
4013                 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
4014                         break;
4015         }
4016         FREE_LOCK(&lk);
4017         return (0);
4018 }
4019
4020 /*
4021  * Flush all the dirty bitmaps associated with the block device
4022  * before flushing the rest of the dirty blocks so as to reduce
4023  * the number of dependencies that will have to be rolled back.
4024  */
4025 void
4026 softdep_fsync_mountdev(vp)
4027         struct vnode *vp;
4028 {
4029         struct buf *bp, *nbp;
4030         struct worklist *wk;
4031
4032         if (!vn_isdisk(vp, NULL))
4033                 panic("softdep_fsync_mountdev: vnode not a disk");
4034         ACQUIRE_LOCK(&lk);
4035         for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
4036                 nbp = TAILQ_NEXT(bp, b_vnbufs);
4037                 /*
4038                  * If it is already scheduled, skip to the next buffer.
4039                  */
4040                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
4041                         continue;
4042                 if ((bp->b_flags & B_DELWRI) == 0) {
4043                         FREE_LOCK(&lk);
4044                         panic("softdep_fsync_mountdev: not dirty");
4045                 }
4046                 /*
4047                  * We are only interested in bitmaps with outstanding
4048                  * dependencies.
4049                  */
4050                 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
4051                     wk->wk_type != D_BMSAFEMAP ||
4052                     (bp->b_xflags & BX_BKGRDINPROG)) {
4053                         BUF_UNLOCK(bp);
4054                         continue;
4055                 }
4056                 bremfree(bp);
4057                 FREE_LOCK(&lk);
4058                 (void) bawrite(bp);
4059                 ACQUIRE_LOCK(&lk);
4060                 /*
4061                  * Since we may have slept during the I/O, we need
4062                  * to start from a known point.
4063                  */
4064                 nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
4065         }
4066         drain_output(vp, 1);
4067         FREE_LOCK(&lk);
4068 }
4069
4070 /*
4071  * This routine is called when we are trying to synchronously flush a
4072  * file. This routine must eliminate any filesystem metadata dependencies
4073  * so that the syncing routine can succeed by pushing the dirty blocks
4074  * associated with the file. If any I/O errors occur, they are returned.
4075  */
4076 int
4077 softdep_sync_metadata(ap)
4078         struct vop_fsync_args /* {
4079                 struct vnode *a_vp;
4080                 struct ucred *a_cred;
4081                 int a_waitfor;
4082                 struct proc *a_p;
4083         } */ *ap;
4084 {
4085         struct vnode *vp = ap->a_vp;
4086         struct pagedep *pagedep;
4087         struct allocdirect *adp;
4088         struct allocindir *aip;
4089         struct buf *bp, *nbp;
4090         struct worklist *wk;
4091         int i, error, waitfor;
4092
4093         /*
4094          * Check whether this vnode is involved in a filesystem
4095          * that is doing soft dependency processing.
4096          */
4097         if (!vn_isdisk(vp, NULL)) {
4098                 if (!DOINGSOFTDEP(vp))
4099                         return (0);
4100         } else
4101                 if (vp->v_specmountpoint == NULL ||
4102                     (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0)
4103                         return (0);
4104         /*
4105          * Ensure that any direct block dependencies have been cleared.
4106          */
4107         ACQUIRE_LOCK(&lk);
4108         if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
4109                 FREE_LOCK(&lk);
4110                 return (error);
4111         }
4112         /*
4113          * For most files, the only metadata dependencies are the
4114          * cylinder group maps that allocate their inode or blocks.
4115          * The block allocation dependencies can be found by traversing
4116          * the dependency lists for any buffers that remain on their
4117          * dirty buffer list. The inode allocation dependency will
4118          * be resolved when the inode is updated with MNT_WAIT.
4119          * This work is done in two passes. The first pass grabs most
4120          * of the buffers and begins asynchronously writing them. The
4121          * only way to wait for these asynchronous writes is to sleep
4122          * on the filesystem vnode which may stay busy for a long time
4123          * if the filesystem is active. So, instead, we make a second
4124          * pass over the dependencies blocking on each write. In the
4125          * usual case we will be blocking against a write that we
4126          * initiated, so when it is done the dependency will have been
4127          * resolved. Thus the second pass is expected to end quickly.
4128          */
4129         waitfor = MNT_NOWAIT;
4130 top:
4131         /*
4132          * We must wait for any I/O in progress to finish so that
4133          * all potential buffers on the dirty list will be visible.
4134          */
4135         drain_output(vp, 1);
4136         if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
4137                 FREE_LOCK(&lk);
4138                 return (0);
4139         }
4140         bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
4141 loop:
4142         /*
4143          * As we hold the buffer locked, none of its dependencies
4144          * will disappear.
4145          */
4146         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4147                 switch (wk->wk_type) {
4148
4149                 case D_ALLOCDIRECT:
4150                         adp = WK_ALLOCDIRECT(wk);
4151                         if (adp->ad_state & DEPCOMPLETE)
4152                                 break;
4153                         nbp = adp->ad_buf;
4154                         if (getdirtybuf(&nbp, waitfor) == 0)
4155                                 break;
4156                         FREE_LOCK(&lk);
4157                         if (waitfor == MNT_NOWAIT) {
4158                                 bawrite(nbp);
4159                         } else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
4160                                 bawrite(bp);
4161                                 return (error);
4162                         }
4163                         ACQUIRE_LOCK(&lk);
4164                         break;
4165
4166                 case D_ALLOCINDIR:
4167                         aip = WK_ALLOCINDIR(wk);
4168                         if (aip->ai_state & DEPCOMPLETE)
4169                                 break;
4170                         nbp = aip->ai_buf;
4171                         if (getdirtybuf(&nbp, waitfor) == 0)
4172                                 break;
4173                         FREE_LOCK(&lk);
4174                         if (waitfor == MNT_NOWAIT) {
4175                                 bawrite(nbp);
4176                         } else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
4177                                 bawrite(bp);
4178                                 return (error);
4179                         }
4180                         ACQUIRE_LOCK(&lk);
4181                         break;
4182
4183                 case D_INDIRDEP:
4184                 restart:
4185
4186                         LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
4187                                 if (aip->ai_state & DEPCOMPLETE)
4188                                         continue;
4189                                 nbp = aip->ai_buf;
4190                                 if (getdirtybuf(&nbp, MNT_WAIT) == 0)
4191                                         goto restart;
4192                                 FREE_LOCK(&lk);
4193                                 if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
4194                                         bawrite(bp);
4195                                         return (error);
4196                                 }
4197                                 ACQUIRE_LOCK(&lk);
4198                                 goto restart;
4199                         }
4200                         break;
4201
4202                 case D_INODEDEP:
4203                         if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
4204                             WK_INODEDEP(wk)->id_ino)) != 0) {
4205                                 FREE_LOCK(&lk);
4206                                 bawrite(bp);
4207                                 return (error);
4208                         }
4209                         break;
4210
4211                 case D_PAGEDEP:
4212                         /*
4213                          * We are trying to sync a directory that may
4214                          * have dependencies on both its own metadata
4215                          * and/or dependencies on the inodes of any
4216                          * recently allocated files. We walk its diradd
4217                          * lists pushing out the associated inode.
4218                          */
4219                         pagedep = WK_PAGEDEP(wk);
4220                         for (i = 0; i < DAHASHSZ; i++) {
4221                                 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
4222                                         continue;
4223                                 if ((error =
4224                                     flush_pagedep_deps(vp, pagedep->pd_mnt,
4225                                                 &pagedep->pd_diraddhd[i]))) {
4226                                         FREE_LOCK(&lk);
4227                                         bawrite(bp);
4228                                         return (error);
4229                                 }
4230                         }
4231                         break;
4232
4233                 case D_MKDIR:
4234                         /*
4235                          * This case should never happen if the vnode has
4236                          * been properly sync'ed. However, if this function
4237                          * is used at a place where the vnode has not yet
4238                          * been sync'ed, this dependency can show up. So,
4239                          * rather than panic, just flush it.
4240                          */
4241                         nbp = WK_MKDIR(wk)->md_buf;
4242                         if (getdirtybuf(&nbp, waitfor) == 0)
4243                                 break;
4244                         FREE_LOCK(&lk);
4245                         if (waitfor == MNT_NOWAIT) {
4246                                 bawrite(nbp);
4247                         } else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
4248                                 bawrite(bp);
4249                                 return (error);
4250                         }
4251                         ACQUIRE_LOCK(&lk);
4252                         break;
4253
4254                 case D_BMSAFEMAP:
4255                         /*
4256                          * This case should never happen if the vnode has
4257                          * been properly sync'ed. However, if this function
4258                          * is used at a place where the vnode has not yet
4259                          * been sync'ed, this dependency can show up. So,
4260                          * rather than panic, just flush it.
4261                          */
4262                         nbp = WK_BMSAFEMAP(wk)->sm_buf;
4263                         if (getdirtybuf(&nbp, waitfor) == 0)
4264                                 break;
4265                         FREE_LOCK(&lk);
4266                         if (waitfor == MNT_NOWAIT) {
4267                                 bawrite(nbp);
4268                         } else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
4269                                 bawrite(bp);
4270                                 return (error);
4271                         }
4272                         ACQUIRE_LOCK(&lk);
4273                         break;
4274
4275                 default:
4276                         FREE_LOCK(&lk);
4277                         panic("softdep_sync_metadata: Unknown type %s",
4278                             TYPENAME(wk->wk_type));
4279                         /* NOTREACHED */
4280                 }
4281         }
4282         (void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT);
4283         nbp = TAILQ_NEXT(bp, b_vnbufs);
4284         FREE_LOCK(&lk);
4285         bawrite(bp);
4286         ACQUIRE_LOCK(&lk);
4287         if (nbp != NULL) {
4288                 bp = nbp;
4289                 goto loop;
4290         }
4291         /*
4292          * The brief unlock is to allow any pent up dependency
4293          * processing to be done.  Then proceed with the second pass.
4294          */
4295         if (waitfor == MNT_NOWAIT) {
4296                 waitfor = MNT_WAIT;
4297                 FREE_LOCK(&lk);
4298                 ACQUIRE_LOCK(&lk);
4299                 goto top;
4300         }
4301
4302         /*
4303          * If we have managed to get rid of all the dirty buffers,
4304          * then we are done. For certain directories and block
4305          * devices, we may need to do further work.
4306          */
4307         if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) {
4308                 FREE_LOCK(&lk);
4309                 return (0);
4310         }
4311
4312         FREE_LOCK(&lk);
4313         /*
4314          * If we are trying to sync a block device, some of its buffers may
4315          * contain metadata that cannot be written until the contents of some
4316          * partially written files have been written to disk. The only easy
4317          * way to accomplish this is to sync the entire filesystem (luckily
4318          * this happens rarely).
4319          *
4320          * We must wait for any I/O in progress to finish so that
4321          * all potential buffers on the dirty list will be visible.
4322          */
4323         drain_output(vp, 1);
4324         if (vn_isdisk(vp, NULL) &&
4325             vp->v_specmountpoint && !VOP_ISLOCKED(vp, NULL) &&
4326             (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred,
4327              ap->a_td)) != 0)
4328                 return (error);
4329         return (0);
4330 }
4331
4332 /*
4333  * Flush the dependencies associated with an inodedep.
4334  * Called with splbio blocked.
4335  */
4336 static int
4337 flush_inodedep_deps(fs, ino)
4338         struct fs *fs;
4339         ino_t ino;
4340 {
4341         struct inodedep *inodedep;
4342         struct allocdirect *adp;
4343         int error, waitfor;
4344         struct buf *bp;
4345
4346         /*
4347          * This work is done in two passes. The first pass grabs most
4348          * of the buffers and begins asynchronously writing them. The
4349          * only way to wait for these asynchronous writes is to sleep
4350          * on the filesystem vnode which may stay busy for a long time
4351          * if the filesystem is active. So, instead, we make a second
4352          * pass over the dependencies blocking on each write. In the
4353          * usual case we will be blocking against a write that we
4354          * initiated, so when it is done the dependency will have been
4355          * resolved. Thus the second pass is expected to end quickly.
4356          * We give a brief window at the top of the loop to allow
4357          * any pending I/O to complete.
4358          */
4359         for (waitfor = MNT_NOWAIT; ; ) {
4360                 FREE_LOCK(&lk);
4361                 ACQUIRE_LOCK(&lk);
4362                 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4363                         return (0);
4364                 TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) {
4365                         if (adp->ad_state & DEPCOMPLETE)
4366                                 continue;
4367                         bp = adp->ad_buf;
4368                         if (getdirtybuf(&bp, waitfor) == 0) {
4369                                 if (waitfor == MNT_NOWAIT)
4370                                         continue;
4371                                 break;
4372                         }
4373                         FREE_LOCK(&lk);
4374                         if (waitfor == MNT_NOWAIT) {
4375                                 bawrite(bp);
4376                         } else if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) {
4377                                 ACQUIRE_LOCK(&lk);
4378                                 return (error);
4379                         }
4380                         ACQUIRE_LOCK(&lk);
4381                         break;
4382                 }
4383                 if (adp != NULL)
4384                         continue;
4385                 TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) {
4386                         if (adp->ad_state & DEPCOMPLETE)
4387                                 continue;
4388                         bp = adp->ad_buf;
4389                         if (getdirtybuf(&bp, waitfor) == 0) {
4390                                 if (waitfor == MNT_NOWAIT)
4391                                         continue;
4392                                 break;
4393                         }
4394                         FREE_LOCK(&lk);
4395                         if (waitfor == MNT_NOWAIT) {
4396                                 bawrite(bp);
4397                         } else if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) {
4398                                 ACQUIRE_LOCK(&lk);
4399                                 return (error);
4400                         }
4401                         ACQUIRE_LOCK(&lk);
4402                         break;
4403                 }
4404                 if (adp != NULL)
4405                         continue;
4406                 /*
4407                  * If pass2, we are done, otherwise do pass 2.
4408                  */
4409                 if (waitfor == MNT_WAIT)
4410                         break;
4411                 waitfor = MNT_WAIT;
4412         }
4413         /*
4414          * Try freeing inodedep in case all dependencies have been removed.
4415          */
4416         if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
4417                 (void) free_inodedep(inodedep);
4418         return (0);
4419 }
4420
4421 /*
4422  * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
4423  * Called with splbio blocked.
4424  */
4425 static int
4426 flush_pagedep_deps(pvp, mp, diraddhdp)
4427         struct vnode *pvp;
4428         struct mount *mp;
4429         struct diraddhd *diraddhdp;
4430 {
4431         struct thread *td = curthread;          /* XXX */
4432         struct ucred *cr;
4433         struct inodedep *inodedep;
4434         struct ufsmount *ump;
4435         struct diradd *dap;
4436         struct vnode *vp;
4437         int gotit, error = 0;
4438         struct buf *bp;
4439         ino_t inum;
4440
4441         KKASSERT(td->td_proc);
4442         cr = td->td_proc->p_ucred;
4443
4444         ump = VFSTOUFS(mp);
4445         while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
4446                 /*
4447                  * Flush ourselves if this directory entry
4448                  * has a MKDIR_PARENT dependency.
4449                  */
4450                 if (dap->da_state & MKDIR_PARENT) {
4451                         FREE_LOCK(&lk);
4452                         if ((error = UFS_UPDATE(pvp, 1)) != 0)
4453                                 break;
4454                         ACQUIRE_LOCK(&lk);
4455                         /*
4456                          * If that cleared dependencies, go on to next.
4457                          */
4458                         if (dap != LIST_FIRST(diraddhdp))
4459                                 continue;
4460                         if (dap->da_state & MKDIR_PARENT) {
4461                                 FREE_LOCK(&lk);
4462                                 panic("flush_pagedep_deps: MKDIR_PARENT");
4463                         }
4464                 }
4465                 /*
4466                  * A newly allocated directory must have its "." and
4467                  * ".." entries written out before its name can be
4468                  * committed in its parent. We do not want or need
4469                  * the full semantics of a synchronous VOP_FSYNC as
4470                  * that may end up here again, once for each directory
4471                  * level in the filesystem. Instead, we push the blocks
4472                  * and wait for them to clear. We have to fsync twice
4473                  * because the first call may choose to defer blocks
4474                  * that still have dependencies, but deferral will
4475                  * happen at most once.
4476                  */
4477                 inum = dap->da_newinum;
4478                 if (dap->da_state & MKDIR_BODY) {
4479                         FREE_LOCK(&lk);
4480                         if ((error = VFS_VGET(mp, inum, &vp)) != 0)
4481                                 break;
4482                         if ((error=VOP_FSYNC(vp, cr, MNT_NOWAIT, td)) ||
4483                             (error=VOP_FSYNC(vp, cr, MNT_NOWAIT, td))) {
4484                                 vput(vp);
4485                                 break;
4486                         }
4487                         drain_output(vp, 0);
4488                         vput(vp);
4489                         ACQUIRE_LOCK(&lk);
4490                         /*
4491                          * If that cleared dependencies, go on to next.
4492                          */
4493                         if (dap != LIST_FIRST(diraddhdp))
4494                                 continue;
4495                         if (dap->da_state & MKDIR_BODY) {
4496                                 FREE_LOCK(&lk);
4497                                 panic("flush_pagedep_deps: MKDIR_BODY");
4498                         }
4499                 }
4500                 /*
4501                  * Flush the inode on which the directory entry depends.
4502                  * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
4503                  * the only remaining dependency is that the updated inode
4504                  * count must get pushed to disk. The inode has already
4505                  * been pushed into its inode buffer (via VOP_UPDATE) at
4506                  * the time of the reference count change. So we need only
4507                  * locate that buffer, ensure that there will be no rollback
4508                  * caused by a bitmap dependency, then write the inode buffer.
4509                  */
4510                 if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
4511                         FREE_LOCK(&lk);
4512                         panic("flush_pagedep_deps: lost inode");
4513                 }
4514                 /*
4515                  * If the inode still has bitmap dependencies,
4516                  * push them to disk.
4517                  */
4518                 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4519                         gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
4520                         FREE_LOCK(&lk);
4521                         if (gotit &&
4522                             (error = VOP_BWRITE(inodedep->id_buf->b_vp,
4523                              inodedep->id_buf)) != 0)
4524                                 break;
4525                         ACQUIRE_LOCK(&lk);
4526                         if (dap != LIST_FIRST(diraddhdp))
4527                                 continue;
4528                 }
4529                 /*
4530                  * If the inode is still sitting in a buffer waiting
4531                  * to be written, push it to disk.
4532                  */
4533                 FREE_LOCK(&lk);
4534                 if ((error = bread(ump->um_devvp,
4535                     fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
4536                     (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0)
4537                         break;
4538                 if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0)
4539                         break;
4540                 ACQUIRE_LOCK(&lk);
4541                 /*
4542                  * If we have failed to get rid of all the dependencies
4543                  * then something is seriously wrong.
4544                  */
4545                 if (dap == LIST_FIRST(diraddhdp)) {
4546                         FREE_LOCK(&lk);
4547                         panic("flush_pagedep_deps: flush failed");
4548                 }
4549         }
4550         if (error)
4551                 ACQUIRE_LOCK(&lk);
4552         return (error);
4553 }
4554
4555 /*
4556  * A large burst of file addition or deletion activity can drive the
4557  * memory load excessively high. First attempt to slow things down
4558  * using the techniques below. If that fails, this routine requests
4559  * the offending operations to fall back to running synchronously
4560  * until the memory load returns to a reasonable level.
4561  */
4562 int
4563 softdep_slowdown(vp)
4564         struct vnode *vp;
4565 {
4566         int max_softdeps_hard;
4567
4568         max_softdeps_hard = max_softdeps * 11 / 10;
4569         if (num_dirrem < max_softdeps_hard / 2 &&
4570             num_inodedep < max_softdeps_hard)
4571                 return (0);
4572         stat_sync_limit_hit += 1;
4573         return (1);
4574 }
4575
4576 /*
4577  * If memory utilization has gotten too high, deliberately slow things
4578  * down and speed up the I/O processing.
4579  */
4580 static int
4581 request_cleanup(resource, islocked)
4582         int resource;
4583         int islocked;
4584 {
4585         struct thread *td = curthread;          /* XXX */
4586
4587         /*
4588          * We never hold up the filesystem syncer process.
4589          */
4590         if (td == filesys_syncer)
4591                 return (0);
4592         /*
4593          * First check to see if the work list has gotten backlogged.
4594          * If it has, co-opt this process to help clean up two entries.
4595          * Because this process may hold inodes locked, we cannot
4596          * handle any remove requests that might block on a locked
4597          * inode as that could lead to deadlock.
4598          */
4599         if (num_on_worklist > max_softdeps / 10) {
4600                 if (islocked)
4601                         FREE_LOCK(&lk);
4602                 process_worklist_item(NULL, LK_NOWAIT);
4603                 process_worklist_item(NULL, LK_NOWAIT);
4604                 stat_worklist_push += 2;
4605                 if (islocked)
4606                         ACQUIRE_LOCK(&lk);
4607                 return(1);
4608         }
4609
4610         /*
4611          * If we are resource constrained on inode dependencies, try
4612          * flushing some dirty inodes. Otherwise, we are constrained
4613          * by file deletions, so try accelerating flushes of directories
4614          * with removal dependencies. We would like to do the cleanup
4615          * here, but we probably hold an inode locked at this point and
4616          * that might deadlock against one that we try to clean. So,
4617          * the best that we can do is request the syncer daemon to do
4618          * the cleanup for us.
4619          */
4620         switch (resource) {
4621
4622         case FLUSH_INODES:
4623                 stat_ino_limit_push += 1;
4624                 req_clear_inodedeps += 1;
4625                 stat_countp = &stat_ino_limit_hit;
4626                 break;
4627
4628         case FLUSH_REMOVE:
4629                 stat_blk_limit_push += 1;
4630                 req_clear_remove += 1;
4631                 stat_countp = &stat_blk_limit_hit;
4632                 break;
4633
4634         default:
4635                 if (islocked)
4636                         FREE_LOCK(&lk);
4637                 panic("request_cleanup: unknown type");
4638         }
4639         /*
4640          * Hopefully the syncer daemon will catch up and awaken us.
4641          * We wait at most tickdelay before proceeding in any case.
4642          */
4643         if (islocked == 0)
4644                 ACQUIRE_LOCK(&lk);
4645         proc_waiting += 1;
4646         if (handle.callout == NULL)
4647                 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
4648         interlocked_sleep(&lk, SLEEP, (caddr_t)&proc_waiting, PPAUSE,
4649             "softupdate", 0);
4650         proc_waiting -= 1;
4651         if (islocked == 0)
4652                 FREE_LOCK(&lk);
4653         return (1);
4654 }
4655
4656 /*
4657  * Awaken processes pausing in request_cleanup and clear proc_waiting
4658  * to indicate that there is no longer a timer running.
4659  */
4660 void
4661 pause_timer(arg)
4662         void *arg;
4663 {
4664
4665         *stat_countp += 1;
4666         wakeup_one(&proc_waiting);
4667         if (proc_waiting > 0)
4668                 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
4669         else
4670                 handle.callout = NULL;
4671 }
4672
4673 /*
4674  * Flush out a directory with at least one removal dependency in an effort to
4675  * reduce the number of dirrem, freefile, and freeblks dependency structures.
4676  */
4677 static void
4678 clear_remove(struct thread *td)
4679 {
4680         struct pagedep_hashhead *pagedephd;
4681         struct pagedep *pagedep;
4682         static int next = 0;
4683         struct mount *mp;
4684         struct vnode *vp;
4685         int error, cnt;
4686         ino_t ino;
4687         struct ucred *cred;
4688
4689         KKASSERT(td->td_proc);
4690         cred = td->td_proc->p_ucred;
4691
4692         ACQUIRE_LOCK(&lk);
4693         for (cnt = 0; cnt < pagedep_hash; cnt++) {
4694                 pagedephd = &pagedep_hashtbl[next++];
4695                 if (next >= pagedep_hash)
4696                         next = 0;
4697                 LIST_FOREACH(pagedep, pagedephd, pd_hash) {
4698                         if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
4699                                 continue;
4700                         mp = pagedep->pd_mnt;
4701                         ino = pagedep->pd_ino;
4702                         FREE_LOCK(&lk);
4703                         if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
4704                                 softdep_error("clear_remove: vget", error);
4705                                 return;
4706                         }
4707                         if ((error = VOP_FSYNC(vp, cred, MNT_NOWAIT, td)))
4708                                 softdep_error("clear_remove: fsync", error);
4709                         drain_output(vp, 0);
4710                         vput(vp);
4711                         return;
4712                 }
4713         }
4714         FREE_LOCK(&lk);
4715 }
4716
4717 /*
4718  * Clear out a block of dirty inodes in an effort to reduce
4719  * the number of inodedep dependency structures.
4720  */
4721 static void
4722 clear_inodedeps(struct thread *td)
4723 {
4724         struct ucred *cred;
4725         struct inodedep_hashhead *inodedephd;
4726         struct inodedep *inodedep;
4727         static int next = 0;
4728         struct mount *mp;
4729         struct vnode *vp;
4730         struct fs *fs;
4731         int error, cnt;
4732         ino_t firstino, lastino, ino;
4733
4734         KKASSERT(td->td_proc);
4735         cred = td->td_proc->p_ucred;
4736
4737         ACQUIRE_LOCK(&lk);
4738         /*
4739          * Pick a random inode dependency to be cleared.
4740          * We will then gather up all the inodes in its block
4741          * that have dependencies and flush them out.
4742          */
4743         for (cnt = 0; cnt < inodedep_hash; cnt++) {
4744                 inodedephd = &inodedep_hashtbl[next++];
4745                 if (next >= inodedep_hash)
4746                         next = 0;
4747                 if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
4748                         break;
4749         }
4750         if (inodedep == NULL)
4751                 return;
4752         /*
4753          * Ugly code to find mount point given pointer to superblock.
4754          */
4755         fs = inodedep->id_fs;
4756         TAILQ_FOREACH(mp, &mountlist, mnt_list)
4757                 if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
4758                         break;
4759         /*
4760          * Find the last inode in the block with dependencies.
4761          */
4762         firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
4763         for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
4764                 if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
4765                         break;
4766         /*
4767          * Asynchronously push all but the last inode with dependencies.
4768          * Synchronously push the last inode with dependencies to ensure
4769          * that the inode block gets written to free up the inodedeps.
4770          */
4771         for (ino = firstino; ino <= lastino; ino++) {
4772                 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4773                         continue;
4774                 FREE_LOCK(&lk);
4775                 if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
4776                         softdep_error("clear_inodedeps: vget", error);
4777                         return;
4778                 }
4779                 if (ino == lastino) {
4780                         if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)))
4781                                 softdep_error("clear_inodedeps: fsync1", error);
4782                 } else {
4783                         if ((error = VOP_FSYNC(vp, cred, MNT_NOWAIT, td)))
4784                                 softdep_error("clear_inodedeps: fsync2", error);
4785                         drain_output(vp, 0);
4786                 }
4787                 vput(vp);
4788                 ACQUIRE_LOCK(&lk);
4789         }
4790         FREE_LOCK(&lk);
4791 }
4792
4793 /*
4794  * Function to determine if the buffer has outstanding dependencies
4795  * that will cause a roll-back if the buffer is written. If wantcount
4796  * is set, return number of dependencies, otherwise just yes or no.
4797  */
4798 static int
4799 softdep_count_dependencies(bp, wantcount)
4800         struct buf *bp;
4801         int wantcount;
4802 {
4803         struct worklist *wk;
4804         struct inodedep *inodedep;
4805         struct indirdep *indirdep;
4806         struct allocindir *aip;
4807         struct pagedep *pagedep;
4808         struct diradd *dap;
4809         int i, retval;
4810
4811         retval = 0;
4812         ACQUIRE_LOCK(&lk);
4813         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4814                 switch (wk->wk_type) {
4815
4816                 case D_INODEDEP:
4817                         inodedep = WK_INODEDEP(wk);
4818                         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4819                                 /* bitmap allocation dependency */
4820                                 retval += 1;
4821                                 if (!wantcount)
4822                                         goto out;
4823                         }
4824                         if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
4825                                 /* direct block pointer dependency */
4826                                 retval += 1;
4827                                 if (!wantcount)
4828                                         goto out;
4829                         }
4830                         continue;
4831
4832                 case D_INDIRDEP:
4833                         indirdep = WK_INDIRDEP(wk);
4834
4835                         LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
4836                                 /* indirect block pointer dependency */
4837                                 retval += 1;
4838                                 if (!wantcount)
4839                                         goto out;
4840                         }
4841                         continue;
4842
4843                 case D_PAGEDEP:
4844                         pagedep = WK_PAGEDEP(wk);
4845                         for (i = 0; i < DAHASHSZ; i++) {
4846
4847                                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
4848                                         /* directory entry dependency */
4849                                         retval += 1;
4850                                         if (!wantcount)
4851                                                 goto out;
4852                                 }
4853                         }
4854                         continue;
4855
4856                 case D_BMSAFEMAP:
4857                 case D_ALLOCDIRECT:
4858                 case D_ALLOCINDIR:
4859                 case D_MKDIR:
4860                         /* never a dependency on these blocks */
4861                         continue;
4862
4863                 default:
4864                         FREE_LOCK(&lk);
4865                         panic("softdep_check_for_rollback: Unexpected type %s",
4866                             TYPENAME(wk->wk_type));
4867                         /* NOTREACHED */
4868                 }
4869         }
4870 out:
4871         FREE_LOCK(&lk);
4872         return retval;
4873 }
4874
4875 /*
4876  * Acquire exclusive access to a buffer.
4877  * Must be called with splbio blocked.
4878  * Return 1 if buffer was acquired.
4879  */
4880 static int
4881 getdirtybuf(bpp, waitfor)
4882         struct buf **bpp;
4883         int waitfor;
4884 {
4885         struct buf *bp;
4886         int error;
4887
4888         for (;;) {
4889                 if ((bp = *bpp) == NULL)
4890                         return (0);
4891                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
4892                         if ((bp->b_xflags & BX_BKGRDINPROG) == 0)
4893                                 break;
4894                         BUF_UNLOCK(bp);
4895                         if (waitfor != MNT_WAIT)
4896                                 return (0);
4897                         bp->b_xflags |= BX_BKGRDWAIT;
4898                         interlocked_sleep(&lk, SLEEP, &bp->b_xflags, PRIBIO,
4899                             "getbuf", 0);
4900                         continue;
4901                 }
4902                 if (waitfor != MNT_WAIT)
4903                         return (0);
4904                 error = interlocked_sleep(&lk, LOCKBUF, bp,
4905                     LK_EXCLUSIVE | LK_SLEEPFAIL, 0, 0);
4906                 if (error != ENOLCK) {
4907                         FREE_LOCK(&lk);
4908                         panic("getdirtybuf: inconsistent lock");
4909                 }
4910         }
4911         if ((bp->b_flags & B_DELWRI) == 0) {
4912                 BUF_UNLOCK(bp);
4913                 return (0);
4914         }
4915         bremfree(bp);
4916         return (1);
4917 }
4918
4919 /*
4920  * Wait for pending output on a vnode to complete.
4921  * Must be called with vnode locked.
4922  */
4923 static void
4924 drain_output(vp, islocked)
4925         struct vnode *vp;
4926         int islocked;
4927 {
4928
4929         if (!islocked)
4930                 ACQUIRE_LOCK(&lk);
4931         while (vp->v_numoutput) {
4932                 vp->v_flag |= VBWAIT;
4933                 interlocked_sleep(&lk, SLEEP, (caddr_t)&vp->v_numoutput,
4934                     PRIBIO + 1, "drainvp", 0);
4935         }
4936         if (!islocked)
4937                 FREE_LOCK(&lk);
4938 }
4939
4940 /*
4941  * Called whenever a buffer that is being invalidated or reallocated
4942  * contains dependencies. This should only happen if an I/O error has
4943  * occurred. The routine is called with the buffer locked.
4944  */
4945 static void
4946 softdep_deallocate_dependencies(bp)
4947         struct buf *bp;
4948 {
4949
4950         if ((bp->b_flags & B_ERROR) == 0)
4951                 panic("softdep_deallocate_dependencies: dangling deps");
4952         softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
4953         panic("softdep_deallocate_dependencies: unrecovered I/O error");
4954 }
4955
/*
 * Function to handle asynchronous write errors in the filesystem.
 *
 * func is the text printed in front of the message (callers in this
 * file pass either a "function: operation" tag or a mount point name)
 * and error is the error number to report.  The error is only logged.
 */
void
softdep_error(func, error)
	char *func;
	int error;
{

	/* XXX should do something better! */
	(void) printf("%s: got error %d while accessing filesystem\n",
	    func, error);
}
4966         printf("%s: got error %d while accessing filesystem\n", func, error);
4967 }