kernel - Rewrite vnode ref-counting code to improve performance
dragonfly.git: sys/vfs/hammer/hammer_inode.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34
35 #include "hammer.h"
36 #include <vm/vm_extern.h>
37
38 static int      hammer_unload_inode(struct hammer_inode *ip);
39 static void     hammer_free_inode(hammer_inode_t ip);
40 static void     hammer_flush_inode_core(hammer_inode_t ip,
41                                         hammer_flush_group_t flg, int flags);
42 static int      hammer_setup_child_callback(hammer_record_t rec, void *data);
43 #if 0
44 static int      hammer_syncgrp_child_callback(hammer_record_t rec, void *data);
45 #endif
46 static int      hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
47                                         hammer_flush_group_t flg);
48 static int      hammer_setup_parent_inodes_helper(hammer_record_t record,
49                                         int depth, hammer_flush_group_t flg);
50 static void     hammer_inode_wakereclaims(hammer_inode_t ip);
51 static struct hammer_inostats *hammer_inode_inostats(hammer_mount_t hmp,
52                                         pid_t pid);
53
54 #ifdef DEBUG_TRUNCATE
55 extern struct hammer_inode *HammerTruncIp;
56 #endif
57
58 struct krate hammer_gen_krate = { 1 };
59
60 /*
61  * RB-Tree support for inode structures
62  */
63 int
64 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
65 {
66         if (ip1->obj_localization < ip2->obj_localization)
67                 return(-1);
68         if (ip1->obj_localization > ip2->obj_localization)
69                 return(1);
70         if (ip1->obj_id < ip2->obj_id)
71                 return(-1);
72         if (ip1->obj_id > ip2->obj_id)
73                 return(1);
74         if (ip1->obj_asof < ip2->obj_asof)
75                 return(-1);
76         if (ip1->obj_asof > ip2->obj_asof)
77                 return(1);
78         return(0);
79 }
80
81 int
82 hammer_redo_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
83 {
84         if (ip1->redo_fifo_start < ip2->redo_fifo_start)
85                 return(-1);
86         if (ip1->redo_fifo_start > ip2->redo_fifo_start)
87                 return(1);
88         return(0);
89 }
90
91 /*
92  * RB-Tree support for inode structures / special LOOKUP_INFO
93  */
94 static int
95 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
96 {
97         if (info->obj_localization < ip->obj_localization)
98                 return(-1);
99         if (info->obj_localization > ip->obj_localization)
100                 return(1);
101         if (info->obj_id < ip->obj_id)
102                 return(-1);
103         if (info->obj_id > ip->obj_id)
104                 return(1);
105         if (info->obj_asof < ip->obj_asof)
106                 return(-1);
107         if (info->obj_asof > ip->obj_asof)
108                 return(1);
109         return(0);
110 }
111
112 /*
113  * Used by hammer_scan_inode_snapshots() to locate all of an object's
114  * snapshots.  Note that the asof field is not tested, which we can get
115  * away with because it is the lowest-priority field.
116  */
117 static int
118 hammer_inode_info_cmp_all_history(hammer_inode_t ip, void *data)
119 {
120         hammer_inode_info_t info = data;
121
122         if (ip->obj_localization > info->obj_localization)
123                 return(1);
124         if (ip->obj_localization < info->obj_localization)
125                 return(-1);
126         if (ip->obj_id > info->obj_id)
127                 return(1);
128         if (ip->obj_id < info->obj_id)
129                 return(-1);
130         return(0);
131 }
132
133 /*
134  * Used by hammer_unload_pseudofs() to locate all inodes associated with
135  * a particular PFS.
136  */
137 static int
138 hammer_inode_pfs_cmp(hammer_inode_t ip, void *data)
139 {
140         u_int32_t localization = *(u_int32_t *)data;
141         if (ip->obj_localization > localization)
142                 return(1);
143         if (ip->obj_localization < localization)
144                 return(-1);
145         return(0);
146 }
147
148 /*
149  * RB-Tree support for pseudofs structures
150  */
151 static int
152 hammer_pfs_rb_compare(hammer_pseudofs_inmem_t p1, hammer_pseudofs_inmem_t p2)
153 {
154         if (p1->localization < p2->localization)
155                 return(-1);
156         if (p1->localization > p2->localization)
157                 return(1);
158         return(0);
159 }
160
161
162 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
163 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
164                 hammer_inode_info_cmp, hammer_inode_info_t);
165 RB_GENERATE2(hammer_pfs_rb_tree, hammer_pseudofs_inmem, rb_node,
166              hammer_pfs_rb_compare, u_int32_t, localization);
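
/*
 * Editor's sketch (not part of the original source): the inode RB-tree
 * generated above is keyed by (obj_localization, obj_id, obj_asof), in
 * that priority order, as encoded by hammer_ino_rb_compare() and
 * hammer_inode_info_cmp().  A cached-inode lookup fills in a
 * hammer_inode_info and uses the XLOOKUP variant generated above, exactly
 * as hammer_get_inode() does further down.  The helper name below is
 * hypothetical.
 */
#if 0
static hammer_inode_t
example_lookup_cached_inode(hammer_mount_t hmp, int64_t obj_id,
                            hammer_tid_t asof, u_int32_t localization)
{
        struct hammer_inode_info iinfo;

        iinfo.obj_id = obj_id;                  /* secondary key */
        iinfo.obj_asof = asof;                  /* lowest-priority key */
        iinfo.obj_localization = localization;  /* highest-priority key */

        /* returns NULL if the inode is not in the in-memory cache */
        return(hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo));
}
#endif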
167
168 /*
169  * The kernel is not actively referencing this vnode but is still holding
170  * it cached.
171  *
172  * This is called from the frontend.
173  *
174  * MPALMOSTSAFE
175  */
176 int
177 hammer_vop_inactive(struct vop_inactive_args *ap)
178 {
179         struct hammer_inode *ip = VTOI(ap->a_vp);
180         hammer_mount_t hmp;
181
182         /*
183          * Degenerate case
184          */
185         if (ip == NULL) {
186                 vrecycle(ap->a_vp);
187                 return(0);
188         }
189
190         /*
191          * If the inode no longer has visibility in the filesystem try to
192          * recycle it immediately, even if the inode is dirty.  Recycling
193          * it quickly allows the system to reclaim buffer cache and VM
194          * resources which can matter a lot in a heavily loaded system.
195          *
196          * This can deadlock in vfsync() if we aren't careful.
197          * 
198          * Do not queue the inode to the flusher if we still have visibility,
199          * otherwise namespace calls such as chmod will unnecessarily generate
200          * multiple inode updates.
201          */
202         if (ip->ino_data.nlinks == 0) {
203                 hmp = ip->hmp;
204                 lwkt_gettoken(&hmp->fs_token);
205                 hammer_inode_unloadable_check(ip, 0);
206                 if (ip->flags & HAMMER_INODE_MODMASK)
207                         hammer_flush_inode(ip, 0);
208                 lwkt_reltoken(&hmp->fs_token);
209                 vrecycle(ap->a_vp);
210         }
211         return(0);
212 }
213
214 /*
215  * Release the vnode association.  This is typically (but not always)
216  * the last reference on the inode.
217  *
218  * Once the association is lost we are on our own with regard to
219  * flushing the inode.
220  *
221  * We must interlock ip->vp so hammer_get_vnode() can avoid races.
222  */
223 int
224 hammer_vop_reclaim(struct vop_reclaim_args *ap)
225 {
226         struct hammer_inode *ip;
227         hammer_mount_t hmp;
228         struct vnode *vp;
229
230         vp = ap->a_vp;
231
232         if ((ip = vp->v_data) != NULL) {
233                 hmp = ip->hmp;
234                 lwkt_gettoken(&hmp->fs_token);
235                 hammer_lock_ex(&ip->lock);
236                 vp->v_data = NULL;
237                 ip->vp = NULL;
238
239                 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
240                         ++hammer_count_reclaims;
241                         ++hmp->count_reclaims;
242                         ip->flags |= HAMMER_INODE_RECLAIM;
243                 }
244                 hammer_unlock(&ip->lock);
245                 vclrisdirty(vp);
246                 hammer_rel_inode(ip, 1);
247                 lwkt_reltoken(&hmp->fs_token);
248         }
249         return(0);
250 }
251
252 /*
253  * Inform the kernel that the inode is dirty.  This will be checked
254  * by vn_unlock().
255  */
256 void
257 hammer_inode_dirty(struct hammer_inode *ip)
258 {
259         struct vnode *vp;
260
261         if ((ip->flags & HAMMER_INODE_MODMASK) &&
262             (vp = ip->vp) != NULL) {
263                 vsetisdirty(vp);
264         }
265 }
266
267 /*
268  * Return a locked vnode for the specified inode.  The inode must be
269  * referenced but NOT LOCKED on entry and will remain referenced on
270  * return.
271  *
272  * Called from the frontend.
273  */
274 int
275 hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
276 {
277         hammer_mount_t hmp;
278         struct vnode *vp;
279         int error = 0;
280         u_int8_t obj_type;
281
282         hmp = ip->hmp;
283
284         for (;;) {
285                 if ((vp = ip->vp) == NULL) {
286                         error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
287                         if (error)
288                                 break;
289                         hammer_lock_ex(&ip->lock);
290                         if (ip->vp != NULL) {
291                                 hammer_unlock(&ip->lock);
292                                 vp = *vpp;
293                                 vp->v_type = VBAD;
294                                 vx_put(vp);
295                                 continue;
296                         }
297                         hammer_ref(&ip->lock);
298                         vp = *vpp;
299                         ip->vp = vp;
300
301                         obj_type = ip->ino_data.obj_type;
302                         vp->v_type = hammer_get_vnode_type(obj_type);
303
304                         hammer_inode_wakereclaims(ip);
305
306                         switch(ip->ino_data.obj_type) {
307                         case HAMMER_OBJTYPE_CDEV:
308                         case HAMMER_OBJTYPE_BDEV:
309                                 vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
310                                 addaliasu(vp, ip->ino_data.rmajor,
311                                           ip->ino_data.rminor);
312                                 break;
313                         case HAMMER_OBJTYPE_FIFO:
314                                 vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
315                                 break;
316                         case HAMMER_OBJTYPE_REGFILE:
317                                 break;
318                         default:
319                                 break;
320                         }
321
322                         /*
323                          * Only mark as the root vnode if the ip is not
324                          * historical, otherwise the VFS cache will get
325                          * confused.  The other half of the special handling
326                          * is in hammer_vop_nlookupdotdot().
327                          *
328                          * Pseudo-filesystem roots can be accessed via
329                          * non-root filesystem paths and setting VROOT may
330                          * confuse the namecache.  Set VPFSROOT instead.
331                          */
332                         if (ip->obj_id == HAMMER_OBJID_ROOT &&
333                             ip->obj_asof == hmp->asof) {
334                                 if (ip->obj_localization == 0)
335                                         vsetflags(vp, VROOT);
336                                 else
337                                         vsetflags(vp, VPFSROOT);
338                         }
339
340                         vp->v_data = (void *)ip;
341                         /* vnode locked by getnewvnode() */
342                         /* make related vnode dirty if inode dirty? */
343                         hammer_unlock(&ip->lock);
344                         if (vp->v_type == VREG) {
345                                 vinitvmio(vp, ip->ino_data.size,
346                                           hammer_blocksize(ip->ino_data.size),
347                                           hammer_blockoff(ip->ino_data.size));
348                         }
349                         break;
350                 }
351
352                 /*
353                  * Interlock vnode clearing.  This does not prevent the
354                  * vnode from going into a reclaimed state but it does
355                  * prevent it from being destroyed or reused so the vget()
356                  * will properly fail.
357                  */
358                 hammer_lock_ex(&ip->lock);
359                 if ((vp = ip->vp) == NULL) {
360                         hammer_unlock(&ip->lock);
361                         continue;
362                 }
363                 vhold(vp);
364                 hammer_unlock(&ip->lock);
365
366                 /*
367                  * loop if the vget fails (aka races), or if the vp
368                  * no longer matches ip->vp.
369                  */
370                 if (vget(vp, LK_EXCLUSIVE) == 0) {
371                         if (vp == ip->vp) {
372                                 vdrop(vp);
373                                 break;
374                         }
375                         vput(vp);
376                 }
377                 vdrop(vp);
378         }
379         *vpp = vp;
380         return(error);
381 }
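
/*
 * Editor's sketch (not part of the original source): the typical frontend
 * sequence for going from an object id to a locked vnode.
 * hammer_get_inode() returns a referenced, unlocked inode;
 * hammer_get_vnode() then attaches or revalidates the vnode and returns
 * it locked via *vpp.  The caller's inode reference can be dropped once
 * the vnode association holds its own.  The wrapper name is hypothetical
 * and error handling is abbreviated.
 */
#if 0
static int
example_lookup_vnode(hammer_transaction_t trans, hammer_inode_t dip,
                     int64_t obj_id, struct vnode **vpp)
{
        struct hammer_inode *ip;
        int error;

        ip = hammer_get_inode(trans, dip, obj_id, trans->hmp->asof,
                              dip->obj_localization, 0, &error);
        if (ip == NULL)
                return(error);
        error = hammer_get_vnode(ip, vpp);      /* *vpp returned locked */
        hammer_rel_inode(ip, 0);                /* vp holds its own ref */
        return(error);
}
#endif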
382
383 /*
384  * Locate all copies of the inode for obj_id compatible with the specified
385  * asof, reference each one, and issue the related call-back.  This routine is used
386  * for direct-io invalidation and does not create any new inodes.
387  */
388 void
389 hammer_scan_inode_snapshots(hammer_mount_t hmp, hammer_inode_info_t iinfo,
390                             int (*callback)(hammer_inode_t ip, void *data),
391                             void *data)
392 {
393         hammer_ino_rb_tree_RB_SCAN(&hmp->rb_inos_root,
394                                    hammer_inode_info_cmp_all_history,
395                                    callback, iinfo);
396 }
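
/*
 * Editor's sketch (not part of the original source): a caller of
 * hammer_scan_inode_snapshots() supplies a hammer_inode_info keyed by
 * obj_id/localization (asof is ignored by the _all_history comparator
 * above) plus a callback that is issued for every cached snapshot copy of
 * the object.  Returning 0 from the callback continues the scan; a
 * negative return stops it (compare hammer_unload_pseudofs_callback()
 * below).  The callback and wrapper names are hypothetical.
 */
#if 0
static int
example_snapshot_callback(hammer_inode_t ip, void *data)
{
        /* e.g. invalidate per-snapshot direct-io state for ip here */
        return(0);                              /* keep scanning */
}

static void
example_scan_all_snapshots(hammer_mount_t hmp, int64_t obj_id,
                           u_int32_t localization)
{
        struct hammer_inode_info iinfo;

        iinfo.obj_id = obj_id;
        iinfo.obj_localization = localization;
        iinfo.obj_asof = 0;                     /* not tested by this scan */

        hammer_scan_inode_snapshots(hmp, &iinfo,
                                    example_snapshot_callback, NULL);
}
#endif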
397
398 /*
399  * Acquire a HAMMER inode.  The returned inode is not locked.  These functions
400  * do not attach or detach the related vnode (use hammer_get_vnode() for
401  * that).
402  *
403  * The flags argument is only applied for newly created inodes, and only
404  * certain flags are inherited.
405  *
406  * Called from the frontend.
407  */
408 struct hammer_inode *
409 hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
410                  int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
411                  int flags, int *errorp)
412 {
413         hammer_mount_t hmp = trans->hmp;
414         struct hammer_node_cache *cachep;
415         struct hammer_inode_info iinfo;
416         struct hammer_cursor cursor;
417         struct hammer_inode *ip;
418
419
420         /*
421          * Determine if we already have an inode cached.  If we do then
422          * we are golden.
423          *
424          * If we find an inode with no vnode we have to mark the
425          * transaction such that hammer_inode_waitreclaims() is
426          * called later on to avoid building up an infinite number
427          * of inodes.  Otherwise we can continue to add new inodes
428          * faster than they can be disposed of, even with the tsleep
429          * delay.
430          *
431          * If we find a dummy inode we return a failure so dounlink
432          * (which does another lookup) doesn't try to mess with the
433          * link count.  hammer_vop_nresolve() uses hammer_get_dummy_inode()
434          * to ref dummy inodes.
435          */
436         iinfo.obj_id = obj_id;
437         iinfo.obj_asof = asof;
438         iinfo.obj_localization = localization;
439 loop:
440         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
441         if (ip) {
442                 if (ip->flags & HAMMER_INODE_DUMMY) {
443                         *errorp = ENOENT;
444                         return(NULL);
445                 }
446                 hammer_ref(&ip->lock);
447                 *errorp = 0;
448                 return(ip);
449         }
450
451         /*
452          * Allocate a new inode structure and deal with races later.
453          */
454         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
455         ++hammer_count_inodes;
456         ++hmp->count_inodes;
457         ip->obj_id = obj_id;
458         ip->obj_asof = iinfo.obj_asof;
459         ip->obj_localization = localization;
460         ip->hmp = hmp;
461         ip->flags = flags & HAMMER_INODE_RO;
462         ip->cache[0].ip = ip;
463         ip->cache[1].ip = ip;
464         ip->cache[2].ip = ip;
465         ip->cache[3].ip = ip;
466         if (hmp->ronly)
467                 ip->flags |= HAMMER_INODE_RO;
468         ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
469                 0x7FFFFFFFFFFFFFFFLL;
470         RB_INIT(&ip->rec_tree);
471         TAILQ_INIT(&ip->target_list);
472         hammer_ref(&ip->lock);
473
474         /*
475          * Locate the on-disk inode.  If this is a PFS root we always
476          * access the current version of the root inode and (if it is not
477          * a master) always access information under it with a snapshot
478          * TID.
479          *
480          * We cache recent inode lookups in this directory in dip->cache[2].
481          * If that cache is empty we fall back to dip->cache[0], assuming
482          * the inode we are looking for is close to the directory inode.
483          */
484 retry:
485         cachep = NULL;
486         if (dip) {
487                 if (dip->cache[2].node)
488                         cachep = &dip->cache[2];
489                 else
490                         cachep = &dip->cache[0];
491         }
492         hammer_init_cursor(trans, &cursor, cachep, NULL);
493         cursor.key_beg.localization = localization + HAMMER_LOCALIZE_INODE;
494         cursor.key_beg.obj_id = ip->obj_id;
495         cursor.key_beg.key = 0;
496         cursor.key_beg.create_tid = 0;
497         cursor.key_beg.delete_tid = 0;
498         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
499         cursor.key_beg.obj_type = 0;
500
501         cursor.asof = iinfo.obj_asof;
502         cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA |
503                        HAMMER_CURSOR_ASOF;
504
505         *errorp = hammer_btree_lookup(&cursor);
506         if (*errorp == EDEADLK) {
507                 hammer_done_cursor(&cursor);
508                 goto retry;
509         }
510
511         /*
512          * On success the B-Tree lookup will hold the appropriate
513          * buffer cache buffers and provide a pointer to the requested
514          * information.  Copy the information to the in-memory inode
515          * and cache the B-Tree node to improve future operations.
516          */
517         if (*errorp == 0) {
518                 ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
519                 ip->ino_data = cursor.data->inode;
520
521                 /*
522                  * cache[0] tries to cache the location of the object inode.
523                  * The assumption is that it is near the directory inode.
524                  *
525                  * cache[1] tries to cache the location of the object data.
526                  * We might have something in the governing directory from
527                  * scan optimizations (see the strategy code in
528                  * hammer_vnops.c).
529                  *
530                  * We update dip->cache[2], if possible, with the location
531                  * of the object inode for future directory shortcuts.
532                  */
533                 hammer_cache_node(&ip->cache[0], cursor.node);
534                 if (dip) {
535                         if (dip->cache[3].node) {
536                                 hammer_cache_node(&ip->cache[1],
537                                                   dip->cache[3].node);
538                         }
539                         hammer_cache_node(&dip->cache[2], cursor.node);
540                 }
541
542                 /*
543                  * The file should not contain any data past the file size
544                  * stored in the inode.  Setting save_trunc_off to the
545                  * file size instead of max reduces B-Tree lookup overheads
546                  * on append by allowing the flusher to avoid checking for
547                  * record overwrites.
548                  */
549                 ip->save_trunc_off = ip->ino_data.size;
550
551                 /*
552                  * Locate and assign the pseudofs management structure to
553                  * the inode.
554                  */
555                 if (dip && dip->obj_localization == ip->obj_localization) {
556                         ip->pfsm = dip->pfsm;
557                         hammer_ref(&ip->pfsm->lock);
558                 } else {
559                         ip->pfsm = hammer_load_pseudofs(trans,
560                                                         ip->obj_localization,
561                                                         errorp);
562                         *errorp = 0;    /* ignore ENOENT */
563                 }
564         }
565
566         /*
567          * The inode is placed on the red-black tree and will be synced to
568          * the media when flushed or by the filesystem sync.  If this races
569          * another instantiation/lookup the insertion will fail.
570          */
571         if (*errorp == 0) {
572                 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
573                         hammer_free_inode(ip);
574                         hammer_done_cursor(&cursor);
575                         goto loop;
576                 }
577                 ip->flags |= HAMMER_INODE_ONDISK;
578         } else {
579                 if (ip->flags & HAMMER_INODE_RSV_INODES) {
580                         ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
581                         --hmp->rsv_inodes;
582                 }
583
584                 hammer_free_inode(ip);
585                 ip = NULL;
586         }
587         hammer_done_cursor(&cursor);
588
589         /*
590          * NEWINODE is only set if the inode becomes dirty later,
591          * setting it here just leads to unnecessary stalls.
592          *
593          * trans->flags |= HAMMER_TRANSF_NEWINODE;
594          */
595         return (ip);
596 }
597
598 /*
599  * Get a dummy inode to placemark a broken directory entry.
600  */
601 struct hammer_inode *
602 hammer_get_dummy_inode(hammer_transaction_t trans, hammer_inode_t dip,
603                  int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
604                  int flags, int *errorp)
605 {
606         hammer_mount_t hmp = trans->hmp;
607         struct hammer_inode_info iinfo;
608         struct hammer_inode *ip;
609
610         /*
611          * Determine if we already have an inode cached.  If we do then
612          * we are golden.
613          *
614          * If we find an inode with no vnode we have to mark the
615          * transaction such that hammer_inode_waitreclaims() is
616          * called later on to avoid building up an infinite number
617          * of inodes.  Otherwise we can continue to add new inodes
618          * faster than they can be disposed of, even with the tsleep
619          * delay.
620          *
621          * If we find a non-fake inode we return an error.  Only fake
622          * inodes can be returned by this routine.
623          */
624         iinfo.obj_id = obj_id;
625         iinfo.obj_asof = asof;
626         iinfo.obj_localization = localization;
627 loop:
628         *errorp = 0;
629         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
630         if (ip) {
631                 if ((ip->flags & HAMMER_INODE_DUMMY) == 0) {
632                         *errorp = ENOENT;
633                         return(NULL);
634                 }
635                 hammer_ref(&ip->lock);
636                 return(ip);
637         }
638
639         /*
640          * Allocate a new inode structure and deal with races later.
641          */
642         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
643         ++hammer_count_inodes;
644         ++hmp->count_inodes;
645         ip->obj_id = obj_id;
646         ip->obj_asof = iinfo.obj_asof;
647         ip->obj_localization = localization;
648         ip->hmp = hmp;
649         ip->flags = flags | HAMMER_INODE_RO | HAMMER_INODE_DUMMY;
650         ip->cache[0].ip = ip;
651         ip->cache[1].ip = ip;
652         ip->cache[2].ip = ip;
653         ip->cache[3].ip = ip;
654         ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
655                 0x7FFFFFFFFFFFFFFFLL;
656         RB_INIT(&ip->rec_tree);
657         TAILQ_INIT(&ip->target_list);
658         hammer_ref(&ip->lock);
659
660         /*
661          * Populate the dummy inode.  Leave everything zero'd out.
662          *
663          * (ip->ino_leaf and ip->ino_data)
664          *
665          * Make the dummy inode a FIFO object which most copy programs
666          * will properly ignore.
667          */
668         ip->save_trunc_off = ip->ino_data.size;
669         ip->ino_data.obj_type = HAMMER_OBJTYPE_FIFO;
670
671         /*
672          * Locate and assign the pseudofs management structure to
673          * the inode.
674          */
675         if (dip && dip->obj_localization == ip->obj_localization) {
676                 ip->pfsm = dip->pfsm;
677                 hammer_ref(&ip->pfsm->lock);
678         } else {
679                 ip->pfsm = hammer_load_pseudofs(trans, ip->obj_localization,
680                                                 errorp);
681                 *errorp = 0;    /* ignore ENOENT */
682         }
683
684         /*
685          * The inode is placed on the red-black tree and will be synced to
686          * the media when flushed or by the filesystem sync.  If this races
687          * another instantiation/lookup the insertion will fail.
688          *
689          * NOTE: Do not set HAMMER_INODE_ONDISK.  The inode is a fake.
690          */
691         if (*errorp == 0) {
692                 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
693                         hammer_free_inode(ip);
694                         goto loop;
695                 }
696         } else {
697                 if (ip->flags & HAMMER_INODE_RSV_INODES) {
698                         ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
699                         --hmp->rsv_inodes;
700                 }
701                 hammer_free_inode(ip);
702                 ip = NULL;
703         }
704         trans->flags |= HAMMER_TRANSF_NEWINODE;
705         return (ip);
706 }
707
708 /*
709  * Return a referenced inode only if it is in our inode cache.
710  *
711  * Dummy inodes do not count.
712  */
713 struct hammer_inode *
714 hammer_find_inode(hammer_transaction_t trans, int64_t obj_id,
715                   hammer_tid_t asof, u_int32_t localization)
716 {
717         hammer_mount_t hmp = trans->hmp;
718         struct hammer_inode_info iinfo;
719         struct hammer_inode *ip;
720
721         iinfo.obj_id = obj_id;
722         iinfo.obj_asof = asof;
723         iinfo.obj_localization = localization;
724
725         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
726         if (ip) {
727                 if (ip->flags & HAMMER_INODE_DUMMY)
728                         ip = NULL;
729                 else
730                         hammer_ref(&ip->lock);
731         }
732         return(ip);
733 }
734
735 /*
736  * Create a new filesystem object, returning the inode in *ipp.  The
737  * returned inode will be referenced.  The inode is created in-memory.
738  *
739  * If pfsm is non-NULL the caller wishes to create the root inode for
740  * a master PFS.
741  */
742 int
743 hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
744                     struct ucred *cred,
745                     hammer_inode_t dip, const char *name, int namelen,
746                     hammer_pseudofs_inmem_t pfsm, struct hammer_inode **ipp)
747 {
748         hammer_mount_t hmp;
749         hammer_inode_t ip;
750         uid_t xuid;
751         int error;
752         int64_t namekey;
753         u_int32_t dummy;
754
755         hmp = trans->hmp;
756
757         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
758         ++hammer_count_inodes;
759         ++hmp->count_inodes;
760         trans->flags |= HAMMER_TRANSF_NEWINODE;
761
762         if (pfsm) {
763                 KKASSERT(pfsm->localization != 0);
764                 ip->obj_id = HAMMER_OBJID_ROOT;
765                 ip->obj_localization = pfsm->localization;
766         } else {
767                 KKASSERT(dip != NULL);
768                 namekey = hammer_directory_namekey(dip, name, namelen, &dummy);
769                 ip->obj_id = hammer_alloc_objid(hmp, dip, namekey);
770                 ip->obj_localization = dip->obj_localization;
771         }
772
773         KKASSERT(ip->obj_id != 0);
774         ip->obj_asof = hmp->asof;
775         ip->hmp = hmp;
776         ip->flush_state = HAMMER_FST_IDLE;
777         ip->flags = HAMMER_INODE_DDIRTY |
778                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME;
779         ip->cache[0].ip = ip;
780         ip->cache[1].ip = ip;
781         ip->cache[2].ip = ip;
782         ip->cache[3].ip = ip;
783
784         ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
785         /* ip->save_trunc_off = 0; (already zero) */
786         RB_INIT(&ip->rec_tree);
787         TAILQ_INIT(&ip->target_list);
788
789         ip->ino_data.atime = trans->time;
790         ip->ino_data.mtime = trans->time;
791         ip->ino_data.size = 0;
792         ip->ino_data.nlinks = 0;
793
794         /*
795          * A nohistory designator on the parent directory is inherited by
796          * the child.  We will do this even for pseudo-fs creation... the
797          * sysad can turn it off.
798          */
799         if (dip) {
800                 ip->ino_data.uflags = dip->ino_data.uflags &
801                                       (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP);
802         }
803
804         ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
805         ip->ino_leaf.base.localization = ip->obj_localization +
806                                          HAMMER_LOCALIZE_INODE;
807         ip->ino_leaf.base.obj_id = ip->obj_id;
808         ip->ino_leaf.base.key = 0;
809         ip->ino_leaf.base.create_tid = 0;
810         ip->ino_leaf.base.delete_tid = 0;
811         ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
812         ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);
813
814         ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
815         ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
816         ip->ino_data.mode = vap->va_mode;
817         ip->ino_data.ctime = trans->time;
818
819         /*
820          * If we are running version 2 or greater directory entries are
821          * inode-localized instead of data-localized.
822          */
823         if (trans->hmp->version >= HAMMER_VOL_VERSION_TWO) {
824                 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
825                         ip->ino_data.cap_flags |=
826                                 HAMMER_INODE_CAP_DIR_LOCAL_INO;
827                 }
828         }
829         if (trans->hmp->version >= HAMMER_VOL_VERSION_SIX) {
830                 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
831                         ip->ino_data.cap_flags |=
832                                 HAMMER_INODE_CAP_DIRHASH_ALG1;
833                 }
834         }
835
836         /*
837          * Setup the ".." pointer.  This only needs to be done for directories
838          * but we do it for all objects as a recovery aid.
839          */
840         if (dip)
841                 ip->ino_data.parent_obj_id = dip->ino_leaf.base.obj_id;
842 #if 0
843         /*
844          * The parent_obj_localization field only applies to pseudo-fs roots.
845          * XXX this is no longer applicable, PFSs are no longer directly
846          * tied into the parent's directory structure.
847          */
848         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY &&
849             ip->obj_id == HAMMER_OBJID_ROOT) {
850                 ip->ino_data.ext.obj.parent_obj_localization = 
851                                                 dip->obj_localization;
852         }
853 #endif
854
855         switch(ip->ino_leaf.base.obj_type) {
856         case HAMMER_OBJTYPE_CDEV:
857         case HAMMER_OBJTYPE_BDEV:
858                 ip->ino_data.rmajor = vap->va_rmajor;
859                 ip->ino_data.rminor = vap->va_rminor;
860                 break;
861         default:
862                 break;
863         }
864
865         /*
866          * Calculate default uid/gid and overwrite with information from
867          * the vap.
868          */
869         if (dip) {
870                 xuid = hammer_to_unix_xid(&dip->ino_data.uid);
871                 xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode,
872                                              xuid, cred, &vap->va_mode);
873         } else {
874                 xuid = 0;
875         }
876         ip->ino_data.mode = vap->va_mode;
877
878         if (vap->va_vaflags & VA_UID_UUID_VALID)
879                 ip->ino_data.uid = vap->va_uid_uuid;
880         else if (vap->va_uid != (uid_t)VNOVAL)
881                 hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
882         else
883                 hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
884
885         if (vap->va_vaflags & VA_GID_UUID_VALID)
886                 ip->ino_data.gid = vap->va_gid_uuid;
887         else if (vap->va_gid != (gid_t)VNOVAL)
888                 hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
889         else if (dip)
890                 ip->ino_data.gid = dip->ino_data.gid;
891
892         hammer_ref(&ip->lock);
893
894         if (pfsm) {
895                 ip->pfsm = pfsm;
896                 hammer_ref(&pfsm->lock);
897                 error = 0;
898         } else if (dip->obj_localization == ip->obj_localization) {
899                 ip->pfsm = dip->pfsm;
900                 hammer_ref(&ip->pfsm->lock);
901                 error = 0;
902         } else {
903                 ip->pfsm = hammer_load_pseudofs(trans,
904                                                 ip->obj_localization,
905                                                 &error);
906                 error = 0;      /* ignore ENOENT */
907         }
908
909         if (error) {
910                 hammer_free_inode(ip);
911                 ip = NULL;
912         } else if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
913                 panic("hammer_create_inode: duplicate obj_id %llx",
914                       (long long)ip->obj_id);
915                 /* not reached */
916                 hammer_free_inode(ip);
917         }
918         *ipp = ip;
919         return(error);
920 }
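
/*
 * Editor's sketch (not part of the original source): creating a new
 * in-memory inode under a parent directory.  hammer_mkroot_pseudofs()
 * below shows the pfsm (PFS root) variant; the variant sketched here
 * passes the parent directory inode plus the new name instead.  Real
 * callers go on to add a directory entry and obtain a vnode, which is
 * omitted here.  The wrapper name is hypothetical.
 */
#if 0
static int
example_create_object(hammer_transaction_t trans, struct ucred *cred,
                      hammer_inode_t dip, const char *name, int namelen,
                      struct vattr *vap, struct hammer_inode **ipp)
{
        int error;

        error = hammer_create_inode(trans, vap, cred, dip, name, namelen,
                                    NULL, ipp);
        /*
         * On success *ipp is a referenced in-memory inode with nlinks 0.
         * Callers normally link it into the directory next and eventually
         * drop the reference with hammer_rel_inode().
         */
        return(error);
}
#endif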
921
922 /*
923  * Final cleanup / freeing of an inode structure
924  */
925 static void
926 hammer_free_inode(hammer_inode_t ip)
927 {
928         struct hammer_mount *hmp;
929
930         hmp = ip->hmp;
931         KKASSERT(hammer_oneref(&ip->lock));
932         hammer_uncache_node(&ip->cache[0]);
933         hammer_uncache_node(&ip->cache[1]);
934         hammer_uncache_node(&ip->cache[2]);
935         hammer_uncache_node(&ip->cache[3]);
936         hammer_inode_wakereclaims(ip);
937         if (ip->objid_cache)
938                 hammer_clear_objid(ip);
939         --hammer_count_inodes;
940         --hmp->count_inodes;
941         if (ip->pfsm) {
942                 hammer_rel_pseudofs(hmp, ip->pfsm);
943                 ip->pfsm = NULL;
944         }
945         kfree(ip, hmp->m_inodes);
946         ip = NULL;
947 }
948
949 /*
950  * Retrieve pseudo-fs data.  NULL will never be returned.
951  *
952  * If an error occurs *errorp will be set and a default template is returned,
953  * otherwise *errorp is set to 0.  Typically when an error occurs it will
954  * be ENOENT.
955  */
956 hammer_pseudofs_inmem_t
957 hammer_load_pseudofs(hammer_transaction_t trans,
958                      u_int32_t localization, int *errorp)
959 {
960         hammer_mount_t hmp = trans->hmp;
961         hammer_inode_t ip;
962         hammer_pseudofs_inmem_t pfsm;
963         struct hammer_cursor cursor;
964         int bytes;
965
966 retry:
967         pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, localization);
968         if (pfsm) {
969                 hammer_ref(&pfsm->lock);
970                 *errorp = 0;
971                 return(pfsm);
972         }
973
974         /*
975          * PFS records are stored in the root inode (not the PFS root inode,
976          * but the real root).  Avoid an infinite recursion if loading
977          * the PFS for the real root.
978          */
979         if (localization) {
980                 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT,
981                                       HAMMER_MAX_TID,
982                                       HAMMER_DEF_LOCALIZATION, 0, errorp);
983         } else {
984                 ip = NULL;
985         }
986
987         pfsm = kmalloc(sizeof(*pfsm), hmp->m_misc, M_WAITOK | M_ZERO);
988         pfsm->localization = localization;
989         pfsm->pfsd.unique_uuid = trans->rootvol->ondisk->vol_fsid;
990         pfsm->pfsd.shared_uuid = pfsm->pfsd.unique_uuid;
991
992         hammer_init_cursor(trans, &cursor, (ip ? &ip->cache[1] : NULL), ip);
993         cursor.key_beg.localization = HAMMER_DEF_LOCALIZATION +
994                                       HAMMER_LOCALIZE_MISC;
995         cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
996         cursor.key_beg.create_tid = 0;
997         cursor.key_beg.delete_tid = 0;
998         cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
999         cursor.key_beg.obj_type = 0;
1000         cursor.key_beg.key = localization;
1001         cursor.asof = HAMMER_MAX_TID;
1002         cursor.flags |= HAMMER_CURSOR_ASOF;
1003
1004         if (ip)
1005                 *errorp = hammer_ip_lookup(&cursor);
1006         else
1007                 *errorp = hammer_btree_lookup(&cursor);
1008         if (*errorp == 0) {
1009                 *errorp = hammer_ip_resolve_data(&cursor);
1010                 if (*errorp == 0) {
1011                         if (cursor.data->pfsd.mirror_flags &
1012                             HAMMER_PFSD_DELETED) {
1013                                 *errorp = ENOENT;
1014                         } else {
1015                                 bytes = cursor.leaf->data_len;
1016                                 if (bytes > sizeof(pfsm->pfsd))
1017                                         bytes = sizeof(pfsm->pfsd);
1018                                 bcopy(cursor.data, &pfsm->pfsd, bytes);
1019                         }
1020                 }
1021         }
1022         hammer_done_cursor(&cursor);
1023
1024         pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
1025         hammer_ref(&pfsm->lock);
1026         if (ip)
1027                 hammer_rel_inode(ip, 0);
1028         if (RB_INSERT(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm)) {
1029                 kfree(pfsm, hmp->m_misc);
1030                 goto retry;
1031         }
1032         return(pfsm);
1033 }
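
/*
 * Editor's sketch (not part of the original source):
 * hammer_load_pseudofs() always returns a referenced pfsm (a default
 * template plus *errorp on failure), so every call must eventually be
 * balanced by hammer_rel_pseudofs(), which frees the structure once the
 * last reference goes away.  The wrapper name is hypothetical.
 */
#if 0
static void
example_with_pseudofs(hammer_transaction_t trans, u_int32_t localization)
{
        hammer_pseudofs_inmem_t pfsm;
        int error;

        pfsm = hammer_load_pseudofs(trans, localization, &error);
        /* inspect pfsm->pfsd; error == ENOENT means the default template */
        hammer_rel_pseudofs(trans->hmp, pfsm);
}
#endif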
1034
1035 /*
1036  * Store pseudo-fs data.  The backend will automatically delete any prior
1037  * on-disk pseudo-fs data but we have to delete in-memory versions.
1038  */
1039 int
1040 hammer_save_pseudofs(hammer_transaction_t trans, hammer_pseudofs_inmem_t pfsm)
1041 {
1042         struct hammer_cursor cursor;
1043         hammer_record_t record;
1044         hammer_inode_t ip;
1045         int error;
1046
1047         ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
1048                               HAMMER_DEF_LOCALIZATION, 0, &error);
1049 retry:
1050         pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
1051         hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
1052         cursor.key_beg.localization = ip->obj_localization +
1053                                       HAMMER_LOCALIZE_MISC;
1054         cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
1055         cursor.key_beg.create_tid = 0;
1056         cursor.key_beg.delete_tid = 0;
1057         cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
1058         cursor.key_beg.obj_type = 0;
1059         cursor.key_beg.key = pfsm->localization;
1060         cursor.asof = HAMMER_MAX_TID;
1061         cursor.flags |= HAMMER_CURSOR_ASOF;
1062
1063         /*
1064          * Replace any in-memory version of the record.
1065          */
1066         error = hammer_ip_lookup(&cursor);
1067         if (error == 0 && hammer_cursor_inmem(&cursor)) {
1068                 record = cursor.iprec;
1069                 if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
1070                         KKASSERT(cursor.deadlk_rec == NULL);
1071                         hammer_ref(&record->lock);
1072                         cursor.deadlk_rec = record;
1073                         error = EDEADLK;
1074                 } else {
1075                         record->flags |= HAMMER_RECF_DELETED_FE;
1076                         error = 0;
1077                 }
1078         }
1079
1080         /*
1081          * Allocate replacement general record.  The backend flush will
1082          * delete any on-disk version of the record.
1083          */
1084         if (error == 0 || error == ENOENT) {
1085                 record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
1086                 record->type = HAMMER_MEM_RECORD_GENERAL;
1087
1088                 record->leaf.base.localization = ip->obj_localization +
1089                                                  HAMMER_LOCALIZE_MISC;
1090                 record->leaf.base.rec_type = HAMMER_RECTYPE_PFS;
1091                 record->leaf.base.key = pfsm->localization;
1092                 record->leaf.data_len = sizeof(pfsm->pfsd);
1093                 bcopy(&pfsm->pfsd, record->data, sizeof(pfsm->pfsd));
1094                 error = hammer_ip_add_record(trans, record);
1095         }
1096         hammer_done_cursor(&cursor);
1097         if (error == EDEADLK)
1098                 goto retry;
1099         hammer_rel_inode(ip, 0);
1100         return(error);
1101 }
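
/*
 * Editor's sketch (not part of the original source): updating on-media
 * PFS data is a load / modify / save / release sequence.
 * hammer_save_pseudofs() queues a replacement general record and the
 * backend flush deletes any prior on-disk version.  The wrapper name is
 * hypothetical and the pfsd modification is left abstract.
 */
#if 0
static int
example_update_pseudofs(hammer_transaction_t trans, u_int32_t localization)
{
        hammer_pseudofs_inmem_t pfsm;
        int error;

        pfsm = hammer_load_pseudofs(trans, localization, &error);
        if (error == 0) {
                /* ... adjust fields in pfsm->pfsd here ... */
                error = hammer_save_pseudofs(trans, pfsm);
        }
        hammer_rel_pseudofs(trans->hmp, pfsm);
        return(error);
}
#endif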
1102
1103 /*
1104  * Create a root directory for a PFS if one does not already exist.
1105  *
1106  * The PFS root stands alone so we must also bump the nlinks count
1107  * to prevent it from being destroyed on release.
1108  */
1109 int
1110 hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
1111                        hammer_pseudofs_inmem_t pfsm)
1112 {
1113         hammer_inode_t ip;
1114         struct vattr vap;
1115         int error;
1116
1117         ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
1118                               pfsm->localization, 0, &error);
1119         if (ip == NULL) {
1120                 vattr_null(&vap);
1121                 vap.va_mode = 0755;
1122                 vap.va_type = VDIR;
1123                 error = hammer_create_inode(trans, &vap, cred,
1124                                             NULL, NULL, 0,
1125                                             pfsm, &ip);
1126                 if (error == 0) {
1127                         ++ip->ino_data.nlinks;
1128                         hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY);
1129                 }
1130         }
1131         if (ip)
1132                 hammer_rel_inode(ip, 0);
1133         return(error);
1134 }
1135
1136 /*
1137  * Unload any vnodes & inodes associated with a PFS, return ENOTEMPTY
1138  * if we are unable to disassociate all the inodes.
1139  */
1140 static
1141 int
1142 hammer_unload_pseudofs_callback(hammer_inode_t ip, void *data)
1143 {
1144         int res;
1145
1146         hammer_ref(&ip->lock);
1147         if (hammer_isactive(&ip->lock) == 2 && ip->vp)
1148                 vclean_unlocked(ip->vp);
1149         if (hammer_isactive(&ip->lock) == 1 && ip->vp == NULL)
1150                 res = 0;
1151         else
1152                 res = -1;       /* stop, someone is using the inode */
1153         hammer_rel_inode(ip, 0);
1154         return(res);
1155 }
1156
1157 int
1158 hammer_unload_pseudofs(hammer_transaction_t trans, u_int32_t localization)
1159 {
1160         int res;
1161         int try;
1162
1163         for (try = res = 0; try < 4; ++try) {
1164                 res = hammer_ino_rb_tree_RB_SCAN(&trans->hmp->rb_inos_root,
1165                                            hammer_inode_pfs_cmp,
1166                                            hammer_unload_pseudofs_callback,
1167                                            &localization);
1168                 if (res == 0 && try > 1)
1169                         break;
1170                 hammer_flusher_sync(trans->hmp);
1171         }
1172         if (res != 0)
1173                 res = ENOTEMPTY;
1174         return(res);
1175 }
1176
1177
1178 /*
1179  * Release a reference on a PFS
1180  */
1181 void
1182 hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm)
1183 {
1184         hammer_rel(&pfsm->lock);
1185         if (hammer_norefs(&pfsm->lock)) {
1186                 RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm);
1187                 kfree(pfsm, hmp->m_misc);
1188         }
1189 }
1190
1191 /*
1192  * Called by hammer_sync_inode().
1193  */
1194 static int
1195 hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
1196 {
1197         hammer_transaction_t trans = cursor->trans;
1198         hammer_record_t record;
1199         int error;
1200         int redirty;
1201
1202 retry:
1203         error = 0;
1204
1205         /*
1206          * If the inode has a presence on-disk then locate it and mark
1207          * it deleted, setting DELONDISK.
1208          *
1209          * The record may or may not be physically deleted, depending on
1210          * the retention policy.
1211          */
1212         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
1213             HAMMER_INODE_ONDISK) {
1214                 hammer_normalize_cursor(cursor);
1215                 cursor->key_beg.localization = ip->obj_localization + 
1216                                                HAMMER_LOCALIZE_INODE;
1217                 cursor->key_beg.obj_id = ip->obj_id;
1218                 cursor->key_beg.key = 0;
1219                 cursor->key_beg.create_tid = 0;
1220                 cursor->key_beg.delete_tid = 0;
1221                 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1222                 cursor->key_beg.obj_type = 0;
1223                 cursor->asof = ip->obj_asof;
1224                 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1225                 cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
1226                 cursor->flags |= HAMMER_CURSOR_BACKEND;
1227
1228                 error = hammer_btree_lookup(cursor);
1229                 if (hammer_debug_inode)
1230                         kprintf("IPDEL %p %08x %d", ip, ip->flags, error);
1231
1232                 if (error == 0) {
1233                         error = hammer_ip_delete_record(cursor, ip, trans->tid);
1234                         if (hammer_debug_inode)
1235                                 kprintf(" error %d\n", error);
1236                         if (error == 0) {
1237                                 ip->flags |= HAMMER_INODE_DELONDISK;
1238                         }
1239                         if (cursor->node)
1240                                 hammer_cache_node(&ip->cache[0], cursor->node);
1241                 }
1242                 if (error == EDEADLK) {
1243                         hammer_done_cursor(cursor);
1244                         error = hammer_init_cursor(trans, cursor,
1245                                                    &ip->cache[0], ip);
1246                         if (hammer_debug_inode)
1247                                 kprintf("IPDED %p %d\n", ip, error);
1248                         if (error == 0)
1249                                 goto retry;
1250                 }
1251         }
1252
1253         /*
1254          * Ok, write out the initial record or a new record (after deleting
1255          * the old one), unless the DELETED flag is set.  This routine will
1256          * clear DELONDISK if it writes out a record.
1257          *
1258          * Update our inode statistics if this is the first application of
1259          * the inode on-disk.
1260          */
1261         if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
1262                 /*
1263                  * Generate a record and write it to the media.  We clean-up
1264                  * the state before releasing so we do not have to set-up
1265                  * a flush_group.
1266                  */
1267                 record = hammer_alloc_mem_record(ip, 0);
1268                 record->type = HAMMER_MEM_RECORD_INODE;
1269                 record->flush_state = HAMMER_FST_FLUSH;
1270                 record->leaf = ip->sync_ino_leaf;
1271                 record->leaf.base.create_tid = trans->tid;
1272                 record->leaf.data_len = sizeof(ip->sync_ino_data);
1273                 record->leaf.create_ts = trans->time32;
1274                 record->data = (void *)&ip->sync_ino_data;
1275                 record->flags |= HAMMER_RECF_INTERLOCK_BE;
1276
1277                 /*
1278                  * If this flag is set we cannot sync the new file size
1279                  * because we haven't finished related truncations.  The
1280                  * inode will be flushed in another flush group to finish
1281                  * the job.
1282                  */
1283                 if ((ip->flags & HAMMER_INODE_WOULDBLOCK) &&
1284                     ip->sync_ino_data.size != ip->ino_data.size) {
1285                         redirty = 1;
1286                         ip->sync_ino_data.size = ip->ino_data.size;
1287                 } else {
1288                         redirty = 0;
1289                 }
1290
1291                 for (;;) {
1292                         error = hammer_ip_sync_record_cursor(cursor, record);
1293                         if (hammer_debug_inode)
1294                                 kprintf("GENREC %p rec %08x %d\n",      
1295                                         ip, record->flags, error);
1296                         if (error != EDEADLK)
1297                                 break;
1298                         hammer_done_cursor(cursor);
1299                         error = hammer_init_cursor(trans, cursor,
1300                                                    &ip->cache[0], ip);
1301                         if (hammer_debug_inode)
1302                                 kprintf("GENREC reinit %d\n", error);
1303                         if (error)
1304                                 break;
1305                 }
1306
1307                 /*
1308                  * Note:  The record was never on the inode's record tree
1309                  * so just wave our hands importantly and destroy it.
1310                  */
1311                 record->flags |= HAMMER_RECF_COMMITTED;
1312                 record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
1313                 record->flush_state = HAMMER_FST_IDLE;
1314                 ++ip->rec_generation;
1315                 hammer_rel_mem_record(record);
1316
1317                 /*
1318                  * Finish up.
1319                  */
1320                 if (error == 0) {
1321                         if (hammer_debug_inode)
1322                                 kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
1323                         ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1324                                             HAMMER_INODE_SDIRTY |
1325                                             HAMMER_INODE_ATIME |
1326                                             HAMMER_INODE_MTIME);
1327                         ip->flags &= ~HAMMER_INODE_DELONDISK;
1328                         if (redirty)
1329                                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
1330
1331                         /*
1332                          * Root volume count of inodes
1333                          */
1334                         hammer_sync_lock_sh(trans);
1335                         if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
1336                                 hammer_modify_volume_field(trans,
1337                                                            trans->rootvol,
1338                                                            vol0_stat_inodes);
1339                                 ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
1340                                 hammer_modify_volume_done(trans->rootvol);
1341                                 ip->flags |= HAMMER_INODE_ONDISK;
1342                                 if (hammer_debug_inode)
1343                                         kprintf("NOWONDISK %p\n", ip);
1344                         }
1345                         hammer_sync_unlock(trans);
1346                 }
1347         }
1348
1349         /*
1350          * If the inode has been destroyed, clean out any left-over flags
1351          * that may have been set by the frontend.
1352          */
1353         if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) { 
1354                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1355                                     HAMMER_INODE_SDIRTY |
1356                                     HAMMER_INODE_ATIME |
1357                                     HAMMER_INODE_MTIME);
1358         }
1359         return(error);
1360 }
1361
1362 /*
1363  * Update only the itimes fields.
1364  *
1365  * ATIME can be updated without generating any UNDO.  MTIME is updated
1366  * with UNDO so it is guaranteed to be synchronized properly in case of
1367  * a crash.
1368  *
1369  * Neither field is included in the B-Tree leaf element's CRC, which is how
1370  * we can get away with updating ATIME the way we do.
1371  */
1372 static int
1373 hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
1374 {
1375         hammer_transaction_t trans = cursor->trans;
1376         int error;
1377
1378 retry:
1379         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) !=
1380             HAMMER_INODE_ONDISK) {
1381                 return(0);
1382         }
1383
1384         hammer_normalize_cursor(cursor);
1385         cursor->key_beg.localization = ip->obj_localization + 
1386                                        HAMMER_LOCALIZE_INODE;
1387         cursor->key_beg.obj_id = ip->obj_id;
1388         cursor->key_beg.key = 0;
1389         cursor->key_beg.create_tid = 0;
1390         cursor->key_beg.delete_tid = 0;
1391         cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1392         cursor->key_beg.obj_type = 0;
1393         cursor->asof = ip->obj_asof;
1394         cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1395         cursor->flags |= HAMMER_CURSOR_ASOF;
1396         cursor->flags |= HAMMER_CURSOR_GET_LEAF;
1397         cursor->flags |= HAMMER_CURSOR_GET_DATA;
1398         cursor->flags |= HAMMER_CURSOR_BACKEND;
1399
1400         error = hammer_btree_lookup(cursor);
1401         if (error == 0) {
1402                 hammer_cache_node(&ip->cache[0], cursor->node);
1403                 if (ip->sync_flags & HAMMER_INODE_MTIME) {
1404                         /*
1405                          * Updating MTIME requires an UNDO.  Just cover
1406                          * both atime and mtime.
1407                          */
1408                         hammer_sync_lock_sh(trans);
1409                         hammer_modify_buffer(trans, cursor->data_buffer,
1410                                      HAMMER_ITIMES_BASE(&cursor->data->inode),
1411                                      HAMMER_ITIMES_BYTES);
1412                         cursor->data->inode.atime = ip->sync_ino_data.atime;
1413                         cursor->data->inode.mtime = ip->sync_ino_data.mtime;
1414                         hammer_modify_buffer_done(cursor->data_buffer);
1415                         hammer_sync_unlock(trans);
1416                 } else if (ip->sync_flags & HAMMER_INODE_ATIME) {
1417                         /*
1418                          * Updating atime only can be done in-place with
1419                          * no UNDO.
1420                          */
1421                         hammer_sync_lock_sh(trans);
1422                         hammer_modify_buffer(trans, cursor->data_buffer,
1423                                              NULL, 0);
1424                         cursor->data->inode.atime = ip->sync_ino_data.atime;
1425                         hammer_modify_buffer_done(cursor->data_buffer);
1426                         hammer_sync_unlock(trans);
1427                 }
1428                 ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME);
1429         }
1430         if (error == EDEADLK) {
1431                 hammer_done_cursor(cursor);
1432                 error = hammer_init_cursor(trans, cursor,
1433                                            &ip->cache[0], ip);
1434                 if (error == 0)
1435                         goto retry;
1436         }
1437         return(error);
1438 }
1439
1440 /*
1441  * Release a reference on an inode, flush as requested.
1442  *
1443  * On the last reference we queue the inode to the flusher for its final
1444  * disposition.
1445  */
1446 void
1447 hammer_rel_inode(struct hammer_inode *ip, int flush)
1448 {
1449         /*hammer_mount_t hmp = ip->hmp;*/
1450
1451         /*
1452          * Handle disposition when dropping the last ref.
1453          */
1454         for (;;) {
1455                 if (hammer_oneref(&ip->lock)) {
1456                         /*
1457                          * Determine whether on-disk action is needed for
1458                          * the inode's final disposition.
1459                          */
1460                         KKASSERT(ip->vp == NULL);
1461                         hammer_inode_unloadable_check(ip, 0);
1462                         if (ip->flags & HAMMER_INODE_MODMASK) {
1463                                 hammer_flush_inode(ip, 0);
1464                         } else if (hammer_oneref(&ip->lock)) {
1465                                 hammer_unload_inode(ip);
1466                                 break;
1467                         }
1468                 } else {
1469                         if (flush)
1470                                 hammer_flush_inode(ip, 0);
1471
1472                         /*
1473                          * The inode still has multiple refs, try to drop
1474                          * one ref.
1475                          */
1476                         KKASSERT(hammer_isactive(&ip->lock) >= 1);
1477                         if (hammer_isactive(&ip->lock) > 1) {
1478                                 hammer_rel(&ip->lock);
1479                                 break;
1480                         }
1481                 }
1482         }
1483 }
1484
1485 /*
1486  * Unload and destroy the specified inode.  Must be called with one remaining
1487  * reference.  The reference is disposed of.
1488  *
1489  * The inode must be completely clean.
1490  */
1491 static int
1492 hammer_unload_inode(struct hammer_inode *ip)
1493 {
1494         hammer_mount_t hmp = ip->hmp;
1495
1496         KASSERT(hammer_oneref(&ip->lock),
1497                 ("hammer_unload_inode: %d refs", hammer_isactive(&ip->lock)));
1498         KKASSERT(ip->vp == NULL);
1499         KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
1500         KKASSERT(ip->cursor_ip_refs == 0);
1501         KKASSERT(hammer_notlocked(&ip->lock));
1502         KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
1503
1504         KKASSERT(RB_EMPTY(&ip->rec_tree));
1505         KKASSERT(TAILQ_EMPTY(&ip->target_list));
1506
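             /*
              * Detach the inode from the per-mount indexes (the redo tree
              * if queued there, and the inode RB tree) before freeing it.
              */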
1507         if (ip->flags & HAMMER_INODE_RDIRTY) {
1508                 RB_REMOVE(hammer_redo_rb_tree, &hmp->rb_redo_root, ip);
1509                 ip->flags &= ~HAMMER_INODE_RDIRTY;
1510         }
1511         RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);
1512
1513         hammer_free_inode(ip);
1514         return(0);
1515 }
1516
1517 /*
1518  * Called during unmounting if a critical error occurred.  The in-memory
1519  * inode and all related structures are destroyed.
1520  *
1521  * If a critical error did not occur the unmount code calls the standard
1522  * release and asserts that the inode is gone.
1523  */
1524 int
1525 hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused)
1526 {
1527         hammer_record_t rec;
1528
1529         /*
1530          * Get rid of the inode's in-memory records, regardless of their
1531          * state, and clear the mod-mask.
1532          */
1533         while ((rec = TAILQ_FIRST(&ip->target_list)) != NULL) {
1534                 TAILQ_REMOVE(&ip->target_list, rec, target_entry);
1535                 rec->target_ip = NULL;
1536                 if (rec->flush_state == HAMMER_FST_SETUP)
1537                         rec->flush_state = HAMMER_FST_IDLE;
1538         }
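             /*
              * Destroy any records still indexed under the inode.  A record
              * in FST_FLUSH already holds a reference, so only the flush
              * group's count is dropped; otherwise take a reference so each
              * record ends up with exactly one ref before being marked
              * deleted and released.
              */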
1539         while ((rec = RB_ROOT(&ip->rec_tree)) != NULL) {
1540                 if (rec->flush_state == HAMMER_FST_FLUSH)
1541                         --rec->flush_group->refs;
1542                 else
1543                         hammer_ref(&rec->lock);
1544                 KKASSERT(hammer_oneref(&rec->lock));
1545                 rec->flush_state = HAMMER_FST_IDLE;
1546                 rec->flush_group = NULL;
1547                 rec->flags |= HAMMER_RECF_DELETED_FE; /* wave hands */
1548                 rec->flags |= HAMMER_RECF_DELETED_BE; /* wave hands */
1549                 ++ip->rec_generation;
1550                 hammer_rel_mem_record(rec);
1551         }
1552         ip->flags &= ~HAMMER_INODE_MODMASK;
1553         ip->sync_flags &= ~HAMMER_INODE_MODMASK;
1554         KKASSERT(ip->vp == NULL);
1555
1556         /*
1557          * Remove the inode from any flush group, force it idle.  FLUSH
1558          * and SETUP states have an inode ref.
1559          */
1560         switch(ip->flush_state) {
1561         case HAMMER_FST_FLUSH:
1562                 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
1563                 --ip->flush_group->refs;
1564                 ip->flush_group = NULL;
1565                 /* fall through */
1566         case HAMMER_FST_SETUP:
1567                 hammer_rel(&ip->lock);
1568                 ip->flush_state = HAMMER_FST_IDLE;
1569                 /* fall through */
1570         case HAMMER_FST_IDLE:
1571                 break;
1572         }
1573
1574         /*
1575          * There shouldn't be any associated vnode.  The unload needs at
1576          * least one ref; if we do have a vp, steal its ip ref.
1577          */
1578         if (ip->vp) {
1579                 kprintf("hammer_destroy_inode_callback: Unexpected "
1580                         "vnode association ip %p vp %p\n", ip, ip->vp);
1581                 ip->vp->v_data = NULL;
1582                 ip->vp = NULL;
1583         } else {
1584                 hammer_ref(&ip->lock);
1585         }
1586         hammer_unload_inode(ip);
1587         return(0);
1588 }
1589
1590 /*
1591  * Called on mount -u when switching from RW to RO or vice-versa.  Adjust
1592  * the read-only flag for cached inodes.
1593  *
1594  * This routine is called from a RB_SCAN().
1595  */
1596 int
1597 hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
1598 {
1599         hammer_mount_t hmp = ip->hmp;
1600
1601         if (hmp->ronly || hmp->asof != HAMMER_MAX_TID)
1602                 ip->flags |= HAMMER_INODE_RO;
1603         else
1604                 ip->flags &= ~HAMMER_INODE_RO;
1605         return(0);
1606 }
1607
1608 /*
1609  * A transaction has modified an inode, requiring updates as specified by
1610  * the passed flags.
1611  *
1612  * HAMMER_INODE_DDIRTY: Inode data has been updated, not including mtime/atime,
1613  *                      and not including size changes due to write-append
1614  *                      (but other size changes are included).
1615  * HAMMER_INODE_SDIRTY: Inode data has been updated, size changes due to
1616  *                      write-append.
1617  * HAMMER_INODE_XDIRTY: Dirty in-memory records
1618  * HAMMER_INODE_BUFS:   Dirty buffer cache buffers
1619  * HAMMER_INODE_DELETED: Inode record/data must be deleted
1620  * HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated
1621  */
1622 void
1623 hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags)
1624 {
1625         /*
1626          * A ronly value of 0 or 2 does not trigger the assertion;
1627          * 2 is a special error state.
1628          */
1629         KKASSERT(ip->hmp->ronly != 1 ||
1630                   (flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY | 
1631                             HAMMER_INODE_SDIRTY |
1632                             HAMMER_INODE_BUFS | HAMMER_INODE_DELETED |
1633                             HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0);
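             /*
              * The first time an inode becomes dirty it is accounted for in
              * the mount's reserved-inode count (hmp->rsv_inodes).
              */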
1634         if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
1635                 ip->flags |= HAMMER_INODE_RSV_INODES;
1636                 ++ip->hmp->rsv_inodes;
1637         }
1638
1639         /*
1640          * Set the NEWINODE flag in the transaction if the inode
1641          * transitions to a dirty state.  This is used to track
1642          * the load on the inode cache.
1643          */
1644         if (trans &&
1645             (ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1646             (flags & HAMMER_INODE_MODMASK)) {
1647                 trans->flags |= HAMMER_TRANSF_NEWINODE;
1648         }
1649         if (flags & HAMMER_INODE_MODMASK)
1650                 hammer_inode_dirty(ip);
1651         ip->flags |= flags;
1652 }
1653
1654 /*
1655  * Attempt to quickly update the atime for a hammer inode.  Return 0 on
1656  * success, -1 on failure.
1657  *
1658  * We attempt to update the atime with only the ip lock and not the
1659  * whole filesystem lock in order to improve concurrency.  We can only
1660  * do this safely if the ATIME flag is already pending on the inode.
1661  *
1662  * This function is called via a vnops path (ip pointer is stable) without
1663  * fs_token held.
1664  */
1665 int
1666 hammer_update_atime_quick(hammer_inode_t ip)
1667 {
1668         struct timeval tv;
1669         int res = -1;
1670
1671         if ((ip->flags & HAMMER_INODE_RO) ||
1672             (ip->hmp->mp->mnt_flag & MNT_NOATIME)) {
1673                 /*
1674                  * Silently indicate success on read-only mount/snap
1675                  */
1676                 res = 0;
1677         } else if (ip->flags & HAMMER_INODE_ATIME) {
1678                 /*
1679                  * Double check with inode lock held against backend.  This
1680                  * is only safe if all we need to do is update
1681                  * ino_data.atime.
1682                  */
1683                 getmicrotime(&tv);
1684                 hammer_lock_ex(&ip->lock);
1685                 if (ip->flags & HAMMER_INODE_ATIME) {
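                             /* ino_data stores atime in microseconds */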
1686                         ip->ino_data.atime =
1687                             (unsigned long)tv.tv_sec * 1000000ULL + tv.tv_usec;
1688                         res = 0;
1689                 }
1690                 hammer_unlock(&ip->lock);
1691         }
1692         return res;
1693 }
1694
1695 /*
1696  * Request that an inode be flushed.  This whole mess cannot block and may
1697  * recurse (if not synchronous).  Once requested HAMMER will attempt to
1698  * actively flush the inode until the flush can be done.
1699  *
1700  * The inode may already be flushing, or may be in a setup state.  We can
1701  * place the inode in a flushing state if it is currently idle and flag it
1702  * to reflush if it is currently flushing.
1703  *
1704  * If upon return the inode could not be flushed due to a setup
1705  * dependency, it will be automatically flushed when the dependency
1706  * is satisfied.
1707  */
1708 void
1709 hammer_flush_inode(hammer_inode_t ip, int flags)
1710 {
1711         hammer_mount_t hmp;
1712         hammer_flush_group_t flg;
1713         int good;
1714
1715         /*
1716          * fill_flush_group is the first flush group we may be able to
1717          * continue filling; it may be open or closed, but it will always
1718          * be past the currently flushing (running) flg.
1719          *
1720          * next_flush_group is the next open flush group.
1721          */
1722         hmp = ip->hmp;
1723         while ((flg = hmp->fill_flush_group) != NULL) {
1724                 KKASSERT(flg->running == 0);
1725                 if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit &&
1726                     flg->total_count <= hammer_autoflush) {
1727                         break;
1728                 }
1729                 hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
1730                 hammer_flusher_async(ip->hmp, flg);
1731         }
1732         if (flg == NULL) {
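                     /*
                      * No flush group is available for filling; allocate a
                      * new one and append it to the mount's flush group list.
                      */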
1733                 flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
1734                 flg->seq = hmp->flusher.next++;
1735                 if (hmp->next_flush_group == NULL)
1736                         hmp->next_flush_group = flg;
1737                 if (hmp->fill_flush_group == NULL)
1738                         hmp->fill_flush_group = flg;
1739                 RB_INIT(&flg->flush_tree);
1740                 TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
1741         }
1742
1743         /*
1744          * Trivial 'nothing to flush' case.  If the inode is in a SETUP
1745          * state we have to put it back into an IDLE state so we can
1746          * drop the extra ref.
1747          *
1748          * If we have a parent dependency we must still fall through
1749          * so we can run it.
1750          */
1751         if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
1752                 if (ip->flush_state == HAMMER_FST_SETUP &&
1753                     TAILQ_EMPTY(&ip->target_list)) {
1754                         ip->flush_state = HAMMER_FST_IDLE;
1755                         hammer_rel_inode(ip, 0);
1756                 }
1757                 if (ip->flush_state == HAMMER_FST_IDLE)
1758                         return;
1759         }
1760
1761         /*
1762          * Our flush action will depend on the current state.
1763          */
1764         switch(ip->flush_state) {
1765         case HAMMER_FST_IDLE:
1766                 /*
1767                  * We have no dependencies and can flush immediately.  Some of
1768                  * our children may not be flushable, so we have to re-test
1769                  * with that additional knowledge.
1770                  */
1771                 hammer_flush_inode_core(ip, flg, flags);
1772                 break;
1773         case HAMMER_FST_SETUP:
1774                 /*
1775                  * Recurse upwards through dependencies via target_list
1776                  * and start their flusher actions going if possible.
1777                  *
1778                  * 'good' is our connectivity.  -1 means we have none and
1779                  * can't flush, 0 means there weren't any dependencies, and
1780                  * 1 means we have good connectivity.
1781                  */
1782                 good = hammer_setup_parent_inodes(ip, 0, flg);
1783
1784                 if (good >= 0) {
1785                         /*
1786                          * We can continue if good >= 0.  Determine how 
1787                          * many records under our inode can be flushed (and
1788                          * mark them).
1789                          */
1790                         hammer_flush_inode_core(ip, flg, flags);
1791                 } else {
1792                         /*
1793                          * Parent has no connectivity; tell it to flush
1794                          * us as soon as it does.
1795                          *
1796                          * The REFLUSH flag is also needed to trigger
1797                          * dependency wakeups.
1798                          */
1799                         ip->flags |= HAMMER_INODE_CONN_DOWN |
1800                                      HAMMER_INODE_REFLUSH;
1801                         if (flags & HAMMER_FLUSH_SIGNAL) {
1802                                 ip->flags |= HAMMER_INODE_RESIGNAL;
1803                                 hammer_flusher_async(ip->hmp, flg);
1804                         }
1805                 }
1806                 break;
1807         case HAMMER_FST_FLUSH:
1808                 /*
1809                  * We are already flushing, flag the inode to reflush
1810                  * if needed after it completes its current flush.
1811                  *
1812                  * The REFLUSH flag is also needed to trigger
1813                  * dependency wakeups.
1814                  */
1815                 if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
1816                         ip->flags |= HAMMER_INODE_REFLUSH;
1817                 if (flags & HAMMER_FLUSH_SIGNAL) {
1818                         ip->flags |= HAMMER_INODE_RESIGNAL;
1819                         hammer_flusher_async(ip->hmp, flg);
1820                 }
1821                 break;
1822         }
1823 }
1824
1825 /*
1826  * Scan ip->target_list, which is a list of records owned by PARENTS of our
1827  * ip which reference our ip.
1828  *
1829  * XXX This is a huge mess of recursive code, but not one bit of it blocks
1830  *     so for now do not ref/deref the structures.  Note that if we use the
1831  *     ref/rel code later, the rel CAN block.
1832  */
1833 static int
1834 hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
1835                            hammer_flush_group_t flg)
1836 {
1837         hammer_record_t depend;
1838         int good;
1839         int r;
1840
1841         /*
1842          * If we hit our recursion limit and we have parent dependencies,
1843          * we cannot continue.  Returning < 0 will cause us to be flagged
1844          * for reflush.  Returning -2 cuts off additional dependency checks
1845          * because they are likely to also hit the depth limit.
1846          *
1847          * We cannot return < 0 if there are no dependencies or there might
1848          * not be anything to wakeup (ip).
1849          */
1850         if (depth == 20 && TAILQ_FIRST(&ip->target_list)) {
1851                 krateprintf(&hammer_gen_krate,
1852                             "HAMMER Warning: depth limit reached on "
1853                             "setup recursion, inode %p %016llx\n",
1854                             ip, (long long)ip->obj_id);
1855                 return(-2);
1856         }
1857
1858         /*
1859          * Scan dependencies
1860          */
1861         good = 0;
1862         TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
1863                 r = hammer_setup_parent_inodes_helper(depend, depth, flg);
1864                 KKASSERT(depend->target_ip == ip);
1865                 if (r < 0 && good == 0)
1866                         good = -1;
1867                 if (r > 0)
1868                         good = 1;
1869
1870                 /*
1871                  * If we failed due to the recursion depth limit then stop
1872                  * now.
1873                  */
1874                 if (r == -2)
1875                         break;
1876         }
1877         return(good);
1878 }
1879
1880 /*
1881  * This helper function takes a record representing the dependency between
1882  * the parent inode and child inode.
1883  *
1884  * record->ip           = parent inode
1885  * record->target_ip    = child inode
1886  * 
1887  * We are asked to recurse upwards and convert the record from SETUP
1888  * to FLUSH if possible.
1889  *
1890  * Return 1 if the record gives us connectivity
1891  *
1892  * Return 0 if the record is not relevant 
1893  *
1894  * Return -1 if we can't resolve the dependency and there is no connectivity.
1895  */
1896 static int
1897 hammer_setup_parent_inodes_helper(hammer_record_t record, int depth,
1898                                   hammer_flush_group_t flg)
1899 {
1900         hammer_inode_t pip;
1901         int good;
1902
1903         KKASSERT(record->flush_state != HAMMER_FST_IDLE);
1904         pip = record->ip;
1905
1906         /*
1907          * If the record is already flushing, is it in our flush group?
1908          *
1909          * If it is in our flush group but it is a general record or a 
1910          * delete-on-disk, it does not improve our connectivity (return 0),
1911          * and if the target inode is not trying to destroy itself we can't
1912          * allow the operation yet anyway (the second return -1).
1913          */
1914         if (record->flush_state == HAMMER_FST_FLUSH) {
1915                 /*
1916                  * If not in our flush group ask the parent to reflush
1917                  * us as soon as possible.
1918                  */
1919                 if (record->flush_group != flg) {
1920                         pip->flags |= HAMMER_INODE_REFLUSH;
1921                         record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1922                         return(-1);
1923                 }
1924
1925                 /*
1926                  * If in our flush group everything is already set up,
1927                  * just return whether the record will improve our
1928                  * visibility or not.
1929                  */
1930                 if (record->type == HAMMER_MEM_RECORD_ADD)
1931                         return(1);
1932                 return(0);
1933         }
1934
1935         /*
1936          * It must be a setup record.  Try to resolve the setup dependencies
1937          * by recursing upwards so we can place ip on the flush list.
1938          *
1939          * Limit ourselves to 20 levels of recursion to avoid blowing out
1940          * the kernel stack.  If we hit the recursion limit we can't flush
1941          * until the parent flushes.  The parent will flush independently
1942          * on its own and ultimately a deep recursion will be resolved.
1943          */
1944         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1945
1946         good = hammer_setup_parent_inodes(pip, depth + 1, flg);
1947
1948         /*
1949          * If good < 0 the parent has no connectivity and we cannot safely
1950          * flush the directory entry, which also means we can't flush our
1951          * ip.  Flag us for downward recursion once the parent's
1952          * connectivity is resolved.  Flag the parent for [re]flush or it
1953          * may not check for downward recursions.
1954          */
1955         if (good < 0) {
1956                 pip->flags |= HAMMER_INODE_REFLUSH;
1957                 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1958                 return(good);
1959         }
1960
1961         /*
1962          * We are go, place the parent inode in a flushing state so we can
1963          * place its record in a flushing state.  Note that the parent
1964          * may already be flushing.  The record must be in the same flush
1965          * group as the parent.
1966          */
1967         if (pip->flush_state != HAMMER_FST_FLUSH)
1968                 hammer_flush_inode_core(pip, flg, HAMMER_FLUSH_RECURSION);
1969         KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
1970
1971         /*
1972          * It is possible for a rename to create a loop in the recursion
1973          * and revisit a record.  This will result in the record being
1974          * placed in a flush state unexpectedly.  This check deals with
1975          * the case.
1976          */
1977         if (record->flush_state == HAMMER_FST_FLUSH) {
1978                 if (record->type == HAMMER_MEM_RECORD_ADD)
1979                         return(1);
1980                 return(0);
1981         }
1982
1983         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1984
1985 #if 0
1986         if (record->type == HAMMER_MEM_RECORD_DEL &&
1987             (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
1988                 /*
1989                  * Regardless of flushing state we cannot sync this path if the
1990                  * record represents a delete-on-disk but the target inode
1991                  * is not ready to sync its own deletion.
1992                  *
1993                  * XXX need to count effective nlinks to determine whether
1994                  * the flush is ok, otherwise removing a hardlink will
1995                  * just leave the DEL record to rot.
1996                  */
1997                 record->target_ip->flags |= HAMMER_INODE_REFLUSH;
1998                 return(-1);
1999         } else
2000 #endif
2001         if (pip->flush_group == flg) {
2002                 /*
2003                  * Because we have not calculated nlinks yet we can just
2004                  * set records to the flush state if the parent is in
2005                  * the same flush group as we are.
2006                  */
2007                 record->flush_state = HAMMER_FST_FLUSH;
2008                 record->flush_group = flg;
2009                 ++record->flush_group->refs;
2010                 hammer_ref(&record->lock);
2011
2012                 /*
2013                  * A general directory-add contributes to our visibility.
2014                  *
2015                  * Otherwise it is probably a directory-delete or 
2016                  * delete-on-disk record and does not contribute to our
2017                  * visibility (but we can still flush it).
2018                  */
2019                 if (record->type == HAMMER_MEM_RECORD_ADD)
2020                         return(1);
2021                 return(0);
2022         } else {
2023                 /*
2024                  * If the parent is not in our flush group we cannot
2025                  * flush this record yet, there is no visibility.
2026                  * We tell the parent to reflush and mark ourselves
2027                  * so the parent knows it should flush us too.
2028                  */
2029                 pip->flags |= HAMMER_INODE_REFLUSH;
2030                 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
2031                 return(-1);
2032         }
2033 }
2034
2035 /*
2036  * This is the core routine placing an inode into the FST_FLUSH state.
2037  */
2038 static void
2039 hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
2040 {
2041         hammer_mount_t hmp = ip->hmp;
2042         int go_count;
2043
2044         /*
2045          * Set flush state and prevent the flusher from cycling into
2046          * the next flush group.  Do not place the ip on the list yet.
2047          * An inode coming out of the idle state gets an extra reference.
2048          */
2049         KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
2050         if (ip->flush_state == HAMMER_FST_IDLE)
2051                 hammer_ref(&ip->lock);
2052         ip->flush_state = HAMMER_FST_FLUSH;
2053         ip->flush_group = flg;
2054         ++hmp->flusher.group_lock;
2055         ++hmp->count_iqueued;
2056         ++hammer_count_iqueued;
2057         ++flg->total_count;
2058         hammer_redo_fifo_start_flush(ip);
2059
2060 #if 0
2061         /*
2062          * We need to be able to vfsync/truncate from the backend.
2063          *
2064          * XXX Any truncation from the backend will acquire the vnode
2065          *     independently.
2066          */
2067         KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
2068         if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
2069                 ip->flags |= HAMMER_INODE_VHELD;
2070                 vref(ip->vp);
2071         }
2072 #endif
2073
2074         /*
2075          * Figure out how many in-memory records we can actually flush
2076          * (not including inode meta-data, buffers, etc).
2077          */
2078         KKASSERT((ip->flags & HAMMER_INODE_WOULDBLOCK) == 0);
2079         if (flags & HAMMER_FLUSH_RECURSION) {
2080                 /*
2081                  * If this is an upwards recursion we do not want to
2082                  * recurse down again!
2083                  */
2084                 go_count = 1;
2085 #if 0
2086         } else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
2087                 /*
2088                  * No new records are added if we must complete a flush
2089                  * from a previous cycle, but we do have to move the records
2090                  * from the previous cycle to the current one.
2091                  */
2092 #if 0
2093                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2094                                    hammer_syncgrp_child_callback, NULL);
2095 #endif
2096                 go_count = 1;
2097 #endif
2098         } else {
2099                 /*
2100                  * Normal flush: scan records and bring them into the flush.
2101                  * Directory adds and deletes are usually skipped (they are
2102                  * grouped with the related inode rather than with the
2103                  * directory).
2104                  *
2105                  * go_count can be negative, which means the scan aborted
2106                  * due to the flush group being over-full and we should
2107                  * flush what we have.
2108                  */
2109                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2110                                    hammer_setup_child_callback, NULL);
2111         }
2112
2113         /*
2114          * This is a more involved test that includes go_count.  If we
2115          * can't flush, flag the inode and return.  If go_count is 0 we
2116          * are unable to flush any records in our rec_tree and
2117          * must ignore the XDIRTY flag.
2118          */
2119         if (go_count == 0) {
2120                 if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
2121                         --hmp->count_iqueued;
2122                         --hammer_count_iqueued;
2123
2124                         --flg->total_count;
2125                         ip->flush_state = HAMMER_FST_SETUP;
2126                         ip->flush_group = NULL;
2127                         if (flags & HAMMER_FLUSH_SIGNAL) {
2128                                 ip->flags |= HAMMER_INODE_REFLUSH |
2129                                              HAMMER_INODE_RESIGNAL;
2130                         } else {
2131                                 ip->flags |= HAMMER_INODE_REFLUSH;
2132                         }
2133 #if 0
2134                         if (ip->flags & HAMMER_INODE_VHELD) {
2135                                 ip->flags &= ~HAMMER_INODE_VHELD;
2136                                 vrele(ip->vp);
2137                         }
2138 #endif
2139
2140                         /*
2141                          * REFLUSH is needed to trigger dependency wakeups
2142                          * when an inode is in SETUP.
2143                          */
2144                         ip->flags |= HAMMER_INODE_REFLUSH;
2145                         if (--hmp->flusher.group_lock == 0)
2146                                 wakeup(&hmp->flusher.group_lock);
2147                         return;
2148                 }
2149         }
2150
2151         /*
2152          * Snapshot the state of the inode for the backend flusher.
2153          *
2154          * We continue to retain save_trunc_off even when all truncations
2155          * have been resolved as an optimization to determine if we can
2156          * skip the B-Tree lookup for overwrite deletions.
2157          *
2158          * NOTE: The DELETING flag is a mod flag, but it is also sticky,
2159          * and stays in ip->flags.  Once set, it stays set until the
2160          * inode is destroyed.
2161          */
2162         if (ip->flags & HAMMER_INODE_TRUNCATED) {
2163                 KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0);
2164                 ip->sync_trunc_off = ip->trunc_off;
2165                 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
2166                 ip->flags &= ~HAMMER_INODE_TRUNCATED;
2167                 ip->sync_flags |= HAMMER_INODE_TRUNCATED;
2168
2169                 /*
2170                  * The save_trunc_off used to cache whether the B-Tree
2171                  * holds any records past that point is not used until
2172                  * after the truncation has succeeded, so we can safely
2173                  * set it now.
2174                  */
2175                 if (ip->save_trunc_off > ip->sync_trunc_off)
2176                         ip->save_trunc_off = ip->sync_trunc_off;
2177         }
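             /*
              * Transfer the remaining dirty flags and a copy of the inode
              * meta-data to the sync_ fields for the backend, then clear
              * the frontend's mod flags (TRUNCATED was handled above).
              */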
2178         ip->sync_flags |= (ip->flags & HAMMER_INODE_MODMASK &
2179                            ~HAMMER_INODE_TRUNCATED);
2180         ip->sync_ino_leaf = ip->ino_leaf;
2181         ip->sync_ino_data = ip->ino_data;
2182         ip->flags &= ~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED;
2183 #ifdef DEBUG_TRUNCATE
2184         if ((ip->sync_flags & HAMMER_INODE_TRUNCATED) && ip == HammerTruncIp)
2185                 kprintf("truncateS %016llx\n", ip->sync_trunc_off);
2186 #endif
2187
2188         /*
2189          * The flusher list inherits our inode and reference.
2190          */
2191         KKASSERT(flg->running == 0);
2192         RB_INSERT(hammer_fls_rb_tree, &flg->flush_tree, ip);
2193         if (--hmp->flusher.group_lock == 0)
2194                 wakeup(&hmp->flusher.group_lock);
2195
2196         /*
2197          * Auto-flush the group if it grows too large.  Make sure the
2198          * inode reclaim wait pipeline continues to work.
2199          */
2200         if (flg->total_count >= hammer_autoflush ||
2201             flg->total_count >= hammer_limit_reclaims / 4) {
2202                 if (hmp->fill_flush_group == flg)
2203                         hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
2204                 hammer_flusher_async(hmp, flg);
2205         }
2206 }
2207
2208 /*
2209  * Callback for scan of ip->rec_tree.  Try to include each record in our
2210  * flush.  ip->flush_group has been set but the inode has not yet been
2211  * moved into a flushing state.
2212  *
2213  * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
2214  * both inodes.
2215  *
2216  * We return 1 for any record placed or found in FST_FLUSH, which prevents
2217  * the caller from shortcutting the flush.
2218  */
2219 static int
2220 hammer_setup_child_callback(hammer_record_t rec, void *data)
2221 {
2222         hammer_flush_group_t flg;
2223         hammer_inode_t target_ip;
2224         hammer_inode_t ip;
2225         int r;
2226
2227         /*
2228          * Records deleted or committed by the backend are ignored.
2229          * Note that the flush detects deleted frontend records at
2230          * multiple points to deal with races.  This is just the first
2231          * line of defense.  The only time HAMMER_RECF_DELETED_FE cannot
2232          * be set is when HAMMER_RECF_INTERLOCK_BE is set, because it
2233          * messes up link-count calculations.
2234          *
2235          * NOTE: Don't get confused between record deletion and, say,
2236          * directory entry deletion.  The deletion of a directory entry
2237          * which is on-media has nothing to do with the record deletion
2238          * flags.
2239          */
2240         if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE |
2241                           HAMMER_RECF_COMMITTED)) {
2242                 if (rec->flush_state == HAMMER_FST_FLUSH) {
2243                         KKASSERT(rec->flush_group == rec->ip->flush_group);
2244                         r = 1;
2245                 } else {
2246                         r = 0;
2247                 }
2248                 return(r);
2249         }
2250
2251         /*
2252          * If the record is in an idle state it has no dependencies and
2253          * can be flushed.
2254          */
2255         ip = rec->ip;
2256         flg = ip->flush_group;
2257         r = 0;
2258
2259         switch(rec->flush_state) {
2260         case HAMMER_FST_IDLE:
2261                 /*
2262                  * The record has no setup dependency; we can flush it.
2263                  */
2264                 KKASSERT(rec->target_ip == NULL);
2265                 rec->flush_state = HAMMER_FST_FLUSH;
2266                 rec->flush_group = flg;
2267                 ++flg->refs;
2268                 hammer_ref(&rec->lock);
2269                 r = 1;
2270                 break;
2271         case HAMMER_FST_SETUP:
2272                 /*
2273                  * The record has a setup dependency.  These are typically
2274                  * directory entry adds and deletes.  Such entries will be
2275                  * flushed when their inodes are flushed so we do not
2276                  * usually have to add them to the flush here.  However,
2277                  * if the target_ip has set HAMMER_INODE_CONN_DOWN then
2278                  * it is asking us to flush this record (and it).
2279                  */
2280                 target_ip = rec->target_ip;
2281                 KKASSERT(target_ip != NULL);
2282                 KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
2283
2284                 /*
2285                  * If the target IP is already flushing in our group
2286                  * we could associate the record, but target_ip has
2287                  * already synced ino_data to sync_ino_data and we
2288                  * would also have to adjust nlinks.   Plus there are
2289                  * ordering issues for adds and deletes.
2290                  *
2291                  * Reflush downward if this is an ADD, and upward if
2292                  * this is a DEL.
2293                  */
2294                 if (target_ip->flush_state == HAMMER_FST_FLUSH) {
2295                         if (rec->type == HAMMER_MEM_RECORD_ADD)
2296                                 ip->flags |= HAMMER_INODE_REFLUSH;
2297                         else
2298                                 target_ip->flags |= HAMMER_INODE_REFLUSH;
2299                         break;
2300                 } 
2301
2302                 /*
2303                  * Target IP is not yet flushing.  This can get complex
2304                  * because we have to be careful about the recursion.
2305                  *
2306                  * Directories create an issue for us in that if a flush
2307                  * of a directory is requested the expectation is to flush
2308                  * any pending directory entries, but this will cause the
2309                  * related inodes to recursively flush as well.  We can't
2310                  * really defer the operation, so just pull in as many
2311                  * records as we can.
2312                  */
2313 #if 0
2314                 if ((target_ip->flags & HAMMER_INODE_RECLAIM) == 0 &&
2315                     (target_ip->flags & HAMMER_INODE_CONN_DOWN) == 0) {
2316                         /*
2317                          * We aren't reclaiming and the target ip was not
2318                          * previously prevented from flushing due to this
2319                          * record dependency.  Do not flush this record.
2320                          */
2321                         /*r = 0;*/
2322                 } else
2323 #endif
2324                 if (flg->total_count + flg->refs >
2325                            ip->hmp->undo_rec_limit) {
2326                         /*
2327                          * Our flush group is over-full and we risk blowing
2328                          * out the UNDO FIFO.  Stop the scan, flush what we
2329                          * have, then reflush the directory.
2330                          *
2331                          * The directory may be forced through multiple
2332                          * flush groups before it can be completely
2333                          * flushed.
2334                          */
2335                         ip->flags |= HAMMER_INODE_RESIGNAL |
2336                                      HAMMER_INODE_REFLUSH;
2337                         r = -1;
2338                 } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
2339                         /*
2340                          * If the target IP is not flushing we can force
2341                          * it to flush, even if it is unable to write out
2342                          * any of its own records we have at least one in
2343                          * hand that we CAN deal with.
2344                          */
2345                         rec->flush_state = HAMMER_FST_FLUSH;
2346                         rec->flush_group = flg;
2347                         ++flg->refs;
2348                         hammer_ref(&rec->lock);
2349                         hammer_flush_inode_core(target_ip, flg,
2350                                                 HAMMER_FLUSH_RECURSION);
2351                         r = 1;
2352                 } else {
2353                         /*
2354                          * General or delete-on-disk record.
2355                          *
2356                          * XXX this needs help.  If a delete-on-disk we could
2357                          * disconnect the target.  If the target has its own
2358                          * dependencies they really need to be flushed.
2359                          *
2360                          * XXX
2361                          */
2362                         rec->flush_state = HAMMER_FST_FLUSH;
2363                         rec->flush_group = flg;
2364                         ++flg->refs;
2365                         hammer_ref(&rec->lock);
2366                         hammer_flush_inode_core(target_ip, flg,
2367                                                 HAMMER_FLUSH_RECURSION);
2368                         r = 1;
2369                 }
2370                 break;
2371         case HAMMER_FST_FLUSH:
2372                 /* 
2373                  * The record could be part of a previous flush group if the
2374                  * inode is a directory (the record being a directory entry).
2375                  * Once the flush group was closed a hammer_test_inode()
2376                  * call can cause a new flush group to be set up, placing
2377                  * the directory inode itself in a new flush group.
2378                  *
2379                  * When associated with a previous flush group we count it
2380                  * as if it were in our current flush group, since it will
2381                  * effectively be flushed by the time we flush our current
2382                  * flush group.
2383                  */
2384                 KKASSERT(
2385                     rec->ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY ||
2386                     rec->flush_group == flg);
2387                 r = 1;
2388                 break;
2389         }
2390         return(r);
2391 }
2392
2393 #if 0
2394 /*
2395  * This version just moves records already in a flush state to the new
2396  * flush group and that is it.
2397  */
2398 static int
2399 hammer_syncgrp_child_callback(hammer_record_t rec, void *data)
2400 {
2401         hammer_inode_t ip = rec->ip;
2402
2403         switch(rec->flush_state) {
2404         case HAMMER_FST_FLUSH:
2405                 KKASSERT(rec->flush_group == ip->flush_group);
2406                 break;
2407         default:
2408                 break;
2409         }
2410         return(0);
2411 }
2412 #endif
2413
2414 /*
2415  * Wait for a previously queued flush to complete.
2416  *
2417  * If a critical error occurred we don't try to wait.
2418  */
2419 void
2420 hammer_wait_inode(hammer_inode_t ip)
2421 {
2422         /*
2423          * The inode can be in a SETUP state in which case RESIGNAL
2424          * should be set.  If RESIGNAL is not set then the previous
2425          * flush completed and a later operation placed the inode
2426          * in a passive setup state again, so we're done.
2427          *
2428          * The inode can be in a FLUSH state in which case we
2429          * can just wait for completion.
2430          */
2431         while (ip->flush_state == HAMMER_FST_FLUSH ||
2432             (ip->flush_state == HAMMER_FST_SETUP &&
2433              (ip->flags & HAMMER_INODE_RESIGNAL))) {
2434                 /*
2435                  * Don't try to flush on a critical error
2436                  */
2437                 if (ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
2438                         break;
2439
2440                 /*
2441                  * If the inode was already being flushed its flg
2442                  * may not have been queued to the backend.  We
2443                  * have to make sure it gets queued or we can wind
2444                  * up blocked or deadlocked (particularly if we are
2445                  * the vnlru thread).
2446                  */
2447                 if (ip->flush_state == HAMMER_FST_FLUSH) {
2448                         KKASSERT(ip->flush_group);
2449                         if (ip->flush_group->closed == 0) {
2450                                 if (hammer_debug_inode) {
2451                                         kprintf("hammer: debug: forcing "
2452                                                 "async flush ip %016jx\n",
2453                                                 (intmax_t)ip->obj_id);
2454                                 }
2455                                 hammer_flusher_async(ip->hmp,
2456                                                      ip->flush_group);
2457                                 continue; /* retest */
2458                         }
2459                 }
2460
2461                 /*
2462                  * In a flush state with the flg queued to the backend
2463                  * or in a setup state with RESIGNAL set, we can safely
2464                  * wait.
2465                  */
2466                 ip->flags |= HAMMER_INODE_FLUSHW;
2467                 tsleep(&ip->flags, 0, "hmrwin", 0);
2468         }
2469
2470 #if 0
2471         /*
2472          * The inode may have been in a passive setup state,
2473          * call flush to make sure we get signaled.
2474          */
2475         if (ip->flush_state == HAMMER_FST_SETUP)
2476                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2477 #endif
2478
2479 }
2480
2481 /*
2482  * Called by the backend code when a flush has been completed.
2483  * The inode has already been removed from the flush list.
2484  *
2485  * A pipelined flush can occur, in which case we must re-enter the
2486  * inode on the list and re-copy its fields.
2487  */
2488 void
2489 hammer_flush_inode_done(hammer_inode_t ip, int error)
2490 {
2491         hammer_mount_t hmp;
2492         int dorel;
2493
2494         KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
2495
2496         hmp = ip->hmp;
2497
2498         /*
2499          * Auto-reflush if the backend could not completely flush
2500          * the inode.  This fixes a case where a deferred buffer flush
2501          * could cause fsync to return early.
2502          */
2503         if (ip->sync_flags & HAMMER_INODE_MODMASK)
2504                 ip->flags |= HAMMER_INODE_REFLUSH;
2505
2506         /*
2507          * Merge left-over flags back into the frontend and fix the state.
2508          * Incomplete truncations are retained by the backend.
2509          */
2510         ip->error = error;
2511         ip->flags |= ip->sync_flags & ~HAMMER_INODE_TRUNCATED;
2512         ip->sync_flags &= HAMMER_INODE_TRUNCATED;
2513
2514         /*
2515          * The backend may have adjusted nlinks, so if the adjusted nlinks
2516  * does not match the frontend, set the frontend's DDIRTY flag again.
2517          */
2518         if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks)
2519                 ip->flags |= HAMMER_INODE_DDIRTY;
2520
2521         /*
2522          * Fix up the dirty buffer status.
2523          */
2524         if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) {
2525                 ip->flags |= HAMMER_INODE_BUFS;
2526         }
2527         hammer_redo_fifo_end_flush(ip);
2528
2529         /*
2530          * The XDIRTY flag merged back above must be consistent with
2531          * whether any in-memory records remain in the rec_tree.
2532          */
2533         KKASSERT((RB_EMPTY(&ip->rec_tree) &&
2534                   (ip->flags & HAMMER_INODE_XDIRTY) == 0) ||
2535                  (!RB_EMPTY(&ip->rec_tree) &&
2536                   (ip->flags & HAMMER_INODE_XDIRTY) != 0));
2537
2538         /*
2539          * Do not lose track of inodes which no longer have vnode
2540          * associations, otherwise they may never get flushed again.
2541          *
2542          * The reflush flag can be set superfluously, causing extra pain
2543          * for no reason.  If the inode is no longer modified it no longer
2544          * needs to be flushed.
2545          */
2546         if (ip->flags & HAMMER_INODE_MODMASK) {
2547                 if (ip->vp == NULL)
2548                         ip->flags |= HAMMER_INODE_REFLUSH;
2549         } else {
2550                 ip->flags &= ~HAMMER_INODE_REFLUSH;
2551         }
2552         if (ip->flags & HAMMER_INODE_MODMASK)
2553                 hammer_inode_dirty(ip);
2554
2555         /*
2556          * Adjust the flush state.
2557          */
2558         if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
2559                 /*
2560                  * We were unable to flush out all our records, leave the
2561                  * inode in a flush state and in the current flush group.
2562                  * The flush group will be re-run.
2563                  *
2564                  * This occurs if the UNDO block gets too full or there is
2565                  * too much dirty meta-data and allows the flusher to
2566                  * finalize the UNDO block and then re-flush.
2567                  */
2568                 ip->flags &= ~HAMMER_INODE_WOULDBLOCK;
2569                 dorel = 0;
2570         } else {
2571                 /*
2572                  * Remove from the flush_group
2573                  */
2574                 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
2575                 ip->flush_group = NULL;
2576
2577 #if 0
2578                 /*
2579                  * Clean up the vnode ref and tracking counts.
2580                  */
2581                 if (ip->flags & HAMMER_INODE_VHELD) {
2582                         ip->flags &= ~HAMMER_INODE_VHELD;
2583                         vrele(ip->vp);
2584                 }
2585 #endif
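                     /* The inode is no longer queued to the flusher */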
2586                 --hmp->count_iqueued;
2587                 --hammer_count_iqueued;
2588
2589                 /*
2590                  * And adjust the state.
2591                  */
2592                 if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
2593                         ip->flush_state = HAMMER_FST_IDLE;
2594                         dorel = 1;
2595                 } else {
2596                         ip->flush_state = HAMMER_FST_SETUP;
2597                         dorel = 0;
2598                 }
2599
2600                 /*
2601                  * If the frontend is waiting for a flush to complete,
2602                  * wake it up.
2603                  */
2604                 if (ip->flags & HAMMER_INODE_FLUSHW) {
2605                         ip->flags &= ~HAMMER_INODE_FLUSHW;
2606                         wakeup(&ip->flags);
2607                 }
2608
2609                 /*
2610                  * If the frontend made more changes and requested another
2611                  * flush, then try to get it running.
2612                  *
2613                  * Reflushes are aborted when the inode is errored out.
2614                  */
2615                 if (ip->flags & HAMMER_INODE_REFLUSH) {
2616                         ip->flags &= ~HAMMER_INODE_REFLUSH;
2617                         if (ip->flags & HAMMER_INODE_RESIGNAL) {
2618                                 ip->flags &= ~HAMMER_INODE_RESIGNAL;
2619                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2620                         } else {
2621                                 hammer_flush_inode(ip, 0);
2622                         }
2623                 }
2624         }
2625
2626         /*
2627          * If we have no parent dependencies we can clear CONN_DOWN
2628          */
2629         if (TAILQ_EMPTY(&ip->target_list))
2630                 ip->flags &= ~HAMMER_INODE_CONN_DOWN;
2631
2632         /*
2633          * If the inode is now clean drop the space reservation.
2634          */
2635         if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2636             (ip->flags & HAMMER_INODE_RSV_INODES)) {
2637                 ip->flags &= ~HAMMER_INODE_RSV_INODES;
2638                 --hmp->rsv_inodes;
2639         }
2640
2641         ip->flags &= ~HAMMER_INODE_SLAVEFLUSH;
2642
2643         if (dorel)
2644                 hammer_rel_inode(ip, 0);
2645 }
2646
2647 /*
2648  * Called from hammer_sync_inode() to synchronize in-memory records
2649  * to the media.
2650  */
2651 static int
2652 hammer_sync_record_callback(hammer_record_t record, void *data)
2653 {
2654         hammer_cursor_t cursor = data;
2655         hammer_transaction_t trans = cursor->trans;
2656         hammer_mount_t hmp = trans->hmp;
2657         int error;
2658
2659         /*
2660          * Skip records that do not belong to the current flush.
2661          */
2662         ++hammer_stats_record_iterations;
2663         if (record->flush_state != HAMMER_FST_FLUSH)
2664                 return(0);
2665
2666 #if 1
2667         if (record->flush_group != record->ip->flush_group) {
2668                 kprintf("sync_record %p ip %p bad flush group %p %p\n", record, record->ip, record->flush_group, record->ip->flush_group);
2669                 if (hammer_debug_critical)
2670                         Debugger("blah2");
2671                 return(0);
2672         }
2673 #endif
2674         KKASSERT(record->flush_group == record->ip->flush_group);
2675
2676         /*
2677          * Interlock the record using the BE flag.  Once BE is set the
2678          * frontend cannot change the state of FE.
2679          *
2680          * NOTE: If FE is set prior to us setting BE we still sync the
2681          * record out, but the flush completion code converts it to 
2682          * a delete-on-disk record instead of destroying it.
2683          */
2684         KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
2685         record->flags |= HAMMER_RECF_INTERLOCK_BE;
2686
2687         /*
2688          * The backend has already disposed of the record.
2689          */
2690         if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) {
2691                 error = 0;
2692                 goto done;
2693         }
2694
2695         /*
2696          * If the whole inode is being deleted and all on-disk records will
2697          * be deleted very soon, we can't sync any new records to disk
2698          * because they will be deleted in the same transaction they were
2699          * created in (delete_tid == create_tid), which will assert.
2700          *
2701          * XXX There may be a case with RECORD_ADD with DELETED_FE set
2702          * that we currently panic on.
2703          */
2704         if (record->ip->sync_flags & HAMMER_INODE_DELETING) {
2705                 switch(record->type) {
2706                 case HAMMER_MEM_RECORD_DATA:
2707                         /*
2708                          * We don't have to do anything; if the record was
2709                          * committed the space will have been accounted for
2710                          * in the blockmap.
2711                          */
2712                         /* fall through */
2713                 case HAMMER_MEM_RECORD_GENERAL:
2714                         /*
2715                          * Set deleted-by-backend flag.  Do not set the
2716                          * backend committed flag, because we are throwing
2717                          * the record away.
2718                          */
2719                         record->flags |= HAMMER_RECF_DELETED_BE;
2720                         ++record->ip->rec_generation;
2721                         error = 0;
2722                         goto done;
2723                 case HAMMER_MEM_RECORD_ADD:
2724                         panic("hammer_sync_record_callback: illegal add "
2725                               "during inode deletion record %p", record);
2726                         break; /* NOT REACHED */
2727                 case HAMMER_MEM_RECORD_INODE:
2728                         panic("hammer_sync_record_callback: attempt to "
2729                               "sync inode record %p?", record);
2730                         break; /* NOT REACHED */
2731                 case HAMMER_MEM_RECORD_DEL:
2732                         /* 
2733                          * Follow through and issue the on-disk deletion
2734                          */
2735                         break;
2736                 }
2737         }
2738
2739         /*
2740          * If DELETED_FE is set special handling is needed for directory
2741          * entries.  Dependent pieces related to the directory entry may
2742          * have already been synced to disk.  If this occurs we have to
2743          * sync the directory entry and then change the in-memory record
2744          * from an ADD to a DELETE to cover the fact that it's been
2745          * deleted by the frontend.
2746          *
2747          * A directory delete covering record (MEM_RECORD_DEL) can never
2748          * be deleted by the frontend.
2749          *
2750          * Any other record type (aka DATA) can be deleted by the frontend.
2751          * XXX At the moment the flusher must skip it because there may
2752          * be another data record in the flush group for the same block,
2753          * meaning that some frontend data changes can leak into the backend's
2754          * synchronization point.
2755          */
2756         if (record->flags & HAMMER_RECF_DELETED_FE) {
2757                 if (record->type == HAMMER_MEM_RECORD_ADD) {
2758                         /*
2759                          * Convert a front-end deleted directory-add to
2760                          * a directory-delete entry later.
2761                          */
2762                         record->flags |= HAMMER_RECF_CONVERT_DELETE;
2763                 } else {
2764                         /*
2765                          * Dispose of the record (race case).  Mark as
2766                          * deleted by backend (and not committed).
2767                          */
2768                         KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
2769                         record->flags |= HAMMER_RECF_DELETED_BE;
2770                         ++record->ip->rec_generation;
2771                         error = 0;
2772                         goto done;
2773                 }
2774         }
2775
2776         /*
2777          * Assign the create_tid for new records.  Deletions already
2778          * have the record's entire key properly set up.
2779          */
2780         if (record->type != HAMMER_MEM_RECORD_DEL) {
2781                 record->leaf.base.create_tid = trans->tid;
2782                 record->leaf.create_ts = trans->time32;
2783         }
2784
2785         /*
2786          * This actually moves the record to the on-media B-Tree.  We
2787          * must also generate REDO_TERM entries in the UNDO/REDO FIFO
2788          * indicating that the related REDO_WRITE(s) have been committed.
2789          *
2790          * During recovery any REDO_TERM's within the nominal recovery span
2791          * are ignored since the related meta-data is being undone, causing
2792          * any matching REDO_WRITEs to execute.  The REDO_TERMs outside
2793          * the nominal recovery span will match against REDO_WRITEs and
2794          * prevent them from being executed (because the meta-data has
2795          * already been synchronized).
2796          */
2797         if (record->flags & HAMMER_RECF_REDO) {
2798                 KKASSERT(record->type == HAMMER_MEM_RECORD_DATA);
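                /*
                 * For a DATA record leaf.base.key is the file offset of
                 * the end of the write, so (key - data_len) is the start
                 * offset covered by the REDO_TERM_WRITE entry.
                 */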
2799                 hammer_generate_redo(trans, record->ip,
2800                                      record->leaf.base.key -
2801                                          record->leaf.data_len,
2802                                      HAMMER_REDO_TERM_WRITE,
2803                                      NULL,
2804                                      record->leaf.data_len);
2805         }
2806
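        /*
         * Sync the record to the media, retrying if the cursor returns
         * EDEADLK.  On EDEADLK the cursor is torn down and re-initialized
         * from the inode cache before the retry.
         */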
2807         for (;;) {
2808                 error = hammer_ip_sync_record_cursor(cursor, record);
2809                 if (error != EDEADLK)
2810                         break;
2811                 hammer_done_cursor(cursor);
2812                 error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
2813                                            record->ip);
2814                 if (error)
2815                         break;
2816         }
2817         record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
2818
2819         if (error)
2820                 error = -error;
2821 done:
2822         hammer_flush_record_done(record, error);
2823
2824         /*
2825          * Do partial finalization if we have built up too many dirty
2826          * buffers.  Otherwise a buffer cache deadlock can occur when
2827          * doing things like creating tens of thousands of tiny files.
2828          *
2829          * We must release our cursor lock to avoid a 3-way deadlock
2830          * due to the exclusive sync lock the finalizer must get.
2831          *
2832          * WARNING: See warnings in hammer_unlock_cursor() function.
2833          */
2834         if (hammer_flusher_meta_limit(hmp) ||
2835             vm_page_count_severe()) {
2836                 hammer_unlock_cursor(cursor);
2837                 hammer_flusher_finalize(trans, 0);
2838                 hammer_lock_cursor(cursor);
2839         }
2840         return(error);
2841 }
2842
2843 /*
2844  * Backend function called by the flusher to sync an inode to media.
2845  */
2846 int
2847 hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
2848 {
2849         struct hammer_cursor cursor;
2850         hammer_node_t tmp_node;
2851         hammer_record_t depend;
2852         hammer_record_t next;
2853         int error, tmp_error;
2854         u_int64_t nlinks;
2855
2856         if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
2857                 return(0);
2858
2859         error = hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
2860         if (error)
2861                 goto done;
2862
2863         /*
2864          * Any directory records referencing this inode which are not in
2865          * our current flush group must adjust our nlink count for the
2866          * purposes of synchronizing to disk.
2867          *
2868          * Records which are in our flush group can be unlinked from our
2869          * inode now, potentially allowing the inode to be physically
2870          * deleted.
2871          *
2872          * This cannot block.
2873          */
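        /*
         * Summary of the nlink adjustments made by the loop below for
         * each dependent directory record:
         *
         *      in our flush group, DELETED_FE          ++nlinks
         *      in our flush group, not DELETED_FE      drop the dependency
         *      other flush group, ADD, not DELETED_FE  --nlinks
         *      other flush group, DEL, not DELETED_FE  ++nlinks
         */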
2874         nlinks = ip->ino_data.nlinks;
2875         next = TAILQ_FIRST(&ip->target_list);
2876         while ((depend = next) != NULL) {
2877                 next = TAILQ_NEXT(depend, target_entry);
2878                 if (depend->flush_state == HAMMER_FST_FLUSH &&
2879                     depend->flush_group == ip->flush_group) {
2880                         /*
2881                          * If this is an ADD that was deleted by the frontend
2882                          * the frontend nlinks count will have already been
2883                          * decremented, but the backend is going to sync its
2884                          * directory entry and must account for it.  The
2885                          * record will be converted to a delete-on-disk when
2886                          * it gets synced.
2887                          *
2888                          * If the ADD was not deleted by the frontend we
2889          * can remove the dependency from our target_list.
2890                          */
2891                         if (depend->flags & HAMMER_RECF_DELETED_FE) {
2892                                 ++nlinks;
2893                         } else {
2894                                 TAILQ_REMOVE(&ip->target_list, depend,
2895                                              target_entry);
2896                                 depend->target_ip = NULL;
2897                         }
2898                 } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
2899                         /*
2900                          * Not part of our flush group and not deleted by
2901                          * the front-end, adjust the link count synced to
2902                          * the media (undo what the frontend did when it
2903                          * queued the record).
2904                          */
2905                         KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
2906                         switch(depend->type) {
2907                         case HAMMER_MEM_RECORD_ADD:
2908                                 --nlinks;
2909                                 break;
2910                         case HAMMER_MEM_RECORD_DEL:
2911                                 ++nlinks;
2912                                 break;
2913                         default:
2914                                 break;
2915                         }
2916                 }
2917         }
2918
2919         /*
2920          * Set dirty if we had to modify the link count.
2921          */
2922         if (ip->sync_ino_data.nlinks != nlinks) {
2923                 KKASSERT((int64_t)nlinks >= 0);
2924                 ip->sync_ino_data.nlinks = nlinks;
2925                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
2926         }
2927
2928         /*
2929          * If there is a truncation queued, destroy any data past the (aligned)
2930          * truncation point.  Userland will have dealt with the buffer
2931          * containing the truncation point for us.
2932          *
2933          * We don't flush pending frontend data buffers until after we've
2934          * dealt with the truncation.
2935          */
2936         if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2937                 /*
2938                  * Interlock trunc_off.  The VOP front-end may continue to
2939                  * make adjustments to it while we are blocked.
2940                  */
2941                 off_t trunc_off;
2942                 off_t aligned_trunc_off;
2943                 int blkmask;
2944
2945                 trunc_off = ip->sync_trunc_off;
2946                 blkmask = hammer_blocksize(trunc_off) - 1;
2947                 aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask;
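                /*
                 * e.g. assuming a 16KB block at the truncation point
                 * (blkmask 0x3FFF), a trunc_off of 0x5000 rounds up to
                 * an aligned_trunc_off of 0x8000.
                 */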
2948
2949                 /*
2950                  * Delete any whole blocks on-media.  The front-end has
2951                  * already cleaned out any partial block and made it
2952                  * pending.  The front-end may have updated trunc_off
2953                  * while we were blocked so we only use sync_trunc_off.
2954                  *
2955                  * This operation can blow out the buffer cache; EWOULDBLOCK
2956                  * means we were unable to complete the deletion.  The
2957                  * deletion will update sync_trunc_off in that case.
2958                  */
2959                 error = hammer_ip_delete_range(&cursor, ip,
2960                                                 aligned_trunc_off,
2961                                                 0x7FFFFFFFFFFFFFFFLL, 2);
2962                 if (error == EWOULDBLOCK) {
2963                         ip->flags |= HAMMER_INODE_WOULDBLOCK;
2964                         error = 0;
2965                         goto defer_buffer_flush;
2966                 }
2967
2968                 if (error)
2969                         goto done;
2970
2971                 /*
2972                  * Generate a REDO_TERM_TRUNC entry in the UNDO/REDO FIFO.
2973                  *
2974                  * XXX we do this even if we did not previously generate
2975                  * a REDO_TRUNC record.  This operation may enclose the
2976                  * range of multiple prior truncation entries in the REDO
2977                  * log.
2978                  */
2979                 if (trans->hmp->version >= HAMMER_VOL_VERSION_FOUR &&
2980                     (ip->flags & HAMMER_INODE_RDIRTY)) {
2981                         hammer_generate_redo(trans, ip, aligned_trunc_off,
2982                                              HAMMER_REDO_TERM_TRUNC,
2983                                              NULL, 0);
2984                 }
2985
2986                 /*
2987                  * Clear the truncation flag on the backend after we have
2988                  * completed the deletions.  Backend data is now good again
2989                  * (including new records we are about to sync, below).
2990                  *
2991                  * Leave sync_trunc_off intact.  As we write additional
2992                  * records the backend will update sync_trunc_off.  This
2993                  * tells the backend whether it can skip the overwrite
2994                  * test.  This should work properly even when the backend
2995                  * writes full blocks where the truncation point straddles
2996                  * the block because the comparison is against the base
2997                  * offset of the record.
2998                  */
2999                 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
3000                 /* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */
3001         } else {
3002                 error = 0;
3003         }
3004
3005         /*
3006          * Now sync related records.  These will typically be directory
3007          * entries, records tracking direct-writes, or delete-on-disk records.
3008          */
3009         if (error == 0) {
3010                 tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
3011                                     hammer_sync_record_callback, &cursor);
3012                 if (tmp_error < 0)
3013                         tmp_error = -tmp_error;
3014                 if (tmp_error)
3015                         error = tmp_error;
3016         }
3017         hammer_cache_node(&ip->cache[1], cursor.node);
3018
3019         /*
3020          * Re-seek for inode update, assuming our cache hasn't been ripped
3021          * out from under us.
3022          */
3023         if (error == 0) {
3024                 tmp_node = hammer_ref_node_safe(trans, &ip->cache[0], &error);
3025                 if (tmp_node) {
3026                         hammer_cursor_downgrade(&cursor);
3027                         hammer_lock_sh(&tmp_node->lock);
3028                         if ((tmp_node->flags & HAMMER_NODE_DELETED) == 0)
3029                                 hammer_cursor_seek(&cursor, tmp_node, 0);
3030                         hammer_unlock(&tmp_node->lock);
3031                         hammer_rel_node(tmp_node);
3032                 }
3033                 error = 0;
3034         }
3035
3036         /*
3037          * If we are deleting the inode the frontend had better not have
3038          * any active references on elements making up the inode.
3039          *
3040          * The call to hammer_ip_delete_clean() cleans up auxiliary records
3041          * but not DB or DATA records.  Those must have already been deleted
3042          * by the normal truncation mechanic.
3043          */
3044         if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
3045                 RB_EMPTY(&ip->rec_tree)  &&
3046             (ip->sync_flags & HAMMER_INODE_DELETING) &&
3047             (ip->flags & HAMMER_INODE_DELETED) == 0) {
3048                 int count1 = 0;
3049
3050                 error = hammer_ip_delete_clean(&cursor, ip, &count1);
3051                 if (error == 0) {
3052                         ip->flags |= HAMMER_INODE_DELETED;
3053                         ip->sync_flags &= ~HAMMER_INODE_DELETING;
3054                         ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
3055                         KKASSERT(RB_EMPTY(&ip->rec_tree));
3056
3057                         /*
3058                          * Set delete_tid in both the frontend and backend
3059                          * copy of the inode record.  The DELETED flag handles
3060                          * this; do not set DDIRTY.
3061                          */
3062                         ip->ino_leaf.base.delete_tid = trans->tid;
3063                         ip->sync_ino_leaf.base.delete_tid = trans->tid;
3064                         ip->ino_leaf.delete_ts = trans->time32;
3065                         ip->sync_ino_leaf.delete_ts = trans->time32;
3066
3067
3068                         /*
3069                          * Adjust the inode count in the volume header
3070                          */
3071                         hammer_sync_lock_sh(trans);
3072                         if (ip->flags & HAMMER_INODE_ONDISK) {
3073                                 hammer_modify_volume_field(trans,
3074                                                            trans->rootvol,
3075                                                            vol0_stat_inodes);
3076                                 --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
3077                                 hammer_modify_volume_done(trans->rootvol);
3078                         }
3079                         hammer_sync_unlock(trans);
3080                 }
3081         }
3082
3083         if (error)
3084                 goto done;
3085         ip->sync_flags &= ~HAMMER_INODE_BUFS;
3086
3087 defer_buffer_flush:
3088         /*
3089          * Now update the inode's on-disk inode-data and/or on-disk record.
3090          * DELETED and ONDISK are managed only in ip->flags.
3091          *
3092          * In the case of a deferred buffer flush we still update the on-disk
3093          * inode to satisfy visibility requirements if there happen to be
3094          * directory dependencies.
3095          */
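        /*
         * The four cases below:
         *
         *      DELETED+ONDISK  delete the existing on-disk inode record
         *      DELETED only    inode never made it to media, throw away
         *                      any remaining in-memory records
         *      ONDISK only     normal update of the existing record
         *      neither         first sync, force DDIRTY so an initial
         *                      inode record is written
         */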
3096         switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
3097         case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
3098                 /*
3099                  * If deleted and on-disk, don't set any additional flags.
3100                  * The delete flag takes care of things.
3101                  *
3102                  * Clear flags which may have been set by the frontend.
3103                  */
3104                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
3105                                     HAMMER_INODE_SDIRTY |
3106                                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
3107                                     HAMMER_INODE_DELETING);
3108                 break;
3109         case HAMMER_INODE_DELETED:
3110                 /*
3111                  * Take care of the case where a deleted inode was never
3112                  * flushed to the disk in the first place.
3113                  *
3114                  * Clear flags which may have been set by the frontend.
3115                  */
3116                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
3117                                     HAMMER_INODE_SDIRTY |
3118                                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
3119                                     HAMMER_INODE_DELETING);
3120                 while (RB_ROOT(&ip->rec_tree)) {
3121                         hammer_record_t record = RB_ROOT(&ip->rec_tree);
3122                         hammer_ref(&record->lock);
3123                         KKASSERT(hammer_oneref(&record->lock));
3124                         record->flags |= HAMMER_RECF_DELETED_BE;
3125                         ++record->ip->rec_generation;
3126                         hammer_rel_mem_record(record);
3127                 }
3128                 break;
3129         case HAMMER_INODE_ONDISK:
3130                 /*
3131                  * If already on-disk, do not set any additional flags.
3132                  */
3133                 break;
3134         default:
3135                 /*
3136                  * If not on-disk and not deleted, set DDIRTY to force
3137                  * an initial record to be written.
3138                  *
3139                  * Also set the create_tid in both the frontend and backend
3140                  * copy of the inode record.
3141                  */
3142                 ip->ino_leaf.base.create_tid = trans->tid;
3143                 ip->ino_leaf.create_ts = trans->time32;
3144                 ip->sync_ino_leaf.base.create_tid = trans->tid;
3145                 ip->sync_ino_leaf.create_ts = trans->time32;
3146                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
3147                 break;
3148         }
3149
3150         /*
3151          * If DDIRTY or SDIRTY is set, write out a new record.
3152          * If the inode is already on-disk the old record is marked as
3153          * deleted.
3154          *
3155          * If DELETED is set hammer_update_inode() will delete the existing
3156          * record without writing out a new one.
3157          *
3158          * If *ONLY* the ITIMES flag is set we can update the record in-place.
3159          */
3160         if (ip->flags & HAMMER_INODE_DELETED) {
3161                 error = hammer_update_inode(&cursor, ip);
3162         } else 
3163         if (!(ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY)) &&
3164             (ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) {
3165                 error = hammer_update_itimes(&cursor, ip);
3166         } else
3167         if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY |
3168                               HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) {
3169                 error = hammer_update_inode(&cursor, ip);
3170         }
3171 done:
3172         if (ip->flags & HAMMER_INODE_MODMASK)
3173                 hammer_inode_dirty(ip);
3174         if (error) {
3175                 hammer_critical_error(ip->hmp, ip, error,
3176                                       "while syncing inode");
3177         }
3178         hammer_done_cursor(&cursor);
3179         return(error);
3180 }
3181
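#if 0
/*
 * Illustrative sketch only, not compiled: roughly how the flusher drives
 * hammer_sync_inode() for an inode in the current flush group.  The real
 * sequence lives in hammer_flusher.c; the function name below is made up
 * for illustration and flow control, locking and statistics are omitted.
 */
static void
example_flush_one_inode(hammer_transaction_t trans, hammer_inode_t ip)
{
        int error;

        /* backend sync; critical errors are reported by the sync itself */
        error = hammer_sync_inode(trans, ip);

        /* requeue or finalize the inode's flush state */
        hammer_flush_inode_done(ip, error);
}
#endif
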
3182 /*
3183  * This routine is called when the OS is no longer actively referencing
3184  * the inode (but might still be keeping it cached), or when releasing
3185  * the last reference to an inode.
3186  *
3187  * At this point if the inode's nlinks count is zero we want to destroy
3188  * it, which may mean destroying it on-media too.
3189  */
3190 void
3191 hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
3192 {
3193         struct vnode *vp;
3194
3195         /*
3196          * Set the DELETING flag when the link count drops to 0 and the
3197          * OS no longer has any opens on the inode.
3198          *
3199          * The backend will clear DELETING (a mod flag) and set DELETED
3200          * (a state flag) when it is actually able to perform the
3201          * operation.
3202          *
3203          * Don't reflag the deletion if the flusher is currently syncing
3204          * one that was already flagged.  A previously set DELETING flag
3205          * may bounce around flags and sync_flags until the operation is
3206          * completely done.
3207          *
3208          * Do not attempt to modify a snapshot inode (one set to read-only).
3209          */
3210         if (ip->ino_data.nlinks == 0 &&
3211             ((ip->flags | ip->sync_flags) & (HAMMER_INODE_RO|HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
3212                 ip->flags |= HAMMER_INODE_DELETING;
3213                 ip->flags |= HAMMER_INODE_TRUNCATED;
3214                 ip->trunc_off = 0;
3215                 vp = NULL;
3216                 if (getvp) {
3217                         if (hammer_get_vnode(ip, &vp) != 0)
3218                                 return;
3219                 }
3220
3221                 /*
3222                  * Final cleanup
3223                  */
3224                 if (ip->vp)
3225                         nvtruncbuf(ip->vp, 0, HAMMER_BUFSIZE, 0, 0);
3226                 if (ip->flags & HAMMER_INODE_MODMASK)
3227                         hammer_inode_dirty(ip);
3228                 if (getvp)
3229                         vput(vp);
3230         }
3231 }
3232
3233 /*
3234  * After potentially resolving a dependency the inode is tested
3235  * to determine whether it needs to be reflushed.
3236  */
3237 void
3238 hammer_test_inode(hammer_inode_t ip)
3239 {
3240         if (ip->flags & HAMMER_INODE_REFLUSH) {
3241                 ip->flags &= ~HAMMER_INODE_REFLUSH;
3242                 hammer_ref(&ip->lock);
3243                 if (ip->flags & HAMMER_INODE_RESIGNAL) {
3244                         ip->flags &= ~HAMMER_INODE_RESIGNAL;
3245                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
3246                 } else {
3247                         hammer_flush_inode(ip, 0);
3248                 }
3249                 hammer_rel_inode(ip, 0);
3250         }
3251 }
3252
3253 /*
3254  * Clear the RECLAIM flag on an inode.  This occurs when the inode is
3255  * reassociated with a vp or just before it gets freed.
3256  *
3257  * Pipeline wakeups to threads blocked due to an excessive number of
3258  * detached inodes.  This typically occurs when atime updates accumulate
3259  * while scanning a directory tree.
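 *
 * Waiters queue a hammer_reclaim structure with a count of 1 on
 * hmp->reclaim_list (see hammer_inode_waitreclaims() below); each inode
 * leaving RECLAIM state decrements the head waiter's count and wakes the
 * waiter up once the count reaches zero.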
3260  */
3261 static void
3262 hammer_inode_wakereclaims(hammer_inode_t ip)
3263 {
3264         struct hammer_reclaim *reclaim;
3265         hammer_mount_t hmp = ip->hmp;
3266
3267         if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
3268                 return;
3269
3270         --hammer_count_reclaims;
3271         --hmp->count_reclaims;
3272         ip->flags &= ~HAMMER_INODE_RECLAIM;
3273
3274         if ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
3275                 KKASSERT(reclaim->count > 0);
3276                 if (--reclaim->count == 0) {
3277                         TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
3278                         wakeup(reclaim);
3279                 }
3280         }
3281 }
3282
3283 /*
3284  * Setup our reclaim pipeline.  We only let so many detached (and dirty)
3285  * inodes build up before we start blocking.  This routine is called
3286  * if a new inode is created or an inode is loaded from media.
3287  *
3288  * When we block we don't care *which* inode has finished reclaiming,
3289  * as long as one does.
3290  *
3291  * The reclaim pipeline is primarily governed by the auto-flush which is
3292  * 1/4 hammer_limit_reclaims.  We don't want to block if the count is
3293  * less than 1/2 hammer_limit_reclaims.  From 1/2 to full count is
3294  * dynamically governed.
3295  */
3296 void
3297 hammer_inode_waitreclaims(hammer_transaction_t trans)
3298 {
3299         hammer_mount_t hmp = trans->hmp;
3300         struct hammer_reclaim reclaim;
3301         int lower_limit;
3302
3303         /*
3304          * Track inode load.  Delay if the number of reclaiming inodes is
3305          * between 2/4 and 4/4 of hammer_limit_reclaims, scaled per-process.
3306          */
3307         if (curthread->td_proc) {
3308                 struct hammer_inostats *stats;
3309
3310                 stats = hammer_inode_inostats(hmp, curthread->td_proc->p_pid);
3311                 ++stats->count;
3312
3313                 if (stats->count > hammer_limit_reclaims / 2)
3314                         stats->count = hammer_limit_reclaims / 2;
3315                 lower_limit = hammer_limit_reclaims - stats->count;
3316                 if (hammer_debug_general & 0x10000) {
3317                         kprintf("pid %5d limit %d\n",
3318                                 (int)curthread->td_proc->p_pid, lower_limit);
3319                 }
3320         } else {
3321                 lower_limit = hammer_limit_reclaims * 3 / 4;
3322         }
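        /*
         * Example, using an illustrative hammer_limit_reclaims of 4000:
         * a process which recently loaded 2000 or more inodes gets a
         * lower_limit of 2000 and blocks once half the reclaim limit is
         * reached, while a mostly idle process does not block until the
         * full limit is hit.
         */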
3323         if (hmp->count_reclaims >= lower_limit) {
3324                 reclaim.count = 1;
3325                 TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
3326                 tsleep(&reclaim, 0, "hmrrcm", hz);
3327                 if (reclaim.count > 0)
3328                         TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
3329         }
3330 }
3331
3332 /*
3333  * Keep track of reclaim statistics on a per-pid basis using a loose
3334  * 4-way set associative hash table.  Collisions inherit the count of
3335  * the previous entry.
3336  *
3337  * NOTE: We want to be careful here to limit the chain size.  If the chain
3338  *       size is too large a pid will spread its stats out over too many
3339  *       entries under certain types of heavy filesystem activity and
3340  *       wind up not delaying long enough.
3341  */
3342 static
3343 struct hammer_inostats *
3344 hammer_inode_inostats(hammer_mount_t hmp, pid_t pid)
3345 {
3346         struct hammer_inostats *stats;
3347         int delta;
3348         int chain;
3349         static volatile int iterator;   /* we don't care about MP races */
3350
3351         /*
3352          * Chain up to 4 times to find our entry.
3353          */
3354         for (chain = 0; chain < 4; ++chain) {
3355                 stats = &hmp->inostats[(pid + chain) & HAMMER_INOSTATS_HMASK];
3356                 if (stats->pid == pid)
3357                         break;
3358         }
3359
3360         /*
3361          * Replace one of the four chaining entries with our new entry.
3362          */
3363         if (chain == 4) {
3364                 stats = &hmp->inostats[(pid + (iterator++ & 3)) &
3365                                        HAMMER_INOSTATS_HMASK];
3366                 stats->pid = pid;
3367         }
3368
3369         /*
3370          * Decay the entry
3371          */
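        /*
         * The decay below is roughly exponential: after hz ticks (one
         * second) the count is halved (count * hz / (hz + hz)), and an
         * entry idle for more than a minute is reset to zero.
         */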
3372         if (stats->count && stats->ltick != ticks) {
3373                 delta = ticks - stats->ltick;
3374                 stats->ltick = ticks;
3375                 if (delta <= 0 || delta > hz * 60)
3376                         stats->count = 0;
3377                 else
3378                         stats->count = stats->count * hz / (hz + delta);
3379         }
3380         if (hammer_debug_general & 0x10000)
3381                 kprintf("pid %5d stats %d\n", (int)pid, stats->count);
3382         return (stats);
3383 }
3384
3385 #if 0
3386
3387 /*
3388  * XXX not used, doesn't work very well due to the large batching nature
3389  * of flushes.
3390  *
3391  * A larger than normal backlog of inodes is sitting in the flusher,
3392  * enforce a general slowdown to let it catch up.  This routine is only
3393  * called on completion of a non-flusher-related transaction which
3394  * performed B-Tree node I/O.
3395  *
3396  * It is possible for the flusher to stall in a continuous load.
3397  * blogbench -i1000 -o seems to do a good job generating this sort of load.
3398  * If the flusher is unable to catch up the inode count can bloat until
3399  * we run out of kvm.
3400  *
3401  * This is a bit of a hack.
3402  */
3403 void
3404 hammer_inode_waithard(hammer_mount_t hmp)
3405 {
3406         /*
3407          * Hysteresis.
3408          */
3409         if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) {
3410                 if (hmp->count_reclaims < hammer_limit_reclaims / 2 &&
3411                     hmp->count_iqueued < hmp->count_inodes / 20) {
3412                         hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY;
3413                         return;
3414                 }
3415         } else {
3416                 if (hmp->count_reclaims < hammer_limit_reclaims ||
3417                     hmp->count_iqueued < hmp->count_inodes / 10) {
3418                         return;
3419                 }
3420                 hmp->flags |= HAMMER_MOUNT_FLUSH_RECOVERY;
3421         }
3422
3423         /*
3424          * Block for one flush cycle.
3425          */
3426         hammer_flusher_wait_next(hmp);
3427 }
3428
3429 #endif