sys/vfs/hammer/hammer_io.c

   1 /*
   2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  */
  34 /*
  35  * IO Primitives and buffer cache management
  36  *
  37  * All major data-tracking structures in HAMMER contain a struct hammer_io
  38  * which is used to manage their backing store.  We use filesystem buffers
  39  * for backing store and we leave them passively associated with their
  40  * HAMMER structures.
  41  *
  42  * If the kernel tries to destroy a passively associated buf which we cannot
  43  * yet let go we set B_LOCKED in the buffer and then actively release it
  44  * later when we can.
  45  *
  46  * The io_token is required for anything which might race bioops and bio_done
  47  * callbacks, with one exception: A successful hammer_try_interlock_norefs().
  48  * The fs_token will be held in all other cases.
  49  */
  50
  51 #include "hammer.h"
  52 #include <sys/fcntl.h>
  53 #include <sys/nlookup.h>
  54 #include <sys/buf.h>
  55
  56 #include <sys/buf2.h>
  57
  58 static void hammer_io_modify(hammer_io_t io, int count);
  59 static void hammer_io_deallocate(struct buf *bp);
  60 static void hammer_indirect_callback(struct bio *bio);
  61 static void hammer_io_direct_write_complete(struct bio *nbio);
  62 static int hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data);
  63 static void hammer_io_set_modlist(struct hammer_io *io);
  64 static void hammer_io_flush_mark(hammer_volume_t volume);
  65
  66 static int
  67 hammer_mod_rb_compare(hammer_io_t io1, hammer_io_t io2)
  68 {
  69         hammer_off_t io1_offset;
  70         hammer_off_t io2_offset;
  71
  72         io1_offset = ((io1->offset & HAMMER_OFF_SHORT_MASK) << 8) |
  73                      io1->volume->vol_no;
  74         io2_offset = ((io2->offset & HAMMER_OFF_SHORT_MASK) << 8) |
  75                      io2->volume->vol_no;
  76
  77         if (io1_offset < io2_offset)
  78                 return(-1);
  79         if (io1_offset > io2_offset)
  80                 return(1);
  81         return(0);
  82 }
  83
  84 RB_GENERATE(hammer_mod_rb_tree, hammer_io, rb_node, hammer_mod_rb_compare);
  85
  86 /*
  87  * Initialize a new, already-zero'd hammer_io structure, or reinitialize
  88  * an existing hammer_io structure which may have switched to another type.
  89  */
  90 void
  91 hammer_io_init(hammer_io_t io, hammer_volume_t volume, enum hammer_io_type type)
  92 {
  93         io->volume = volume;
  94         io->hmp = volume->io.hmp;
  95         io->type = type;
  96 }
  97
  98 /*
  99  * Helper routine to disassociate a buffer cache buffer from an I/O
 100  * structure.  The io must be interlocked and marked appropriately for
 101  * reclamation.
 102  *
 103  * The io must be in a released state with the io->bp owned and
 104  * locked by the caller of this function.  When not called from an
 105  * io_deallocate() this cannot race an io_deallocate() since the
 106  * kernel would be unable to get the buffer lock in that case.
 107  * (The released state in this case means we own the bp, not the
 108  * hammer_io structure).
 109  *
 110  * The io may have 0 or 1 references depending on who called us.  The
 111  * caller is responsible for dealing with the refs.
 112  *
 113  * This call can only be made when no action is required on the buffer.
 114  *
 115  * This function is guaranteed not to race against anything because we
 116  * own both the io lock and the bp lock and are interlocked with no
 117  * references.
 118  */
 119 static void
 120 hammer_io_disassociate(hammer_io_structure_t iou)
 121 {
 122         struct buf *bp = iou->io.bp;
 123
 124         KKASSERT(iou->io.released);
 125         KKASSERT(iou->io.modified == 0);
 126         KKASSERT(LIST_FIRST(&bp->b_dep) == (void *)iou);
 127         buf_dep_init(bp);
 128         iou->io.bp = NULL;
 129
 130         /*
 131          * If the buffer was locked someone wanted to get rid of it.
 132          */
 133         if (bp->b_flags & B_LOCKED) {
 134                 atomic_add_int(&hammer_count_io_locked, -1);
 135                 bp->b_flags &= ~B_LOCKED;
 136         }
 137         if (iou->io.reclaim) {
 138                 bp->b_flags |= B_NOCACHE|B_RELBUF;
 139                 iou->io.reclaim = 0;
 140         }
 141
 142         switch(iou->io.type) {
 143         case HAMMER_STRUCTURE_VOLUME:
 144                 iou->volume.ondisk = NULL;
 145                 break;
 146         case HAMMER_STRUCTURE_DATA_BUFFER:
 147         case HAMMER_STRUCTURE_META_BUFFER:
 148         case HAMMER_STRUCTURE_UNDO_BUFFER:
 149                 iou->buffer.ondisk = NULL;
 150                 break;
 151         case HAMMER_STRUCTURE_DUMMY:
 152                 panic("hammer_io_disassociate: bad io type");
 153                 break;
 154         }
 155 }
 156
 157 /*
 158  * Wait for any physical IO to complete
 159  *
 160  * XXX we aren't interlocked against a spinlock or anything so there
 161  *     is a small window in the interlock / io->running == 0 test.
 162  */
 163 void
 164 hammer_io_wait(hammer_io_t io)
 165 {
 166         if (io->running) {
 167                 hammer_mount_t hmp = io->hmp;
 168
 169                 lwkt_gettoken(&hmp->io_token);
 170                 while (io->running) {
 171                         io->waiting = 1;
 172                         tsleep_interlock(io, 0);
 173                         if (io->running)
 174                                 tsleep(io, PINTERLOCKED, "hmrflw", hz);
 175                 }
 176                 lwkt_reltoken(&hmp->io_token);
 177         }
 178 }
 179
 180 /*
 181  * Wait for all currently queued HAMMER-initiated I/Os to complete.
 182  *
 183  * This is not supposed to count direct I/O's but some can leak
 184  * through (for non-full-sized direct I/Os).
 185  */
 186 void
 187 hammer_io_wait_all(hammer_mount_t hmp, const char *ident, int doflush)
 188 {
 189         struct hammer_io iodummy;
 190         hammer_io_t io;
 191
 192         /*
 193          * Degenerate case, no I/O is running
 194          */
 195         lwkt_gettoken(&hmp->io_token);
 196         if (TAILQ_EMPTY(&hmp->iorun_list)) {
 197                 lwkt_reltoken(&hmp->io_token);
 198                 if (doflush)
 199                         hammer_io_flush_sync(hmp);
 200                 return;
 201         }
 202         bzero(&iodummy, sizeof(iodummy));
 203         iodummy.type = HAMMER_STRUCTURE_DUMMY;
 204
 205         /*
 206          * Add placemarker and then wait until it becomes the head of
 207          * the list.
 208          */
 209         TAILQ_INSERT_TAIL(&hmp->iorun_list, &iodummy, iorun_entry);
 210         while (TAILQ_FIRST(&hmp->iorun_list) != &iodummy) {
 211                 tsleep(&iodummy, 0, ident, 0);
 212         }
 213
 214         /*
 215          * Chain in case several placemarkers are present.
 216          */
 217         TAILQ_REMOVE(&hmp->iorun_list, &iodummy, iorun_entry);
 218         io = TAILQ_FIRST(&hmp->iorun_list);
 219         if (io && io->type == HAMMER_STRUCTURE_DUMMY)
 220                 wakeup(io);
 221         lwkt_reltoken(&hmp->io_token);
 222
 223         if (doflush)
 224                 hammer_io_flush_sync(hmp);
 225 }
 226
 227 /*
 228  * Clear a flagged error condition on a I/O buffer.  The caller must hold
 229  * its own ref on the buffer.
 230  */
 231 void
 232 hammer_io_clear_error(struct hammer_io *io)
 233 {
 234         hammer_mount_t hmp = io->hmp;
 235
 236         lwkt_gettoken(&hmp->io_token);
 237         if (io->ioerror) {
 238                 io->ioerror = 0;
 239                 hammer_rel(&io->lock);
 240                 KKASSERT(hammer_isactive(&io->lock));
 241         }
 242         lwkt_reltoken(&hmp->io_token);
 243 }
 244
 245 void
 246 hammer_io_clear_error_noassert(struct hammer_io *io)
 247 {
 248         hammer_mount_t hmp = io->hmp;
 249
 250         lwkt_gettoken(&hmp->io_token);
 251         if (io->ioerror) {
 252                 io->ioerror = 0;
 253                 hammer_rel(&io->lock);
 254         }
 255         lwkt_reltoken(&hmp->io_token);
 256 }
 257
 258 /*
 259  * This is an advisory function only which tells the buffer cache
 260  * the bp is not a meta-data buffer, even though it is backed by
 261  * a block device.
 262  *
 263  * This is used by HAMMER's reblocking code to avoid trying to
 264  * swapcache the filesystem's data when it is read or written
 265  * by the reblocking code.
 266  *
 267  * The caller has a ref on the buffer preventing the bp from
 268  * being disassociated from it.
 269  */
 270 void
 271 hammer_io_notmeta(hammer_buffer_t buffer)
 272 {
 273         if ((buffer->io.bp->b_flags & B_NOTMETA) == 0) {
 274                 hammer_mount_t hmp = buffer->io.hmp;
 275
 276                 lwkt_gettoken(&hmp->io_token);
 277                 buffer->io.bp->b_flags |= B_NOTMETA;
 278                 lwkt_reltoken(&hmp->io_token);
 279         }
 280 }
 281
 282 /*
 283  * Load bp for a HAMMER structure.  The io must be exclusively locked by
 284  * the caller.
 285  *
 286  * This routine is mostly used on meta-data and small-data blocks.  Generally
 287  * speaking HAMMER assumes some locality of reference and will cluster.
 288  *
 289  * Note that the caller (hammer_ondisk.c) may place further restrictions
 290  * on clusterability via the limit (in bytes).  Typically large-data
 291  * zones cannot be clustered due to their mixed buffer sizes.  This is
 292  * not an issue since such clustering occurs in hammer_vnops at the
 293  * regular file layer, whereas this is the buffered block device layer.
 294  *
 295  * No I/O callbacks can occur while we hold the buffer locked.
 296  */
 297 int
 298 hammer_io_read(struct vnode *devvp, struct hammer_io *io, int limit)
 299 {
 300         struct buf *bp;
 301         int   error;
 302
 303         if ((bp = io->bp) == NULL) {
 304                 atomic_add_long(&hammer_count_io_running_read, io->bytes);
 305                 if (hammer_cluster_enable && limit > io->bytes) {
 306                         error = cluster_read(devvp, io->offset + limit,
 307                                              io->offset, io->bytes,
 308                                              HAMMER_CLUSTER_SIZE,
 309                                              HAMMER_CLUSTER_SIZE,
 310                                              &io->bp);
 311                 } else {
 312                         error = bread(devvp, io->offset, io->bytes, &io->bp);
 313                 }
 314                 hammer_stats_disk_read += io->bytes;
 315                 atomic_add_long(&hammer_count_io_running_read, -io->bytes);
 316
 317                 /*
 318                  * The code generally assumes b_ops/b_dep has been set-up,
 319                  * even if we error out here.
 320                  */
 321                 bp = io->bp;
 322                 if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
 323                         const char *metatype;
 324
 325                         switch(io->type) {
 326                         case HAMMER_STRUCTURE_VOLUME:
 327                                 metatype = "volume";
 328                                 break;
 329                         case HAMMER_STRUCTURE_META_BUFFER:
 330                                 switch(((struct hammer_buffer *)io)->
 331                                         zoneX_offset & HAMMER_OFF_ZONE_MASK) {
 332                                 case HAMMER_ZONE_BTREE:
 333                                         metatype = "btree";
 334                                         break;
 335                                 case HAMMER_ZONE_META:
 336                                         metatype = "meta";
 337                                         break;
 338                                 case HAMMER_ZONE_FREEMAP:
 339                                         metatype = "freemap";
 340                                         break;
 341                                 default:
 342                                         metatype = "meta?";
 343                                         break;
 344                                 }
 345                                 break;
 346                         case HAMMER_STRUCTURE_DATA_BUFFER:
 347                                 metatype = "data";
 348                                 break;
 349                         case HAMMER_STRUCTURE_UNDO_BUFFER:
 350                                 metatype = "undo";
 351                                 break;
 352                         default:
 353                                 metatype = "unknown";
 354                                 break;
 355                         }
 356                         kprintf("doff %016jx %s\n",
 357                                 (intmax_t)bp->b_bio2.bio_offset,
 358                                 metatype);
 359                 }
 360                 bp->b_flags &= ~B_IODEBUG;
 361                 bp->b_ops = &hammer_bioops;
 362                 KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
 363
 364                 /* io->worklist is locked by the io lock */
 365                 LIST_INSERT_HEAD(&bp->b_dep, &io->worklist, node);
 366                 BUF_KERNPROC(bp);
 367                 KKASSERT(io->modified == 0);
 368                 KKASSERT(io->running == 0);
 369                 KKASSERT(io->waiting == 0);
 370                 io->released = 0;       /* we hold an active lock on bp */
 371         } else {
 372                 error = 0;
 373         }
 374         return(error);
 375 }
 376
 377 /*
 378  * Similar to hammer_io_read() but returns a zero'd out buffer instead.
 379  * Must be called with the IO exclusively locked.
 380  *
 381  * vfs_bio_clrbuf() is kinda nasty, enforce serialization against background
 382  * I/O by forcing the buffer to not be in a released state before calling
 383  * it.
 384  *
 385  * This function will also mark the IO as modified but it will not
 386  * increment the modify_refs count.
 387  *
 388  * No I/O callbacks can occur while we hold the buffer locked.
 389  */
 390 int
 391 hammer_io_new(struct vnode *devvp, struct hammer_io *io)
 392 {
 393         struct buf *bp;
 394
 395         if ((bp = io->bp) == NULL) {
 396                 io->bp = getblk(devvp, io->offset, io->bytes, 0, 0);
 397                 bp = io->bp;
 398                 bp->b_ops = &hammer_bioops;
 399                 KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
 400
 401                 /* io->worklist is locked by the io lock */
 402                 LIST_INSERT_HEAD(&bp->b_dep, &io->worklist, node);
 403                 io->released = 0;
 404                 KKASSERT(io->running == 0);
 405                 io->waiting = 0;
 406                 BUF_KERNPROC(bp);
 407         } else {
 408                 if (io->released) {
 409                         regetblk(bp);
 410                         BUF_KERNPROC(bp);
 411                         io->released = 0;
 412                 }
 413         }
 414         hammer_io_modify(io, 0);
 415         vfs_bio_clrbuf(bp);
 416         return(0);
 417 }
 418
 419 /*
 420  * Advance the activity count on the underlying buffer because
 421  * HAMMER does not getblk/brelse on every access.
 422  *
 423  * The io->bp cannot go away while the buffer is referenced.
 424  */
 425 void
 426 hammer_io_advance(struct hammer_io *io)
 427 {
 428         if (io->bp)
 429                 buf_act_advance(io->bp);
 430 }
 431
 432 /*
 433  * Remove potential device level aliases against buffers managed by high level
 434  * vnodes.  Aliases can also be created due to mixed buffer sizes or via
 435  * direct access to the backing store device.
 436  *
 437  * This is nasty because the buffers are also VMIO-backed.  Even if a buffer
 438  * does not exist its backing VM pages might, and we have to invalidate
 439  * those as well or a getblk() will reinstate them.
 440  *
 441  * Buffer cache buffers associated with hammer_buffers cannot be
 442  * invalidated.
 443  */
 444 int
 445 hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset)
 446 {
 447         hammer_io_structure_t iou;
 448         hammer_mount_t hmp;
 449         hammer_off_t phys_offset;
 450         struct buf *bp;
 451         int error;
 452
 453         hmp = volume->io.hmp;
 454         lwkt_gettoken(&hmp->io_token);
 455
 456         /*
 457          * If a device buffer already exists for the specified physical
 458          * offset use that, otherwise instantiate a buffer to cover any
 459          * related VM pages, set BNOCACHE, and brelse().
 460          */
 461         phys_offset = volume->ondisk->vol_buf_beg +
 462                       (zone2_offset & HAMMER_OFF_SHORT_MASK);
 463         if ((bp = findblk(volume->devvp, phys_offset, 0)) != NULL)
 464                 bremfree(bp);
 465         else
 466                 bp = getblk(volume->devvp, phys_offset, HAMMER_BUFSIZE, 0, 0);
 467
 468         if ((iou = (void *)LIST_FIRST(&bp->b_dep)) != NULL) {
 469 #if 0
 470                 hammer_ref(&iou->io.lock);
 471                 hammer_io_clear_modify(&iou->io, 1);
 472                 bundirty(bp);
 473                 iou->io.released = 0;
 474                 BUF_KERNPROC(bp);
 475                 iou->io.reclaim = 1;
 476                 iou->io.waitdep = 1;    /* XXX this is a fs_token field */
 477                 KKASSERT(hammer_isactive(&iou->io.lock) == 1);
 478                 hammer_rel_buffer(&iou->buffer, 0);
 479                 /*hammer_io_deallocate(bp);*/
 480 #endif
 481                 bqrelse(bp);
 482                 error = EAGAIN;
 483         } else {
 484                 KKASSERT((bp->b_flags & B_LOCKED) == 0);
 485                 bundirty(bp);
 486                 bp->b_flags |= B_NOCACHE|B_RELBUF;
 487                 brelse(bp);
 488                 error = 0;
 489         }
 490         lwkt_reltoken(&hmp->io_token);
 491         return(error);
 492 }
 493
 494 /*
 495  * This routine is called on the last reference to a hammer structure.
 496  * The io must be interlocked with a refcount of zero.  The hammer structure
 497  * will remain interlocked on return.
 498  *
 499  * This routine may return a non-NULL bp to the caller for dispoal.
 500  * The caller typically brelse()'s the bp.
 501  *
 502  * The bp may or may not still be passively associated with the IO.  It
 503  * will remain passively associated if it is unreleasable (e.g. a modified
 504  * meta-data buffer).
 505  *
 506  * The only requirement here is that modified meta-data and volume-header
 507  * buffer may NOT be disassociated from the IO structure, and consequently
 508  * we also leave such buffers actively associated with the IO if they already
 509  * are (since the kernel can't do anything with them anyway).  Only the
 510  * flusher is allowed to write such buffers out.  Modified pure-data and
 511  * undo buffers are returned to the kernel but left passively associated
 512  * so we can track when the kernel writes the bp out.
 513  */
 514 struct buf *
 515 hammer_io_release(struct hammer_io *io, int flush)
 516 {
 517         union hammer_io_structure *iou = (void *)io;
 518         struct buf *bp;
 519
 520         if ((bp = io->bp) == NULL)
 521                 return(NULL);
 522
 523         /*
 524          * Try to flush a dirty IO to disk if asked to by the
 525          * caller or if the kernel tried to flush the buffer in the past.
 526          *
 527          * Kernel-initiated flushes are only allowed for pure-data buffers.
 528          * meta-data and volume buffers can only be flushed explicitly
 529          * by HAMMER.
 530          */
 531         if (io->modified) {
 532                 if (flush) {
 533                         hammer_io_flush(io, 0);
 534                 } else if (bp->b_flags & B_LOCKED) {
 535                         switch(io->type) {
 536                         case HAMMER_STRUCTURE_DATA_BUFFER:
 537                                 hammer_io_flush(io, 0);
 538                                 break;
 539                         case HAMMER_STRUCTURE_UNDO_BUFFER:
 540                                 hammer_io_flush(io, hammer_undo_reclaim(io));
 541                                 break;
 542                         default:
 543                                 break;
 544                         }
 545                 } /* else no explicit request to flush the buffer */
 546         }
 547
 548         /*
 549          * Wait for the IO to complete if asked to.  This occurs when
 550          * the buffer must be disposed of definitively during an umount
 551          * or buffer invalidation.
 552          */
 553         if (io->waitdep && io->running) {
 554                 hammer_io_wait(io);
 555         }
 556
 557         /*
 558          * Return control of the buffer to the kernel (with the provisio
 559          * that our bioops can override kernel decisions with regards to
 560          * the buffer).
 561          */
 562         if ((flush || io->reclaim) && io->modified == 0 && io->running == 0) {
 563                 /*
 564                  * Always disassociate the bp if an explicit flush
 565                  * was requested and the IO completed with no error
 566                  * (so unmount can really clean up the structure).
 567                  */
 568                 if (io->released) {
 569                         regetblk(bp);
 570                         BUF_KERNPROC(bp);
 571                 } else {
 572                         io->released = 1;
 573                 }
 574                 hammer_io_disassociate((hammer_io_structure_t)io);
 575                 /* return the bp */
 576         } else if (io->modified) {
 577                 /*
 578                  * Only certain IO types can be released to the kernel if
 579                  * the buffer has been modified.
 580                  *
 581                  * volume and meta-data IO types may only be explicitly
 582                  * flushed by HAMMER.
 583                  */
 584                 switch(io->type) {
 585                 case HAMMER_STRUCTURE_DATA_BUFFER:
 586                 case HAMMER_STRUCTURE_UNDO_BUFFER:
 587                         if (io->released == 0) {
 588                                 io->released = 1;
 589                                 bp->b_flags |= B_CLUSTEROK;
 590                                 bdwrite(bp);
 591                         }
 592                         break;
 593                 default:
 594                         break;
 595                 }
 596                 bp = NULL;      /* bp left associated */
 597         } else if (io->released == 0) {
 598                 /*
 599                  * Clean buffers can be generally released to the kernel.
 600                  * We leave the bp passively associated with the HAMMER
 601                  * structure and use bioops to disconnect it later on
 602                  * if the kernel wants to discard the buffer.
 603                  *
 604                  * We can steal the structure's ownership of the bp.
 605                  */
 606                 io->released = 1;
 607                 if (bp->b_flags & B_LOCKED) {
 608                         hammer_io_disassociate(iou);
 609                         /* return the bp */
 610                 } else {
 611                         if (io->reclaim) {
 612                                 hammer_io_disassociate(iou);
 613                                 /* return the bp */
 614                         } else {
 615                                 /* return the bp (bp passively associated) */
 616                         }
 617                 }
 618         } else {
 619                 /*
 620                  * A released buffer is passively associate with our
 621                  * hammer_io structure.  The kernel cannot destroy it
 622                  * without making a bioops call.  If the kernel (B_LOCKED)
 623                  * or we (reclaim) requested that the buffer be destroyed
 624                  * we destroy it, otherwise we do a quick get/release to
 625                  * reset its position in the kernel's LRU list.
 626                  *
 627                  * Leaving the buffer passively associated allows us to
 628                  * use the kernel's LRU buffer flushing mechanisms rather
 629                  * then rolling our own.
 630                  *
 631                  * XXX there are two ways of doing this.  We can re-acquire
 632                  * and passively release to reset the LRU, or not.
 633                  */
 634                 if (io->running == 0) {
 635                         regetblk(bp);
 636                         if ((bp->b_flags & B_LOCKED) || io->reclaim) {
 637                                 hammer_io_disassociate(iou);
 638                                 /* return the bp */
 639                         } else {
 640                                 /* return the bp (bp passively associated) */
 641                         }
 642                 } else {
 643                         /*
 644                          * bp is left passively associated but we do not
 645                          * try to reacquire it.  Interactions with the io
 646                          * structure will occur on completion of the bp's
 647                          * I/O.
 648                          */
 649                         bp = NULL;
 650                 }
 651         }
 652         return(bp);
 653 }
 654
 655 /*
 656  * This routine is called with a locked IO when a flush is desired and
 657  * no other references to the structure exists other then ours.  This
 658  * routine is ONLY called when HAMMER believes it is safe to flush a
 659  * potentially modified buffer out.
 660  *
 661  * The locked io or io reference prevents a flush from being initiated
 662  * by the kernel.
 663  */
 664 void
 665 hammer_io_flush(struct hammer_io *io, int reclaim)
 666 {
 667         struct buf *bp;
 668         hammer_mount_t hmp;
 669
 670         /*
 671          * Degenerate case - nothing to flush if nothing is dirty.
 672          */
 673         if (io->modified == 0)
 674                 return;
 675
 676         KKASSERT(io->bp);
 677         KKASSERT(io->modify_refs <= 0);
 678
 679         /*
 680          * Acquire ownership of the bp, particularly before we clear our
 681          * modified flag.
 682          *
 683          * We are going to bawrite() this bp.  Don't leave a window where
 684          * io->released is set, we actually own the bp rather then our
 685          * buffer.
 686          *
 687          * The io_token should not be required here as only
 688          */
 689         hmp = io->hmp;
 690         bp = io->bp;
 691         if (io->released) {
 692                 regetblk(bp);
 693                 /* BUF_KERNPROC(io->bp); */
 694                 /* io->released = 0; */
 695                 KKASSERT(io->released);
 696                 KKASSERT(io->bp == bp);
 697         } else {
 698                 io->released = 1;
 699         }
 700
 701         if (reclaim) {
 702                 io->reclaim = 1;
 703                 if ((bp->b_flags & B_LOCKED) == 0) {
 704                         bp->b_flags |= B_LOCKED;
 705                         atomic_add_int(&hammer_count_io_locked, 1);
 706                 }
 707         }
 708
 709         /*
 710          * Acquire exclusive access to the bp and then clear the modified
 711          * state of the buffer prior to issuing I/O to interlock any
 712          * modifications made while the I/O is in progress.  This shouldn't
 713          * happen anyway but losing data would be worse.  The modified bit
 714          * will be rechecked after the IO completes.
 715          *
 716          * NOTE: This call also finalizes the buffer's content (inval == 0).
 717          *
 718          * This is only legal when lock.refs == 1 (otherwise we might clear
 719          * the modified bit while there are still users of the cluster
 720          * modifying the data).
 721          *
 722          * Do this before potentially blocking so any attempt to modify the
 723          * ondisk while we are blocked blocks waiting for us.
 724          */
 725         hammer_ref(&io->lock);
 726         hammer_io_clear_modify(io, 0);
 727         hammer_rel(&io->lock);
 728
 729         if (hammer_debug_io & 0x0002)
 730                 kprintf("hammer io_write %016jx\n", bp->b_bio1.bio_offset);
 731
 732         /*
 733          * Transfer ownership to the kernel and initiate I/O.
 734          *
 735          * NOTE: We do not hold io_token so an atomic op is required to
 736          *       update io_running_space.
 737          */
 738         io->running = 1;
 739         atomic_add_long(&hmp->io_running_space, io->bytes);
 740         atomic_add_long(&hammer_count_io_running_write, io->bytes);
 741         lwkt_gettoken(&hmp->io_token);
 742         TAILQ_INSERT_TAIL(&hmp->iorun_list, io, iorun_entry);
 743         lwkt_reltoken(&hmp->io_token);
 744         cluster_awrite(bp);
 745         hammer_io_flush_mark(io->volume);
 746 }
 747
 748 /************************************************************************
 749  *                              BUFFER DIRTYING                         *
 750  ************************************************************************
 751  *
 752  * These routines deal with dependencies created when IO buffers get
 753  * modified.  The caller must call hammer_modify_*() on a referenced
 754  * HAMMER structure prior to modifying its on-disk data.
 755  *
 756  * Any intent to modify an IO buffer acquires the related bp and imposes
 757  * various write ordering dependencies.
 758  */
 759
/*
 * Mark a HAMMER structure as undergoing modification.  Meta-data buffers
 * are locked until the flusher can deal with them, pure data buffers
 * can be written out.
 *
 * 'count' is added to io->modify_refs.  hammer_io_modify_done() drops
 * one such reference per call.  If a write interlock is in effect
 * (modify_refs < 0, see hammer_io_write_interlock()) we sleep until it
 * has been released.
 *
 * The referenced io prevents races.
 */
static
void
hammer_io_modify(hammer_io_t io, int count)
{
        /*
         * io->modify_refs must be >= 0.  A negative value means a write
         * interlock is held, so flag ourselves as a waiter and sleep on
         * the io until whoever holds it issues a wakeup.
         */
        while (io->modify_refs < 0) {
                io->waitmod = 1;
                tsleep(io, 0, "hmrmod", 0);
        }

        /*
         * The io's lock must be active and a bp must be associated.
         *
         * Shortcut if nothing to do: the io is already flagged modified
         * and its bp has not been released.
         */
        KKASSERT(hammer_isactive(&io->lock) && io->bp != NULL);
        io->modify_refs += count;
        if (io->modified && io->released == 0)
                return;

        /*
         * NOTE: It is important not to set the modified bit
         *       until after we have acquired the bp or we risk
         *       racing against checkwrite.
         */
        hammer_lock_ex(&io->lock);
        if (io->released) {
                /* re-acquire the bp before touching the modified state */
                regetblk(io->bp);
                BUF_KERNPROC(io->bp);
                io->released = 0;
        }
        if (io->modified == 0) {
                /* first modification: place the io on its mod-list */
                hammer_io_set_modlist(io);
                io->modified = 1;
        }
        hammer_unlock(&io->lock);
}
 804
 805 static __inline
 806 void
 807 hammer_io_modify_done(hammer_io_t io)
 808 {
 809         KKASSERT(io->modify_refs > 0);
 810         --io->modify_refs;
 811         if (io->modify_refs == 0 && io->waitmod) {
 812                 io->waitmod = 0;
 813                 wakeup(io);
 814         }
 815 }
 816
 817 /*
 818  * The write interlock blocks other threads trying to modify a buffer
 819  * (they block in hammer_io_modify()) after us, or blocks us while other
 820  * threads are in the middle of modifying a buffer.
 821  *
 822  * The caller also has a ref on the io, however if we are not careful
 823  * we will race bioops callbacks (checkwrite).  To deal with this
 824  * we must at least acquire and release the io_token, and it is probably
 825  * better to hold it through the setting of modify_refs.
 826  */
 827 void
 828 hammer_io_write_interlock(hammer_io_t io)
 829 {
 830         hammer_mount_t hmp = io->hmp;
 831
 832         lwkt_gettoken(&hmp->io_token);
 833         while (io->modify_refs != 0) {
 834                 io->waitmod = 1;
 835                 tsleep(io, 0, "hmrmod", 0);
 836         }
 837         io->modify_refs = -1;
 838         lwkt_reltoken(&hmp->io_token);
 839 }
 840
 841 void
 842 hammer_io_done_interlock(hammer_io_t io)
 843 {
 844         KKASSERT(io->modify_refs == -1);
 845         io->modify_refs = 0;
 846         if (io->waitmod) {
 847                 io->waitmod = 0;
 848                 wakeup(io);
 849         }
 850 }
 851
 852 /*
 853  * Caller intends to modify a volume's ondisk structure.
 854  *
 855  * This is only allowed if we are the flusher or we have a ref on the
 856  * sync_lock.
 857  */
 858 void
 859 hammer_modify_volume(hammer_transaction_t trans, hammer_volume_t volume,
 860                      void *base, int len)
 861 {
 862         KKASSERT (trans == NULL || trans->sync_lock_refs > 0);
 863
 864         hammer_io_modify(&volume->io, 1);
 865         if (len) {
 866                 intptr_t rel_offset = (intptr_t)base - (intptr_t)volume->ondisk;
 867                 KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0);
 868                 hammer_generate_undo(trans,
 869                          HAMMER_ENCODE_RAW_VOLUME(volume->vol_no, rel_offset),
 870                          base, len);
 871         }
 872 }
 873
 874 /*
 875  * Caller intends to modify a buffer's ondisk structure.
 876  *
 877  * This is only allowed if we are the flusher or we have a ref on the
 878  * sync_lock.
 879  */
 880 void
 881 hammer_modify_buffer(hammer_transaction_t trans, hammer_buffer_t buffer,
 882                      void *base, int len)
 883 {
 884         KKASSERT (trans == NULL || trans->sync_lock_refs > 0);
 885
 886         hammer_io_modify(&buffer->io, 1);
 887         if (len) {
 888                 intptr_t rel_offset = (intptr_t)base - (intptr_t)buffer->ondisk;
 889                 KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0);
 890                 hammer_generate_undo(trans,
 891                                      buffer->zone2_offset + rel_offset,
 892                                      base, len);
 893         }
 894 }
 895
 896 void
 897 hammer_modify_volume_done(hammer_volume_t volume)
 898 {
 899         hammer_io_modify_done(&volume->io);
 900 }
 901
 902 void
 903 hammer_modify_buffer_done(hammer_buffer_t buffer)
 904 {
 905         hammer_io_modify_done(&buffer->io);
 906 }
 907
 908 /*
 909  * Mark an entity as not being dirty any more and finalize any
 910  * delayed adjustments to the buffer.
 911  *
 912  * Delayed adjustments are an important performance enhancement, allowing
 913  * us to avoid recalculating B-Tree node CRCs over and over again when
 914  * making bulk-modifications to the B-Tree.
 915  *
 916  * If inval is non-zero delayed adjustments are ignored.
 917  *
 918  * This routine may dereference related btree nodes and cause the
 919  * buffer to be dereferenced.  The caller must own a reference on io.
 920  */
 921 void
 922 hammer_io_clear_modify(struct hammer_io *io, int inval)
 923 {
 924         hammer_mount_t hmp;
 925
 926         /*
 927          * io_token is needed to avoid races on mod_root
 928          */
 929         if (io->modified == 0)
 930                 return;
 931         hmp = io->hmp;
 932         lwkt_gettoken(&hmp->io_token);
 933         if (io->modified == 0) {
 934                 lwkt_reltoken(&hmp->io_token);
 935                 return;
 936         }
 937
 938         /*
 939          * Take us off the mod-list and clear the modified bit.
 940          */
 941         KKASSERT(io->mod_root != NULL);
 942         if (io->mod_root == &io->hmp->volu_root ||
 943             io->mod_root == &io->hmp->meta_root) {
 944                 io->hmp->locked_dirty_space -= io->bytes;
 945                 atomic_add_long(&hammer_count_dirtybufspace, -io->bytes);
 946         }
 947         RB_REMOVE(hammer_mod_rb_tree, io->mod_root, io);
 948         io->mod_root = NULL;
 949         io->modified = 0;
 950
 951         lwkt_reltoken(&hmp->io_token);
 952
 953         /*
 954          * If this bit is not set there are no delayed adjustments.
 955          */
 956         if (io->gencrc == 0)
 957                 return;
 958         io->gencrc = 0;
 959
 960         /*
 961          * Finalize requested CRCs.  The NEEDSCRC flag also holds a reference
 962          * on the node (& underlying buffer).  Release the node after clearing
 963          * the flag.
 964          */
 965         if (io->type == HAMMER_STRUCTURE_META_BUFFER) {
 966                 hammer_buffer_t buffer = (void *)io;
 967                 hammer_node_t node;
 968
 969 restart:
 970                 TAILQ_FOREACH(node, &buffer->clist, entry) {
 971                         if ((node->flags & HAMMER_NODE_NEEDSCRC) == 0)
 972                                 continue;
 973                         node->flags &= ~HAMMER_NODE_NEEDSCRC;
 974                         KKASSERT(node->ondisk);
 975                         if (inval == 0)
 976                                 node->ondisk->crc = crc32(&node->ondisk->crc + 1, HAMMER_BTREE_CRCSIZE);
 977                         hammer_rel_node(node);
 978                         goto restart;
 979                 }
 980         }
 981         /* caller must still have ref on io */
 982         KKASSERT(hammer_isactive(&io->lock));
 983 }
 984
 985 /*
 986  * Clear the IO's modify list.  Even though the IO is no longer modified
 987  * it may still be on the lose_root.  This routine is called just before
 988  * the governing hammer_buffer is destroyed.
 989  *
 990  * mod_root requires io_token protection.
 991  */
 992 void
 993 hammer_io_clear_modlist(struct hammer_io *io)
 994 {
 995         hammer_mount_t hmp = io->hmp;
 996
 997         KKASSERT(io->modified == 0);
 998         if (io->mod_root) {
 999                 lwkt_gettoken(&hmp->io_token);
1000                 if (io->mod_root) {
1001                         KKASSERT(io->mod_root == &io->hmp->lose_root);
1002                         RB_REMOVE(hammer_mod_rb_tree, io->mod_root, io);
1003                         io->mod_root = NULL;
1004                 }
1005                 lwkt_reltoken(&hmp->io_token);
1006         }
1007 }
1008
1009 static void
1010 hammer_io_set_modlist(struct hammer_io *io)
1011 {
1012         struct hammer_mount *hmp = io->hmp;
1013
1014         lwkt_gettoken(&hmp->io_token);
1015         KKASSERT(io->mod_root == NULL);
1016
1017         switch(io->type) {
1018         case HAMMER_STRUCTURE_VOLUME:
1019                 io->mod_root = &hmp->volu_root;
1020                 hmp->locked_dirty_space += io->bytes;
1021                 atomic_add_long(&hammer_count_dirtybufspace, io->bytes);
1022                 break;
1023         case HAMMER_STRUCTURE_META_BUFFER:
1024                 io->mod_root = &hmp->meta_root;
1025                 hmp->locked_dirty_space += io->bytes;
1026                 atomic_add_long(&hammer_count_dirtybufspace, io->bytes);
1027                 break;
1028         case HAMMER_STRUCTURE_UNDO_BUFFER:
1029                 io->mod_root = &hmp->undo_root;
1030                 break;
1031         case HAMMER_STRUCTURE_DATA_BUFFER:
1032                 io->mod_root = &hmp->data_root;
1033                 break;
1034         case HAMMER_STRUCTURE_DUMMY:
1035                 panic("hammer_io_set_modlist: bad io type");
1036                 break; /* NOT REACHED */
1037         }
1038         if (RB_INSERT(hammer_mod_rb_tree, io->mod_root, io)) {
1039                 panic("hammer_io_set_modlist: duplicate entry");
1040                 /* NOT REACHED */
1041         }
1042         lwkt_reltoken(&hmp->io_token);
1043 }
1044
1045 /************************************************************************
1046  *                              HAMMER_BIOOPS                           *
1047  ************************************************************************
1048  *
1049  */
1050
/*
 * Pre-IO initiation kernel callback - cluster build only
 *
 * HAMMER has no work to do at this point, so this is an empty stub
 * installed as hammer_bioops.io_start.
 *
 * bioops callback - hold io_token
 */
static void
hammer_io_start(struct buf *bp)
{
        /* nothing to do, so io_token not needed */
}
1061
/*
 * Post-IO completion kernel callback - MAY BE CALLED FROM INTERRUPT!
 *
 * The hammer_io is located via the first dependency on bp->b_dep.  If
 * io.running is set this is the completion of a write: write errors
 * are handled, the running-write accounting is undone and the io is
 * removed from the mount's iorun_list.  Otherwise the completion is
 * counted as a read.  Waiters are then woken up and a deferred
 * deallocation (B_LOCKED) is retried.
 *
 * NOTE: HAMMER may modify a data buffer after we have initiated write
 *       I/O.
 *
 * NOTE: MPSAFE callback
 *
 * bioops callback - hold io_token
 */
static void
hammer_io_complete(struct buf *bp)
{
        union hammer_io_structure *iou = (void *)LIST_FIRST(&bp->b_dep);
        struct hammer_mount *hmp = iou->io.hmp;
        struct hammer_io *ionext;

        lwkt_gettoken(&hmp->io_token);

        KKASSERT(iou->io.released == 1);

        /*
         * Deal with people waiting for I/O to drain
         */
        if (iou->io.running) {
                /*
                 * Deal with critical write errors.  Once a critical error
                 * has been flagged in hmp the UNDO FIFO will not be updated.
                 * That way crash recover will give us a consistent
                 * filesystem.
                 *
                 * Because of this we can throw away failed UNDO buffers.  If
                 * we throw away META or DATA buffers we risk corrupting
                 * the now read-only version of the filesystem visible to
                 * the user.  Clear B_ERROR so the buffer is not re-dirtied
                 * by the kernel and ref the io so it doesn't get thrown
                 * away.
                 */
                if (bp->b_flags & B_ERROR) {
                        /* the error is reported with fs_token held */
                        lwkt_gettoken(&hmp->fs_token);
                        hammer_critical_error(hmp, NULL, bp->b_error,
                                              "while flushing meta-data");
                        lwkt_reltoken(&hmp->fs_token);

                        switch(iou->io.type) {
                        case HAMMER_STRUCTURE_UNDO_BUFFER:
                                break;
                        default:
                                /*
                                 * The extra ref is only taken the first
                                 * time an error is recorded on this io.
                                 */
                                if (iou->io.ioerror == 0) {
                                        iou->io.ioerror = 1;
                                        hammer_ref(&iou->io.lock);
                                }
                                break;
                        }
                        bp->b_flags &= ~B_ERROR;
                        bundirty(bp);
#if 0
                        hammer_io_set_modlist(&iou->io);
                        iou->io.modified = 1;
#endif
                }

                /*
                 * Undo the running-write accounting that was added when
                 * the write was initiated.
                 */
                hammer_stats_disk_write += iou->io.bytes;
                atomic_add_long(&hammer_count_io_running_write, -iou->io.bytes);
                atomic_add_long(&hmp->io_running_space, -iou->io.bytes);
                KKASSERT(hmp->io_running_space >= 0);
                iou->io.running = 0;

                /*
                 * Remove from iorun list and wakeup any multi-io waiter(s).
                 *
                 * A waiter is represented by a DUMMY entry on the list; it
                 * is only woken when we are at the head of the list and
                 * the DUMMY entry immediately follows us.
                 */
                if (TAILQ_FIRST(&hmp->iorun_list) == &iou->io) {
                        ionext = TAILQ_NEXT(&iou->io, iorun_entry);
                        if (ionext && ionext->type == HAMMER_STRUCTURE_DUMMY)
                                wakeup(ionext);
                }
                TAILQ_REMOVE(&hmp->iorun_list, &iou->io, iorun_entry);
        } else {
                hammer_stats_disk_read += iou->io.bytes;
        }

        /*
         * Wake up anyone who flagged io.waiting while sleeping on the io.
         */
        if (iou->io.waiting) {
                iou->io.waiting = 0;
                wakeup(iou);
        }

        /*
         * If B_LOCKED is set someone wanted to deallocate the bp at some
         * point, try to do it now.  The operation will fail if there are
         * refs or if hammer_io_deallocate() is unable to gain the
         * interlock.
         *
         * B_LOCKED is cleared (and its counter dropped) first because
         * hammer_io_deallocate() asserts that the flag is not set and
         * sets it again itself if the deallocation cannot proceed.
         */
        if (bp->b_flags & B_LOCKED) {
                atomic_add_int(&hammer_count_io_locked, -1);
                bp->b_flags &= ~B_LOCKED;
                hammer_io_deallocate(bp);
                /* structure may be dead now */
        }
        lwkt_reltoken(&hmp->io_token);
}
1161
1162 /*
1163  * Callback from kernel when it wishes to deallocate a passively
1164  * associated structure.  This mostly occurs with clean buffers
1165  * but it may be possible for a holding structure to be marked dirty
1166  * while its buffer is passively associated.  The caller owns the bp.
1167  *
1168  * If we cannot disassociate we set B_LOCKED to prevent the buffer
1169  * from getting reused.
1170  *
1171  * WARNING: Because this can be called directly by getnewbuf we cannot
1172  * recurse into the tree.  If a bp cannot be immediately disassociated
1173  * our only recourse is to set B_LOCKED.
1174  *
1175  * WARNING: This may be called from an interrupt via hammer_io_complete()
1176  *
1177  * bioops callback - hold io_token
1178  */
1179 static void
1180 hammer_io_deallocate(struct buf *bp)
1181 {
1182         hammer_io_structure_t iou = (void *)LIST_FIRST(&bp->b_dep);
1183         hammer_mount_t hmp;
1184
1185         hmp = iou->io.hmp;
1186
1187         lwkt_gettoken(&hmp->io_token);
1188
1189         KKASSERT((bp->b_flags & B_LOCKED) == 0 && iou->io.running == 0);
1190         if (hammer_try_interlock_norefs(&iou->io.lock) == 0) {
1191                 /*
1192                  * We cannot safely disassociate a bp from a referenced
1193                  * or interlocked HAMMER structure.
1194                  */
1195                 bp->b_flags |= B_LOCKED;
1196                 atomic_add_int(&hammer_count_io_locked, 1);
1197         } else if (iou->io.modified) {
1198                 /*
1199                  * It is not legal to disassociate a modified buffer.  This
1200                  * case really shouldn't ever occur.
1201                  */
1202                 bp->b_flags |= B_LOCKED;
1203                 atomic_add_int(&hammer_count_io_locked, 1);
1204                 hammer_put_interlock(&iou->io.lock, 0);
1205         } else {
1206                 /*
1207                  * Disassociate the BP.  If the io has no refs left we
1208                  * have to add it to the loose list.  The kernel has
1209                  * locked the buffer and therefore our io must be
1210                  * in a released state.
1211                  */
1212                 hammer_io_disassociate(iou);
1213                 if (iou->io.type != HAMMER_STRUCTURE_VOLUME) {
1214                         KKASSERT(iou->io.bp == NULL);
1215                         KKASSERT(iou->io.mod_root == NULL);
1216                         iou->io.mod_root = &hmp->lose_root;
1217                         if (RB_INSERT(hammer_mod_rb_tree, iou->io.mod_root,
1218                                       &iou->io)) {
1219                                 panic("hammer_io_deallocate: duplicate entry");
1220                         }
1221                 }
1222                 hammer_put_interlock(&iou->io.lock, 1);
1223         }
1224         lwkt_reltoken(&hmp->io_token);
1225 }
1226
/*
 * fsync hook of the bio_ops vector.  HAMMER has nothing to do here and
 * always reports success (0).
 *
 * bioops callback - hold io_token
 */
static int
hammer_io_fsync(struct vnode *vp)
{
        /* nothing to do, so io_token not needed */
        (void)vp;
        return 0;
}
1236
/*
 * sync hook of the bio_ops vector; always reports success (0).
 *
 * NOTE: will not be called unless we tell the kernel about the
 * bioops.  Unused... we use the mount's VFS_SYNC instead.
 *
 * bioops callback - hold io_token
 */
static int
hammer_io_sync(struct mount *mp)
{
        /* nothing to do, so io_token not needed */
        (void)mp;
        return 0;
}
1249
/*
 * movedeps hook of the bio_ops vector.  HAMMER has nothing to do here,
 * so this is an empty stub.
 *
 * bioops callback - hold io_token
 */
static void
hammer_io_movedeps(struct buf *bp1, struct buf *bp2)
{
        /* nothing to do, so io_token not needed */
}
1258
/*
 * I/O pre-check for reading and writing.  HAMMER only uses this for
 * B_CACHE buffers so checkread just shouldn't happen, but if it does
 * allow it (always returns 0).
 *
 * Writing is a different case.  We don't want the kernel to try to write
 * out a buffer that HAMMER may be modifying passively or which has a
 * dependency.  In addition, kernel-demanded writes can only proceed for
 * certain types of buffers (i.e. UNDO and DATA types).  Other dirty
 * buffer types can only be explicitly written by the flusher.
 *
 * checkwrite will only be called for bdwrite()n buffers.  If we return
 * success the kernel is guaranteed to initiate the buffer write.
 *
 * bioops callback - hold io_token
 */
static int
hammer_io_checkread(struct buf *bp)
{
        /* nothing to do, so io_token not needed */
        (void)bp;
        return 0;
}
1281
1282 /*
1283  * The kernel is asking us whether it can write out a dirty buffer or not.
1284  *
1285  * bioops callback - hold io_token
1286  */
1287 static int
1288 hammer_io_checkwrite(struct buf *bp)
1289 {
1290         hammer_io_t io = (void *)LIST_FIRST(&bp->b_dep);
1291         hammer_mount_t hmp = io->hmp;
1292
1293         /*
1294          * This shouldn't happen under normal operation.
1295          */
1296         lwkt_gettoken(&hmp->io_token);
1297         if (io->type == HAMMER_STRUCTURE_VOLUME ||
1298             io->type == HAMMER_STRUCTURE_META_BUFFER) {
1299                 if (!panicstr)
1300                         panic("hammer_io_checkwrite: illegal buffer");
1301                 if ((bp->b_flags & B_LOCKED) == 0) {
1302                         bp->b_flags |= B_LOCKED;
1303                         atomic_add_int(&hammer_count_io_locked, 1);
1304                 }
1305                 lwkt_reltoken(&hmp->io_token);
1306                 return(1);
1307         }
1308
1309         /*
1310          * We have to be able to interlock the IO to safely modify any
1311          * of its fields without holding the fs_token.  If we can't lock
1312          * it then we are racing someone.
1313          *
1314          * Our ownership of the bp lock prevents the io from being ripped
1315          * out from under us.
1316          */
1317         if (hammer_try_interlock_norefs(&io->lock) == 0) {
1318                 bp->b_flags |= B_LOCKED;
1319                 atomic_add_int(&hammer_count_io_locked, 1);
1320                 lwkt_reltoken(&hmp->io_token);
1321                 return(1);
1322         }
1323
1324         /*
1325          * The modified bit must be cleared prior to the initiation of
1326          * any IO (returning 0 initiates the IO).  Because this is a
1327          * normal data buffer hammer_io_clear_modify() runs through a
1328          * simple degenerate case.
1329          *
1330          * Return 0 will cause the kernel to initiate the IO, and we
1331          * must normally clear the modified bit before we begin.  If
1332          * the io has modify_refs we do not clear the modified bit,
1333          * otherwise we may miss changes.
1334          *
1335          * Only data and undo buffers can reach here.  These buffers do
1336          * not have terminal crc functions but we temporarily reference
1337          * the IO anyway, just in case.
1338          */
1339         if (io->modify_refs == 0 && io->modified) {
1340                 hammer_ref(&io->lock);
1341                 hammer_io_clear_modify(io, 0);
1342                 hammer_rel(&io->lock);
1343         } else if (io->modified) {
1344                 KKASSERT(io->type == HAMMER_STRUCTURE_DATA_BUFFER);
1345         }
1346
1347         /*
1348          * The kernel is going to start the IO, set io->running.
1349          */
1350         KKASSERT(io->running == 0);
1351         io->running = 1;
1352         atomic_add_long(&io->hmp->io_running_space, io->bytes);
1353         atomic_add_long(&hammer_count_io_running_write, io->bytes);
1354         TAILQ_INSERT_TAIL(&io->hmp->iorun_list, io, iorun_entry);
1355
1356         hammer_put_interlock(&io->lock, 1);
1357         lwkt_reltoken(&hmp->io_token);
1358
1359         return(0);
1360 }
1361
/*
 * Return non-zero if we wish to delay the kernel's attempt to flush
 * this buffer to disk.  HAMMER never asks for a delay here, so the
 * result is always 0.
 *
 * bioops callback - hold io_token
 */
static int
hammer_io_countdeps(struct buf *bp, int n)
{
        /* nothing to do, so io_token not needed */
        (void)bp;
        (void)n;
        return 0;
}
1374
/*
 * bio_ops callback vector wiring the hammer_io_*() callbacks defined
 * above into the buffer cache.  Only io_complete, io_deallocate and
 * io_checkwrite do real work; the remaining entries are stubs.
 */
struct bio_ops hammer_bioops = {
        .io_start       = hammer_io_start,
        .io_complete    = hammer_io_complete,
        .io_deallocate  = hammer_io_deallocate,
        .io_fsync       = hammer_io_fsync,
        .io_sync        = hammer_io_sync,
        .io_movedeps    = hammer_io_movedeps,
        .io_countdeps   = hammer_io_countdeps,
        .io_checkread   = hammer_io_checkread,
        .io_checkwrite  = hammer_io_checkwrite,
};
1386
/************************************************************************
 *                              DIRECT IO OPS                           *
 ************************************************************************
 *
 * These functions operate directly on the buffer cache buffer associated
 * with a front-end vnode rather than a back-end device vnode.
 */
1394
1395 /*
1396  * Read a buffer associated with a front-end vnode directly from the
1397  * disk media.  The bio may be issued asynchronously.  If leaf is non-NULL
1398  * we validate the CRC.
1399  *
1400  * We must check for the presence of a HAMMER buffer to handle the case
1401  * where the reblocker has rewritten the data (which it does via the HAMMER
1402  * buffer system, not via the high-level vnode buffer cache), but not yet
1403  * committed the buffer to the media.
1404  */
1405 int
1406 hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio,
1407                       hammer_btree_leaf_elm_t leaf)
1408 {
1409         hammer_off_t buf_offset;
1410         hammer_off_t zone2_offset;
1411         hammer_volume_t volume;
1412         struct buf *bp;
1413         struct bio *nbio;
1414         int vol_no;
1415         int error;
1416
1417         buf_offset = bio->bio_offset;
1418         KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) ==
1419                  HAMMER_ZONE_LARGE_DATA);
1420
1421         /*
1422          * The buffer cache may have an aliased buffer (the reblocker can
1423          * write them).  If it does we have to sync any dirty data before
1424          * we can build our direct-read.  This is a non-critical code path.
1425          */
1426         bp = bio->bio_buf;
1427         hammer_sync_buffers(hmp, buf_offset, bp->b_bufsize);
1428
1429         /*
1430          * Resolve to a zone-2 offset.  The conversion just requires
1431          * munging the top 4 bits but we want to abstract it anyway
1432          * so the blockmap code can verify the zone assignment.
1433          */
1434         zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
1435         if (error)
1436                 goto done;
1437         KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) ==
1438                  HAMMER_ZONE_RAW_BUFFER);
1439
1440         /*
1441          * Resolve volume and raw-offset for 3rd level bio.  The
1442          * offset will be specific to the volume.
1443          */
1444         vol_no = HAMMER_VOL_DECODE(zone2_offset);
1445         volume = hammer_get_volume(hmp, vol_no, &error);
1446         if (error == 0 && zone2_offset >= volume->maxbuf_off)
1447                 error = EIO;
1448
1449         if (error == 0) {
1450                 /*
1451                  * 3rd level bio
1452                  */
1453                 nbio = push_bio(bio);
1454                 nbio->bio_offset = volume->ondisk->vol_buf_beg +
1455                                    (zone2_offset & HAMMER_OFF_SHORT_MASK);
1456                 hammer_stats_disk_read += bp->b_bufsize;
1457                 vn_strategy(volume->devvp, nbio);
1458         }
1459         hammer_rel_volume(volume, 0);
1460 done:
1461         if (error) {
1462                 kprintf("hammer_direct_read: failed @ %016llx\n",
1463                         (long long)zone2_offset);
1464                 bp->b_error = error;
1465                 bp->b_flags |= B_ERROR;
1466                 biodone(bio);
1467         }
1468         return(error);
1469 }
1470
1471 /*
1472  * This works similarly to hammer_io_direct_read() except instead of
1473  * directly reading from the device into the bio we instead indirectly
1474  * read through the device's buffer cache and then copy the data into
1475  * the bio.
1476  *
1477  * If leaf is non-NULL and validation is enabled, the CRC will be checked.
1478  *
1479  * This routine also executes asynchronously.  It allows hammer strategy
1480  * calls to operate asynchronously when in double_buffer mode (in addition
1481  * to operating asynchronously when in normal mode).
1482  */
1483 int
1484 hammer_io_indirect_read(hammer_mount_t hmp, struct bio *bio,
1485                         hammer_btree_leaf_elm_t leaf)
1486 {
1487         hammer_off_t buf_offset;
1488         hammer_off_t zone2_offset;
1489         hammer_volume_t volume;
1490         struct buf *bp;
1491         int vol_no;
1492         int error;
1493
1494         buf_offset = bio->bio_offset;
1495         KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) ==
1496                  HAMMER_ZONE_LARGE_DATA);
1497
1498         /*
1499          * The buffer cache may have an aliased buffer (the reblocker can
1500          * write them).  If it does we have to sync any dirty data before
1501          * we can build our direct-read.  This is a non-critical code path.
1502          */
1503         bp = bio->bio_buf;
1504         hammer_sync_buffers(hmp, buf_offset, bp->b_bufsize);
1505
1506         /*
1507          * Resolve to a zone-2 offset.  The conversion just requires
1508          * munging the top 4 bits but we want to abstract it anyway
1509          * so the blockmap code can verify the zone assignment.
1510          */
1511         zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
1512         if (error)
1513                 goto done;
1514         KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) ==
1515                  HAMMER_ZONE_RAW_BUFFER);
1516
1517         /*
1518          * Resolve volume and raw-offset for 3rd level bio.  The
1519          * offset will be specific to the volume.
1520          */
1521         vol_no = HAMMER_VOL_DECODE(zone2_offset);
1522         volume = hammer_get_volume(hmp, vol_no, &error);
1523         if (error == 0 && zone2_offset >= volume->maxbuf_off)
1524                 error = EIO;
1525
1526         if (error == 0) {
1527                 /*
1528                  * Convert to the raw volume->devvp offset and acquire
1529                  * the buf, issuing async I/O if necessary.
1530                  */
1531                 buf_offset = volume->ondisk->vol_buf_beg +
1532                              (zone2_offset & HAMMER_OFF_SHORT_MASK);
1533
1534                 if (leaf && hammer_verify_data) {
1535                         bio->bio_caller_info1.uvalue32 = leaf->data_crc;
1536                         bio->bio_caller_info2.index = 1;
1537                 } else {
1538                         bio->bio_caller_info2.index = 0;
1539                 }
1540                 breadcb(volume->devvp, buf_offset, bp->b_bufsize,
1541                         hammer_indirect_callback, bio);
1542         }
1543         hammer_rel_volume(volume, 0);
1544 done:
1545         if (error) {
1546                 kprintf("hammer_direct_read: failed @ %016llx\n",
1547                         (long long)zone2_offset);
1548                 bp->b_error = error;
1549                 bp->b_flags |= B_ERROR;
1550                 biodone(bio);
1551         }
1552         return(error);
1553 }
1554
1555 /*
1556  * Indirect callback on completion.  bio/bp specify the device-backed
1557  * buffer.  bio->bio_caller_info1.ptr holds obio.
1558  *
1559  * obio/obp is the original regular file buffer.  obio->bio_caller_info*
1560  * contains the crc specification.
1561  *
1562  * We are responsible for calling bpdone() and bqrelse() on bio/bp, and
1563  * for calling biodone() on obio.
1564  */
1565 static void
1566 hammer_indirect_callback(struct bio *bio)
1567 {
1568         struct buf *bp = bio->bio_buf;
1569         struct buf *obp;
1570         struct bio *obio;
1571
1572         /*
1573          * If BIO_DONE is already set the device buffer was already
1574          * fully valid (B_CACHE).  If it is not set then I/O was issued
1575          * and we have to run I/O completion as the last bio.
1576          *
1577          * Nobody is waiting for our device I/O to complete, we are
1578          * responsible for bqrelse()ing it which means we also have to do
1579          * the equivalent of biowait() and clear BIO_DONE (which breadcb()
1580          * may have set).
1581          *
1582          * Any preexisting device buffer should match the requested size,
1583          * but due to bigblock recycling and other factors there is some
1584          * fragility there, so we assert that the device buffer covers
1585          * the request.
1586          */
1587         if ((bio->bio_flags & BIO_DONE) == 0)
1588                 bpdone(bp, 0);
1589         bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
1590
1591         obio = bio->bio_caller_info1.ptr;
1592         obp = obio->bio_buf;
1593
1594         if (bp->b_flags & B_ERROR) {
1595                 obp->b_flags |= B_ERROR;
1596                 obp->b_error = bp->b_error;
1597         } else if (obio->bio_caller_info2.index &&
1598                    obio->bio_caller_info1.uvalue32 !=
1599                     crc32(bp->b_data, bp->b_bufsize)) {
1600                 obp->b_flags |= B_ERROR;
1601                 obp->b_error = EIO;
1602         } else {
1603                 KKASSERT(bp->b_bufsize >= obp->b_bufsize);
1604                 bcopy(bp->b_data, obp->b_data, obp->b_bufsize);
1605                 obp->b_resid = 0;
1606                 obp->b_flags |= B_AGE;
1607         }
1608         biodone(obio);
1609         bqrelse(bp);
1610 }
1611
1612 /*
1613  * Write a buffer associated with a front-end vnode directly to the
1614  * disk media.  The bio may be issued asynchronously.
1615  *
1616  * The BIO is associated with the specified record and RECG_DIRECT_IO
1617  * is set.  The recorded is added to its object.
1618  */
1619 int
1620 hammer_io_direct_write(hammer_mount_t hmp, struct bio *bio,
1621                        hammer_record_t record)
1622 {
1623         hammer_btree_leaf_elm_t leaf = &record->leaf;
1624         hammer_off_t buf_offset;
1625         hammer_off_t zone2_offset;
1626         hammer_volume_t volume;
1627         hammer_buffer_t buffer;
1628         struct buf *bp;
1629         struct bio *nbio;
1630         char *ptr;
1631         int vol_no;
1632         int error;
1633
1634         buf_offset = leaf->data_offset;
1635
1636         KKASSERT(buf_offset > HAMMER_ZONE_BTREE);
1637         KKASSERT(bio->bio_buf->b_cmd == BUF_CMD_WRITE);
1638
1639         /*
1640          * Issue or execute the I/O.  The new memory record must replace
1641          * the old one before the I/O completes, otherwise a reaquisition of
1642          * the buffer will load the old media data instead of the new.
1643          */
1644         if ((buf_offset & HAMMER_BUFMASK) == 0 &&
1645             leaf->data_len >= HAMMER_BUFSIZE) {
1646                 /*
1647                  * We are using the vnode's bio to write directly to the
1648                  * media, any hammer_buffer at the same zone-X offset will
1649                  * now have stale data.
1650                  */
1651                 zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
1652                 vol_no = HAMMER_VOL_DECODE(zone2_offset);
1653                 volume = hammer_get_volume(hmp, vol_no, &error);
1654
1655                 if (error == 0 && zone2_offset >= volume->maxbuf_off)
1656                         error = EIO;
1657                 if (error == 0) {
1658                         bp = bio->bio_buf;
1659                         KKASSERT((bp->b_bufsize & HAMMER_BUFMASK) == 0);
1660                         /*
1661                         hammer_del_buffers(hmp, buf_offset,
1662                                            zone2_offset, bp->b_bufsize);
1663                         */
1664
1665                         /*
1666                          * Second level bio - cached zone2 offset.
1667                          *
1668                          * (We can put our bio_done function in either the
1669                          *  2nd or 3rd level).
1670                          */
1671                         nbio = push_bio(bio);
1672                         nbio->bio_offset = zone2_offset;
1673                         nbio->bio_done = hammer_io_direct_write_complete;
1674                         nbio->bio_caller_info1.ptr = record;
1675                         record->zone2_offset = zone2_offset;
1676                         record->gflags |= HAMMER_RECG_DIRECT_IO |
1677                                          HAMMER_RECG_DIRECT_INVAL;
1678
1679                         /*
1680                          * Third level bio - raw offset specific to the
1681                          * correct volume.
1682                          */
1683                         zone2_offset &= HAMMER_OFF_SHORT_MASK;
1684                         nbio = push_bio(nbio);
1685                         nbio->bio_offset = volume->ondisk->vol_buf_beg +
1686                                            zone2_offset;
1687                         hammer_stats_disk_write += bp->b_bufsize;
1688                         hammer_ip_replace_bulk(hmp, record);
1689                         vn_strategy(volume->devvp, nbio);
1690                         hammer_io_flush_mark(volume);
1691                 }
1692                 hammer_rel_volume(volume, 0);
1693         } else {
1694                 /*
1695                  * Must fit in a standard HAMMER buffer.  In this case all
1696                  * consumers use the HAMMER buffer system and RECG_DIRECT_IO
1697                  * does not need to be set-up.
1698                  */
1699                 KKASSERT(((buf_offset ^ (buf_offset + leaf->data_len - 1)) & ~HAMMER_BUFMASK64) == 0);
1700                 buffer = NULL;
1701                 ptr = hammer_bread(hmp, buf_offset, &error, &buffer);
1702                 if (error == 0) {
1703                         bp = bio->bio_buf;
1704                         bp->b_flags |= B_AGE;
1705                         hammer_io_modify(&buffer->io, 1);
1706                         bcopy(bp->b_data, ptr, leaf->data_len);
1707                         hammer_io_modify_done(&buffer->io);
1708                         hammer_rel_buffer(buffer, 0);
1709                         bp->b_resid = 0;
1710                         hammer_ip_replace_bulk(hmp, record);
1711                         biodone(bio);
1712                 }
1713         }
1714         if (error) {
1715                 /*
1716                  * Major suckage occured.  Also note:  The record was
1717                  * never added to the tree so we do not have to worry
1718                  * about the backend.
1719                  */
1720                 kprintf("hammer_direct_write: failed @ %016llx\n",
1721                         (long long)leaf->data_offset);
1722                 bp = bio->bio_buf;
1723                 bp->b_resid = 0;
1724                 bp->b_error = EIO;
1725                 bp->b_flags |= B_ERROR;
1726                 biodone(bio);
1727                 record->flags |= HAMMER_RECF_DELETED_FE;
1728                 hammer_rel_mem_record(record);
1729         }
1730         return(error);
1731 }
1732
1733 /*
1734  * On completion of the BIO this callback must disconnect
1735  * it from the hammer_record and chain to the previous bio.
1736  *
1737  * An I/O error forces the mount to read-only.  Data buffers
1738  * are not B_LOCKED like meta-data buffers are, so we have to
1739  * throw the buffer away to prevent the kernel from retrying.
1740  *
1741  * NOTE: MPSAFE callback, only modify fields we have explicit
1742  *       access to (the bp and the record->gflags).
1743  */
1744 static
1745 void
1746 hammer_io_direct_write_complete(struct bio *nbio)
1747 {
1748         struct bio *obio;
1749         struct buf *bp;
1750         hammer_record_t record;
1751         hammer_mount_t hmp;
1752
1753         record = nbio->bio_caller_info1.ptr;
1754         KKASSERT(record != NULL);
1755         hmp = record->ip->hmp;
1756
1757         lwkt_gettoken(&hmp->io_token);
1758
1759         bp = nbio->bio_buf;
1760         obio = pop_bio(nbio);
1761         if (bp->b_flags & B_ERROR) {
1762                 lwkt_gettoken(&hmp->fs_token);
1763                 hammer_critical_error(hmp, record->ip,
1764                                       bp->b_error,
1765                                       "while writing bulk data");
1766                 lwkt_reltoken(&hmp->fs_token);
1767                 bp->b_flags |= B_INVAL;
1768         }
1769         biodone(obio);
1770
1771         KKASSERT(record->gflags & HAMMER_RECG_DIRECT_IO);
1772         if (record->gflags & HAMMER_RECG_DIRECT_WAIT) {
1773                 record->gflags &= ~(HAMMER_RECG_DIRECT_IO |
1774                                     HAMMER_RECG_DIRECT_WAIT);
1775                 /* record can disappear once DIRECT_IO flag is cleared */
1776                 wakeup(&record->flags);
1777         } else {
1778                 record->gflags &= ~HAMMER_RECG_DIRECT_IO;
1779                 /* record can disappear once DIRECT_IO flag is cleared */
1780         }
1781         lwkt_reltoken(&hmp->io_token);
1782 }
1783
1784
1785 /*
1786  * This is called before a record is either committed to the B-Tree
1787  * or destroyed, to resolve any associated direct-IO.
1788  *
1789  * (1) We must wait for any direct-IO related to the record to complete.
1790  *
1791  * (2) We must remove any buffer cache aliases for data accessed via
1792  *     leaf->data_offset or zone2_offset so non-direct-IO consumers
1793  *     (the mirroring and reblocking code) do not see stale data.
1794  */
1795 void
1796 hammer_io_direct_wait(hammer_record_t record)
1797 {
1798         hammer_mount_t hmp = record->ip->hmp;
1799
1800         /*
1801          * Wait for I/O to complete
1802          */
1803         if (record->gflags & HAMMER_RECG_DIRECT_IO) {
1804                 lwkt_gettoken(&hmp->io_token);
1805                 while (record->gflags & HAMMER_RECG_DIRECT_IO) {
1806                         record->gflags |= HAMMER_RECG_DIRECT_WAIT;
1807                         tsleep(&record->flags, 0, "hmdiow", 0);
1808                 }
1809                 lwkt_reltoken(&hmp->io_token);
1810         }
1811
1812         /*
1813          * Invalidate any related buffer cache aliases associated with the
1814          * backing device.  This is needed because the buffer cache buffer
1815          * for file data is associated with the file vnode, not the backing
1816          * device vnode.
1817          *
1818          * XXX I do not think this case can occur any more now that
1819          * reservations ensure that all such buffers are removed before
1820          * an area can be reused.
1821          */
1822         if (record->gflags & HAMMER_RECG_DIRECT_INVAL) {
1823                 KKASSERT(record->leaf.data_offset);
1824                 hammer_del_buffers(hmp, record->leaf.data_offset,
1825                                    record->zone2_offset, record->leaf.data_len,
1826                                    1);
1827                 record->gflags &= ~HAMMER_RECG_DIRECT_INVAL;
1828         }
1829 }
1830
1831 /*
1832  * This is called to remove the second-level cached zone-2 offset from
1833  * frontend buffer cache buffers, now stale due to a data relocation.
1834  * These offsets are generated by cluster_read() via VOP_BMAP, or directly
1835  * by hammer_vop_strategy_read().
1836  *
1837  * This is rather nasty because here we have something like the reblocker
1838  * scanning the raw B-Tree with no held references on anything, really,
1839  * other then a shared lock on the B-Tree node, and we have to access the
1840  * frontend's buffer cache to check for and clean out the association.
1841  * Specifically, if the reblocker is moving data on the disk, these cached
1842  * offsets will become invalid.
1843  *
1844  * Only data record types associated with the large-data zone are subject
1845  * to direct-io and need to be checked.
1846  *
1847  */
1848 void
1849 hammer_io_direct_uncache(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf)
1850 {
1851         struct hammer_inode_info iinfo;
1852         int zone;
1853
1854         if (leaf->base.rec_type != HAMMER_RECTYPE_DATA)
1855                 return;
1856         zone = HAMMER_ZONE_DECODE(leaf->data_offset);
1857         if (zone != HAMMER_ZONE_LARGE_DATA_INDEX)
1858                 return;
1859         iinfo.obj_id = leaf->base.obj_id;
1860         iinfo.obj_asof = 0;     /* unused */
1861         iinfo.obj_localization = leaf->base.localization &
1862                                  HAMMER_LOCALIZE_PSEUDOFS_MASK;
1863         iinfo.u.leaf = leaf;
1864         hammer_scan_inode_snapshots(hmp, &iinfo,
1865                                     hammer_io_direct_uncache_callback,
1866                                     leaf);
1867 }
1868
1869 static int
1870 hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data)
1871 {
1872         hammer_inode_info_t iinfo = data;
1873         hammer_off_t file_offset;
1874         struct vnode *vp;
1875         struct buf *bp;
1876         int blksize;
1877
1878         if (ip->vp == NULL)
1879                 return(0);
1880         file_offset = iinfo->u.leaf->base.key - iinfo->u.leaf->data_len;
1881         blksize = iinfo->u.leaf->data_len;
1882         KKASSERT((blksize & HAMMER_BUFMASK) == 0);
1883
1884         /*
1885          * Warning: FINDBLK_TEST return stable storage but not stable
1886          *          contents.  It happens to be ok in this case.
1887          */
1888         hammer_ref(&ip->lock);
1889         if (hammer_get_vnode(ip, &vp) == 0) {
1890                 if ((bp = findblk(ip->vp, file_offset, FINDBLK_TEST)) != NULL &&
1891                     bp->b_bio2.bio_offset != NOOFFSET) {
1892                         bp = getblk(ip->vp, file_offset, blksize, 0, 0);
1893                         bp->b_bio2.bio_offset = NOOFFSET;
1894                         brelse(bp);
1895                 }
1896                 vput(vp);
1897         }
1898         hammer_rel_inode(ip, 0);
1899         return(0);
1900 }
1901
1902
1903 /*
1904  * This function is called when writes may have occured on the volume,
1905  * indicating that the device may be holding cached writes.
1906  */
1907 static void
1908 hammer_io_flush_mark(hammer_volume_t volume)
1909 {
1910         atomic_set_int(&volume->vol_flags, HAMMER_VOLF_NEEDFLUSH);
1911 }
1912
1913 /*
1914  * This function ensures that the device has flushed any cached writes out.
1915  */
1916 void
1917 hammer_io_flush_sync(hammer_mount_t hmp)
1918 {
1919         hammer_volume_t volume;
1920         struct buf *bp_base = NULL;
1921         struct buf *bp;
1922
1923         RB_FOREACH(volume, hammer_vol_rb_tree, &hmp->rb_vols_root) {
1924                 if (volume->vol_flags & HAMMER_VOLF_NEEDFLUSH) {
1925                         atomic_clear_int(&volume->vol_flags,
1926                                          HAMMER_VOLF_NEEDFLUSH);
1927                         bp = getpbuf(NULL);
1928                         bp->b_bio1.bio_offset = 0;
1929                         bp->b_bufsize = 0;
1930                         bp->b_bcount = 0;
1931                         bp->b_cmd = BUF_CMD_FLUSH;
1932                         bp->b_bio1.bio_caller_info1.cluster_head = bp_base;
1933                         bp->b_bio1.bio_done = biodone_sync;
1934                         bp->b_bio1.bio_flags |= BIO_SYNC;
1935                         bp_base = bp;
1936                         vn_strategy(volume->devvp, &bp->b_bio1);
1937                 }
1938         }
1939         while ((bp = bp_base) != NULL) {
1940                 bp_base = bp->b_bio1.bio_caller_info1.cluster_head;
1941                 biowait(&bp->b_bio1, "hmrFLS");
1942                 relpbuf(bp, NULL);
1943         }
1944 }
1945
1946 /*
1947  * Limit the amount of backlog which we allow to build up
1948  */
1949 void
1950 hammer_io_limit_backlog(hammer_mount_t hmp)
1951 {
1952         waitrunningbufspace();
1953 }