| Commit | Line | Data |
|---|---|---|
| 66325755 | 1 | /* |
| b84de5af | 2 | * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. |
| 66325755 MD |
3 | * |
| 4 | * This code is derived from software contributed to The DragonFly Project | |
| 5 | * by Matthew Dillon <dillon@backplane.com> | |
| 6 | * | |
| 7 | * Redistribution and use in source and binary forms, with or without | |
| 8 | * modification, are permitted provided that the following conditions | |
| 9 | * are met: | |
| 10 | * | |
| 11 | * 1. Redistributions of source code must retain the above copyright | |
| 12 | * notice, this list of conditions and the following disclaimer. | |
| 13 | * 2. Redistributions in binary form must reproduce the above copyright | |
| 14 | * notice, this list of conditions and the following disclaimer in | |
| 15 | * the documentation and/or other materials provided with the | |
| 16 | * distribution. | |
| 17 | * 3. Neither the name of The DragonFly Project nor the names of its | |
| 18 | * contributors may be used to endorse or promote products derived | |
| 19 | * from this software without specific, prior written permission. | |
| 20 | * | |
| 21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
| 22 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
| 23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | |
| 24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | |
| 25 | * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | |
| 26 | * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, | |
| 27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
| 28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | |
| 29 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
| 30 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | |
| 31 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 32 | * SUCH DAMAGE. | |
| 66325755 MD |
33 | */ |
| 34 | /* | |
| 35 | * IO Primitives and buffer cache management | |
| 36 | * | |
| 37 | * All major data-tracking structures in HAMMER contain a struct hammer_io | |
| 38 | * which is used to manage their backing store. We use filesystem buffers | |
| 39 | * for backing store and we leave them passively associated with their | |
| 40 | * HAMMER structures. | |
| 41 | * | |
| 9f5097dc | 42 | * If the kernel tries to destroy a passively associated buf which we cannot |
| 66325755 MD |
43 | * yet let go we set B_LOCKED in the buffer and then actively release it |
| 44 | * later when we can. | |
| 77912481 MD |
45 | * |
| 46 | * The io_token is required for anything which might race bioops and bio_done | |
| 47 | * callbacks, with one exception: A successful hammer_try_interlock_norefs(). | |
| 48 | * The fs_token will be held in all other cases. | |
| 66325755 MD |
49 | */ |
| 50 | ||
| 51 | #include "hammer.h" | |
| 52 | #include <sys/fcntl.h> | |
| 53 | #include <sys/nlookup.h> | |
| 54 | #include <sys/buf.h> | |
| 54341a3b | 55 | |
| 66325755 MD |
56 | #include <sys/buf2.h> |
| 57 | ||
| 10a5d1ba | 58 | static void hammer_io_modify(hammer_io_t io, int count); |
| 055f5ff8 | 59 | static void hammer_io_deallocate(struct buf *bp); |
| 9a98f3cc | 60 | static void hammer_indirect_callback(struct bio *bio); |
| 1b0ab2c3 MD |
61 | #if 0 |
| 62 | static void hammer_io_direct_read_complete(struct bio *nbio); | |
| 63 | #endif | |
| 64 | static void hammer_io_direct_write_complete(struct bio *nbio); | |
| 43c665ae | 65 | static int hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data); |
| cdb6e4e6 | 66 | static void hammer_io_set_modlist(struct hammer_io *io); |
| 748efb59 | 67 | static void hammer_io_flush_mark(hammer_volume_t volume); |
| 748efb59 | 68 | |
| 1afb73cf MD |
69 | static int |
| 70 | hammer_mod_rb_compare(hammer_io_t io1, hammer_io_t io2) | |
| 71 | { | |
| 72 | hammer_off_t io1_offset; | |
| 73 | hammer_off_t io2_offset; | |
| 74 | ||
| 75 | io1_offset = ((io1->offset & HAMMER_OFF_SHORT_MASK) << 8) | | |
| 76 | HAMMER_VOL_DECODE(io1->offset); | |
| 77 | io2_offset = ((io2->offset & HAMMER_OFF_SHORT_MASK) << 8) | | |
| 78 | HAMMER_VOL_DECODE(io2->offset); | |
| 79 | ||
| 80 | if (io1_offset < io2_offset) | |
| 81 | return(-1); | |
| 82 | if (io1_offset > io2_offset) | |
| 83 | return(1); | |
| 84 | return(0); | |
| 85 | } | |
| 86 | ||
| 87 | RB_GENERATE(hammer_mod_rb_tree, hammer_io, rb_node, hammer_mod_rb_compare); | |
| 88 | ||
| 055f5ff8 | 89 | /* |
| 10a5d1ba MD |
90 | * Initialize a new, already-zero'd hammer_io structure, or reinitialize |
| 91 | * an existing hammer_io structure which may have switched to another type. | |
| 055f5ff8 MD |
92 | */ |
| 93 | void | |
| 748efb59 | 94 | hammer_io_init(hammer_io_t io, hammer_volume_t volume, enum hammer_io_type type) |
| 055f5ff8 | 95 | { |
| 748efb59 MD |
96 | io->volume = volume; |
| 97 | io->hmp = volume->io.hmp; | |
| 055f5ff8 | 98 | io->type = type; |
| 055f5ff8 MD |
99 | } |
| 100 | ||
| 66325755 | 101 | /* |
| fbc6e32a | 102 | * Helper routine to disassociate a buffer cache buffer from an I/O |
| 77912481 | 103 | * structure. The io must be interlocked and marked appropriately for |
| b0aab9b9 | 104 | * reclamation. |
| 055f5ff8 | 105 | * |
| b0aab9b9 MD |
106 | * The io must be in a released state with the io->bp owned and |
| 107 | * locked by the caller of this function. When not called from an | |
| 108 | * io_deallocate() this cannot race an io_deallocate() since the | |
| 109 | * kernel would be unable to get the buffer lock in that case. | |
| 77912481 MD |
110 | * (The released state in this case means we own the bp, not the |
| 111 | * hammer_io structure). | |
| 112 | * | |
| 113 | * The io may have 0 or 1 references depending on who called us. The | |
| 114 | * caller is responsible for dealing with the refs. | |
| b0aab9b9 | 115 | * |
| 055f5ff8 | 116 | * This call can only be made when no action is required on the buffer. |
| ecca949a | 117 | * |
| 77912481 MD |
118 | * This function is guaranteed not to race against anything because we |
| 119 | * own both the io lock and the bp lock and are interlocked with no | |
| 120 | * references. | |
| 66325755 MD |
121 | */ |
| 122 | static void | |
| ecca949a | 123 | hammer_io_disassociate(hammer_io_structure_t iou) |
| 66325755 | 124 | { |
| 055f5ff8 | 125 | struct buf *bp = iou->io.bp; |
| 66325755 | 126 | |
| ecca949a | 127 | KKASSERT(iou->io.released); |
| b58c6388 | 128 | KKASSERT(iou->io.modified == 0); |
| af209b0f | 129 | KKASSERT(LIST_FIRST(&bp->b_dep) == (void *)iou); |
| 4d75d829 | 130 | buf_dep_init(bp); |
| 055f5ff8 | 131 | iou->io.bp = NULL; |
| 9f5097dc MD |
132 | |
| 133 | /* | |
| 134 | * If the buffer was locked someone wanted to get rid of it. | |
| 135 | */ | |
| a99b9ea2 | 136 | if (bp->b_flags & B_LOCKED) { |
| b0aab9b9 | 137 | atomic_add_int(&hammer_count_io_locked, -1); |
| 9f5097dc | 138 | bp->b_flags &= ~B_LOCKED; |
| a99b9ea2 | 139 | } |
| ecca949a MD |
140 | if (iou->io.reclaim) { |
| 141 | bp->b_flags |= B_NOCACHE|B_RELBUF; | |
| 142 | iou->io.reclaim = 0; | |
| 055f5ff8 | 143 | } |
| 66325755 | 144 | |
| 055f5ff8 | 145 | switch(iou->io.type) { |
| 66325755 | 146 | case HAMMER_STRUCTURE_VOLUME: |
| 055f5ff8 | 147 | iou->volume.ondisk = NULL; |
| 66325755 | 148 | break; |
| 10a5d1ba MD |
149 | case HAMMER_STRUCTURE_DATA_BUFFER: |
| 150 | case HAMMER_STRUCTURE_META_BUFFER: | |
| 151 | case HAMMER_STRUCTURE_UNDO_BUFFER: | |
| 055f5ff8 | 152 | iou->buffer.ondisk = NULL; |
| 66325755 | 153 | break; |
| eddadaee MD |
154 | case HAMMER_STRUCTURE_DUMMY: |
| 155 | panic("hammer_io_disassociate: bad io type"); | |
| 156 | break; | |
| 66325755 | 157 | } |
| fbc6e32a MD |
158 | } |
| 159 | ||
| 160 | /* | |
| 055f5ff8 | 161 | * Wait for any physical IO to complete |
| ae8e83e6 MD |
162 | * |
| 163 | * XXX we aren't interlocked against a spinlock or anything so there | |
| 164 | * is a small window in the interlock / io->running == 0 test. | |
| fbc6e32a | 165 | */ |
| 1b0ab2c3 | 166 | void |
| 055f5ff8 | 167 | hammer_io_wait(hammer_io_t io) |
| fbc6e32a | 168 | { |
| 055f5ff8 | 169 | if (io->running) { |
| b0aab9b9 MD |
170 | hammer_mount_t hmp = io->hmp; |
| 171 | ||
| 172 | lwkt_gettoken(&hmp->io_token); | |
| 173 | while (io->running) { | |
| ae8e83e6 MD |
174 | io->waiting = 1; |
| 175 | tsleep_interlock(io, 0); | |
| b0aab9b9 MD |
176 | if (io->running) |
| 177 | tsleep(io, PINTERLOCKED, "hmrflw", hz); | |
| 055f5ff8 | 178 | } |
| b0aab9b9 | 179 | lwkt_reltoken(&hmp->io_token); |
| 66325755 MD |
180 | } |
| 181 | } | |
| 182 | ||
| af209b0f | 183 | /* |
| eddadaee MD |
184 | * Wait for all currently queued HAMMER-initiated I/Os to complete. |
| 185 | * | |
| 186 | * This is not supposed to count direct I/O's but some can leak | |
| 187 | * through (for non-full-sized direct I/Os). | |
| af209b0f MD |
188 | */ |
| 189 | void | |
| eddadaee | 190 | hammer_io_wait_all(hammer_mount_t hmp, const char *ident, int doflush) |
| af209b0f | 191 | { |
| eddadaee MD |
192 | struct hammer_io iodummy; |
| 193 | hammer_io_t io; | |
| 194 | ||
| 195 | /* | |
| 196 | * Degenerate case, no I/O is running | |
| 197 | */ | |
| b0aab9b9 | 198 | lwkt_gettoken(&hmp->io_token); |
| eddadaee | 199 | if (TAILQ_EMPTY(&hmp->iorun_list)) { |
| b0aab9b9 | 200 | lwkt_reltoken(&hmp->io_token); |
| eddadaee MD |
201 | if (doflush) |
| 202 | hammer_io_flush_sync(hmp); | |
| 203 | return; | |
| 204 | } | |
| 205 | bzero(&iodummy, sizeof(iodummy)); | |
| 206 | iodummy.type = HAMMER_STRUCTURE_DUMMY; | |
| 207 | ||
| 208 | /* | |
| 209 | * Add placemarker and then wait until it becomes the head of | |
| 210 | * the list. | |
| 211 | */ | |
| 212 | TAILQ_INSERT_TAIL(&hmp->iorun_list, &iodummy, iorun_entry); | |
| 213 | while (TAILQ_FIRST(&hmp->iorun_list) != &iodummy) { | |
| 214 | tsleep(&iodummy, 0, ident, 0); | |
| 215 | } | |
| 216 | ||
| 217 | /* | |
| 218 | * Chain in case several placemarkers are present. | |
| 219 | */ | |
| 220 | TAILQ_REMOVE(&hmp->iorun_list, &iodummy, iorun_entry); | |
| 221 | io = TAILQ_FIRST(&hmp->iorun_list); | |
| 222 | if (io && io->type == HAMMER_STRUCTURE_DUMMY) | |
| 223 | wakeup(io); | |
| b0aab9b9 | 224 | lwkt_reltoken(&hmp->io_token); |
| eddadaee MD |
225 | |
| 226 | if (doflush) | |
| 227 | hammer_io_flush_sync(hmp); | |
| af209b0f MD |
228 | } |
| 229 | ||
| 2faf0737 MD |
230 | /* |
| 231 | * Clear a flagged error condition on a I/O buffer. The caller must hold | |
| 232 | * its own ref on the buffer. | |
| 233 | */ | |
| 234 | void | |
| 235 | hammer_io_clear_error(struct hammer_io *io) | |
| 236 | { | |
| 77912481 MD |
237 | hammer_mount_t hmp = io->hmp; |
| 238 | ||
| 239 | lwkt_gettoken(&hmp->io_token); | |
| 2faf0737 MD |
240 | if (io->ioerror) { |
| 241 | io->ioerror = 0; | |
| 250aec18 MD |
242 | hammer_rel(&io->lock); |
| 243 | KKASSERT(hammer_isactive(&io->lock)); | |
| 2faf0737 | 244 | } |
| 77912481 MD |
245 | lwkt_reltoken(&hmp->io_token); |
| 246 | } | |
| 247 | ||
| 248 | void | |
| 249 | hammer_io_clear_error_noassert(struct hammer_io *io) | |
| 250 | { | |
| 251 | hammer_mount_t hmp = io->hmp; | |
| 252 | ||
| 253 | lwkt_gettoken(&hmp->io_token); | |
| 254 | if (io->ioerror) { | |
| 255 | io->ioerror = 0; | |
| 256 | hammer_rel(&io->lock); | |
| 257 | } | |
| 258 | lwkt_reltoken(&hmp->io_token); | |
| 2faf0737 MD |
259 | } |
| 260 | ||
| b8a41159 MD |
261 | /* |
| 262 | * This is an advisory function only which tells the buffer cache | |
| 263 | * the bp is not a meta-data buffer, even though it is backed by | |
| 264 | * a block device. | |
| 265 | * | |
| 266 | * This is used by HAMMER's reblocking code to avoid trying to | |
| 267 | * swapcache the filesystem's data when it is read or written | |
| 268 | * by the reblocking code. | |
| b0aab9b9 MD |
269 | * |
| 270 | * The caller has a ref on the buffer preventing the bp from | |
| 271 | * being disassociated from it. | |
| b8a41159 MD |
272 | */ |
| 273 | void | |
| 274 | hammer_io_notmeta(hammer_buffer_t buffer) | |
| 275 | { | |
| b0aab9b9 MD |
276 | if ((buffer->io.bp->b_flags & B_NOTMETA) == 0) { |
| 277 | hammer_mount_t hmp = buffer->io.hmp; | |
| 278 | ||
| 279 | lwkt_gettoken(&hmp->io_token); | |
| 280 | buffer->io.bp->b_flags |= B_NOTMETA; | |
| 281 | lwkt_reltoken(&hmp->io_token); | |
| 282 | } | |
| b8a41159 MD |
283 | } |
| 284 | ||
| 66325755 | 285 | /* |
| 10a5d1ba MD |
286 | * Load bp for a HAMMER structure. The io must be exclusively locked by |
| 287 | * the caller. | |
| 2f85fa4d | 288 | * |
| a99b9ea2 | 289 | * This routine is mostly used on meta-data and small-data blocks. Generally |
| b7de8aa5 | 290 | * speaking HAMMER assumes some locality of reference and will cluster. |
| af209b0f | 291 | * |
| b7de8aa5 MD |
292 | * Note that the caller (hammer_ondisk.c) may place further restrictions |
| 293 | * on clusterability via the limit (in bytes). Typically large-data | |
| 294 | * zones cannot be clustered due to their mixed buffer sizes. This is | |
| 295 | * not an issue since such clustering occurs in hammer_vnops at the | |
| 296 | * regular file layer, whereas this is the buffered block device layer. | |
| b0aab9b9 MD |
297 | * |
| 298 | * No I/O callbacks can occur while we hold the buffer locked. | |
| 66325755 MD |
299 | */ |
| 300 | int | |
| b7de8aa5 | 301 | hammer_io_read(struct vnode *devvp, struct hammer_io *io, int limit) |
| 66325755 MD |
302 | { |
| 303 | struct buf *bp; | |
| 2f85fa4d | 304 | int error; |
| 66325755 MD |
305 | |
| 306 | if ((bp = io->bp) == NULL) { | |
| 3583bbb4 | 307 | atomic_add_long(&hammer_count_io_running_read, io->bytes); |
| b7de8aa5 MD |
308 | if (hammer_cluster_enable && limit > io->bytes) { |
| 309 | error = cluster_read(devvp, io->offset + limit, | |
| ce0138a6 MD |
310 | io->offset, io->bytes, |
| 311 | HAMMER_CLUSTER_SIZE, | |
| 364c022c MD |
312 | HAMMER_CLUSTER_SIZE, |
| 313 | &io->bp); | |
| ce0138a6 MD |
314 | } else { |
| 315 | error = bread(devvp, io->offset, io->bytes, &io->bp); | |
| 316 | } | |
| 317 | hammer_stats_disk_read += io->bytes; | |
| 3583bbb4 | 318 | atomic_add_long(&hammer_count_io_running_read, -io->bytes); |
| cdb6e4e6 MD |
319 | |
| 320 | /* | |
| 321 | * The code generally assumes b_ops/b_dep has been set-up, | |
| 322 | * even if we error out here. | |
| 323 | */ | |
| 324 | bp = io->bp; | |
| 24c8374a MD |
325 | if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) { |
| 326 | const char *metatype; | |
| 327 | ||
| 328 | switch(io->type) { | |
| 329 | case HAMMER_STRUCTURE_VOLUME: | |
| 330 | metatype = "volume"; | |
| 331 | break; | |
| 332 | case HAMMER_STRUCTURE_META_BUFFER: | |
| 333 | switch(((struct hammer_buffer *)io)-> | |
| 334 | zoneX_offset & HAMMER_OFF_ZONE_MASK) { | |
| 335 | case HAMMER_ZONE_BTREE: | |
| 336 | metatype = "btree"; | |
| 337 | break; | |
| 338 | case HAMMER_ZONE_META: | |
| 339 | metatype = "meta"; | |
| 340 | break; | |
| 341 | case HAMMER_ZONE_FREEMAP: | |
| 342 | metatype = "freemap"; | |
| 343 | break; | |
| 344 | default: | |
| 345 | metatype = "meta?"; | |
| 346 | break; | |
| 347 | } | |
| 348 | break; | |
| 349 | case HAMMER_STRUCTURE_DATA_BUFFER: | |
| 350 | metatype = "data"; | |
| 351 | break; | |
| 352 | case HAMMER_STRUCTURE_UNDO_BUFFER: | |
| 353 | metatype = "undo"; | |
| 354 | break; | |
| 355 | default: | |
| 356 | metatype = "unknown"; | |
| 357 | break; | |
| 358 | } | |
| 359 | kprintf("doff %016jx %s\n", | |
| 360 | (intmax_t)bp->b_bio2.bio_offset, | |
| 361 | metatype); | |
| 362 | } | |
| 363 | bp->b_flags &= ~B_IODEBUG; | |
| cdb6e4e6 MD |
364 | bp->b_ops = &hammer_bioops; |
| 365 | KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); | |
| b0aab9b9 MD |
366 | |
| 367 | /* io->worklist is locked by the io lock */ | |
| cdb6e4e6 MD |
368 | LIST_INSERT_HEAD(&bp->b_dep, &io->worklist, node); |
| 369 | BUF_KERNPROC(bp); | |
| 10a5d1ba MD |
370 | KKASSERT(io->modified == 0); |
| 371 | KKASSERT(io->running == 0); | |
| 372 | KKASSERT(io->waiting == 0); | |
| 66325755 MD |
373 | io->released = 0; /* we hold an active lock on bp */ |
| 374 | } else { | |
| 375 | error = 0; | |
| 376 | } | |
| 377 | return(error); | |
| 378 | } | |
| 379 | ||
| 380 | /* | |
| 381 | * Similar to hammer_io_read() but returns a zero'd out buffer instead. | |
| 10a5d1ba MD |
382 | * Must be called with the IO exclusively locked. |
| 383 | * | |
| 66325755 | 384 | * vfs_bio_clrbuf() is kinda nasty, enforce serialization against background |
| 10a5d1ba MD |
385 | * I/O by forcing the buffer to not be in a released state before calling |
| 386 | * it. | |
| 055f5ff8 | 387 | * |
| 10a5d1ba MD |
388 | * This function will also mark the IO as modified but it will not |
| 389 | * increment the modify_refs count. | |
| b0aab9b9 MD |
390 | * |
| 391 | * No I/O callbacks can occur while we hold the buffer locked. | |
| 66325755 MD |
392 | */ |
| 393 | int | |
| 394 | hammer_io_new(struct vnode *devvp, struct hammer_io *io) | |
| 395 | { | |
| 396 | struct buf *bp; | |
| 397 | ||
| 398 | if ((bp = io->bp) == NULL) { | |
| 4a2796f3 | 399 | io->bp = getblk(devvp, io->offset, io->bytes, 0, 0); |
| 66325755 MD |
400 | bp = io->bp; |
| 401 | bp->b_ops = &hammer_bioops; | |
| af209b0f | 402 | KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); |
| b0aab9b9 MD |
403 | |
| 404 | /* io->worklist is locked by the io lock */ | |
| 66325755 | 405 | LIST_INSERT_HEAD(&bp->b_dep, &io->worklist, node); |
| 055f5ff8 | 406 | io->released = 0; |
| 10a5d1ba | 407 | KKASSERT(io->running == 0); |
| 055f5ff8 | 408 | io->waiting = 0; |
| 66325755 MD |
409 | BUF_KERNPROC(bp); |
| 410 | } else { | |
| 411 | if (io->released) { | |
| 412 | regetblk(bp); | |
| 66325755 | 413 | BUF_KERNPROC(bp); |
| d113fda1 | 414 | io->released = 0; |
| 66325755 MD |
415 | } |
| 416 | } | |
| 10a5d1ba | 417 | hammer_io_modify(io, 0); |
| 66325755 MD |
418 | vfs_bio_clrbuf(bp); |
| 419 | return(0); | |
| 420 | } | |
| 421 | ||
| 422 | /* | |
| 0e8bd897 MD |
423 | * Advance the activity count on the underlying buffer because |
| 424 | * HAMMER does not getblk/brelse on every access. | |
| b0aab9b9 MD |
425 | * |
| 426 | * The io->bp cannot go away while the buffer is referenced. | |
| 0e8bd897 MD |
427 | */ |
| 428 | void | |
| 429 | hammer_io_advance(struct hammer_io *io) | |
| 430 | { | |
| 431 | if (io->bp) | |
| 432 | buf_act_advance(io->bp); | |
| 433 | } | |
| 434 | ||
| 435 | /* | |
| 47637bff | 436 | * Remove potential device level aliases against buffers managed by high level |
| 362ec2dc MD |
437 | * vnodes. Aliases can also be created due to mixed buffer sizes or via |
| 438 | * direct access to the backing store device. | |
| e469566b MD |
439 | * |
| 440 | * This is nasty because the buffers are also VMIO-backed. Even if a buffer | |
| 441 | * does not exist its backing VM pages might, and we have to invalidate | |
| 442 | * those as well or a getblk() will reinstate them. | |
| 362ec2dc MD |
443 | * |
| 444 | * Buffer cache buffers associated with hammer_buffers cannot be | |
| 445 | * invalidated. | |
| 47637bff | 446 | */ |
| 362ec2dc | 447 | int |
| 47637bff MD |
448 | hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset) |
| 449 | { | |
| cebe9493 | 450 | hammer_io_structure_t iou; |
| b0aab9b9 | 451 | hammer_mount_t hmp; |
| 47637bff MD |
452 | hammer_off_t phys_offset; |
| 453 | struct buf *bp; | |
| 362ec2dc | 454 | int error; |
| 47637bff | 455 | |
| b0aab9b9 MD |
456 | hmp = volume->io.hmp; |
| 457 | lwkt_gettoken(&hmp->io_token); | |
| 458 | ||
| 9c90dba2 | 459 | /* |
| 3b98d912 MD |
460 | * If a device buffer already exists for the specified physical |
| 461 | * offset use that, otherwise instantiate a buffer to cover any | |
| 462 | * related VM pages, set BNOCACHE, and brelse(). | |
| 9c90dba2 | 463 | */ |
| 47637bff MD |
464 | phys_offset = volume->ondisk->vol_buf_beg + |
| 465 | (zone2_offset & HAMMER_OFF_SHORT_MASK); | |
| 3b98d912 MD |
466 | if ((bp = findblk(volume->devvp, phys_offset, 0)) != NULL) |
| 467 | bremfree(bp); | |
| e469566b MD |
468 | else |
| 469 | bp = getblk(volume->devvp, phys_offset, HAMMER_BUFSIZE, 0, 0); | |
| b0aab9b9 | 470 | |
| e469566b | 471 | if ((iou = (void *)LIST_FIRST(&bp->b_dep)) != NULL) { |
| 362ec2dc | 472 | #if 0 |
| 5c8d05e2 | 473 | hammer_ref(&iou->io.lock); |
| e469566b MD |
474 | hammer_io_clear_modify(&iou->io, 1); |
| 475 | bundirty(bp); | |
| e83ca595 MD |
476 | iou->io.released = 0; |
| 477 | BUF_KERNPROC(bp); | |
| e469566b | 478 | iou->io.reclaim = 1; |
| 77912481 | 479 | iou->io.waitdep = 1; /* XXX this is a fs_token field */ |
| 250aec18 | 480 | KKASSERT(hammer_isactive(&iou->io.lock) == 1); |
| 5c8d05e2 MD |
481 | hammer_rel_buffer(&iou->buffer, 0); |
| 482 | /*hammer_io_deallocate(bp);*/ | |
| 362ec2dc | 483 | #endif |
| 04b04ca6 | 484 | bqrelse(bp); |
| 362ec2dc | 485 | error = EAGAIN; |
| e469566b MD |
486 | } else { |
| 487 | KKASSERT((bp->b_flags & B_LOCKED) == 0); | |
| 488 | bundirty(bp); | |
| 489 | bp->b_flags |= B_NOCACHE|B_RELBUF; | |
| e83ca595 | 490 | brelse(bp); |
| 362ec2dc | 491 | error = 0; |
| 47637bff | 492 | } |
| b0aab9b9 | 493 | lwkt_reltoken(&hmp->io_token); |
| 362ec2dc | 494 | return(error); |
| 47637bff MD |
495 | } |
| 496 | ||
| 497 | /* | |
| b3deaf57 | 498 | * This routine is called on the last reference to a hammer structure. |
| 250aec18 MD |
499 | * The io must be interlocked with a refcount of zero. The hammer structure |
| 500 | * will remain interlocked on return. | |
| b3deaf57 | 501 | * |
| 250aec18 MD |
502 | * This routine may return a non-NULL bp to the caller for dispoal. |
| 503 | * The caller typically brelse()'s the bp. | |
| 504 | * | |
| 505 | * The bp may or may not still be passively associated with the IO. It | |
| 506 | * will remain passively associated if it is unreleasable (e.g. a modified | |
| 507 | * meta-data buffer). | |
| ecca949a MD |
508 | * |
| 509 | * The only requirement here is that modified meta-data and volume-header | |
| 510 | * buffer may NOT be disassociated from the IO structure, and consequently | |
| 511 | * we also leave such buffers actively associated with the IO if they already | |
| 512 | * are (since the kernel can't do anything with them anyway). Only the | |
| 513 | * flusher is allowed to write such buffers out. Modified pure-data and | |
| 514 | * undo buffers are returned to the kernel but left passively associated | |
| 515 | * so we can track when the kernel writes the bp out. | |
| 66325755 | 516 | */ |
| ecca949a | 517 | struct buf * |
| 09ac686b | 518 | hammer_io_release(struct hammer_io *io, int flush) |
| 66325755 | 519 | { |
| 9f5097dc | 520 | union hammer_io_structure *iou = (void *)io; |
| 66325755 MD |
521 | struct buf *bp; |
| 522 | ||
| 055f5ff8 | 523 | if ((bp = io->bp) == NULL) |
| ecca949a | 524 | return(NULL); |
| fbc6e32a | 525 | |
| 055f5ff8 | 526 | /* |
| 10a5d1ba MD |
527 | * Try to flush a dirty IO to disk if asked to by the |
| 528 | * caller or if the kernel tried to flush the buffer in the past. | |
| 055f5ff8 | 529 | * |
| 10a5d1ba MD |
530 | * Kernel-initiated flushes are only allowed for pure-data buffers. |
| 531 | * meta-data and volume buffers can only be flushed explicitly | |
| 532 | * by HAMMER. | |
| 055f5ff8 | 533 | */ |
| 10a5d1ba | 534 | if (io->modified) { |
| 09ac686b | 535 | if (flush) { |
| 710733a6 | 536 | hammer_io_flush(io, 0); |
| 10a5d1ba MD |
537 | } else if (bp->b_flags & B_LOCKED) { |
| 538 | switch(io->type) { | |
| 539 | case HAMMER_STRUCTURE_DATA_BUFFER: | |
| 710733a6 MD |
540 | hammer_io_flush(io, 0); |
| 541 | break; | |
| 10a5d1ba | 542 | case HAMMER_STRUCTURE_UNDO_BUFFER: |
| 710733a6 | 543 | hammer_io_flush(io, hammer_undo_reclaim(io)); |
| 10a5d1ba MD |
544 | break; |
| 545 | default: | |
| 546 | break; | |
| 547 | } | |
| 548 | } /* else no explicit request to flush the buffer */ | |
| 549 | } | |
| fbc6e32a | 550 | |
| 055f5ff8 | 551 | /* |
| 5c8d05e2 MD |
552 | * Wait for the IO to complete if asked to. This occurs when |
| 553 | * the buffer must be disposed of definitively during an umount | |
| 554 | * or buffer invalidation. | |
| 055f5ff8 | 555 | */ |
| b58c6388 | 556 | if (io->waitdep && io->running) { |
| 055f5ff8 MD |
557 | hammer_io_wait(io); |
| 558 | } | |
| 559 | ||
| 560 | /* | |
| 10a5d1ba MD |
561 | * Return control of the buffer to the kernel (with the provisio |
| 562 | * that our bioops can override kernel decisions with regards to | |
| 563 | * the buffer). | |
| 055f5ff8 | 564 | */ |
| cebe9493 | 565 | if ((flush || io->reclaim) && io->modified == 0 && io->running == 0) { |
| 10a5d1ba MD |
566 | /* |
| 567 | * Always disassociate the bp if an explicit flush | |
| 568 | * was requested and the IO completed with no error | |
| 569 | * (so unmount can really clean up the structure). | |
| 570 | */ | |
| 055f5ff8 | 571 | if (io->released) { |
| b3deaf57 | 572 | regetblk(bp); |
| 46fe7ae1 | 573 | BUF_KERNPROC(bp); |
| ecca949a MD |
574 | } else { |
| 575 | io->released = 1; | |
| 055f5ff8 | 576 | } |
| ecca949a MD |
577 | hammer_io_disassociate((hammer_io_structure_t)io); |
| 578 | /* return the bp */ | |
| 055f5ff8 | 579 | } else if (io->modified) { |
| 10a5d1ba | 580 | /* |
| ecca949a MD |
581 | * Only certain IO types can be released to the kernel if |
| 582 | * the buffer has been modified. | |
| 583 | * | |
| 584 | * volume and meta-data IO types may only be explicitly | |
| 585 | * flushed by HAMMER. | |
| 10a5d1ba MD |
586 | */ |
| 587 | switch(io->type) { | |
| 588 | case HAMMER_STRUCTURE_DATA_BUFFER: | |
| 589 | case HAMMER_STRUCTURE_UNDO_BUFFER: | |
| 590 | if (io->released == 0) { | |
| 591 | io->released = 1; | |
| 9d4e78c7 | 592 | bp->b_flags |= B_CLUSTEROK; |
| 10a5d1ba MD |
593 | bdwrite(bp); |
| 594 | } | |
| 595 | break; | |
| 596 | default: | |
| 597 | break; | |
| 055f5ff8 | 598 | } |
| ecca949a | 599 | bp = NULL; /* bp left associated */ |
| 055f5ff8 | 600 | } else if (io->released == 0) { |
| 10a5d1ba MD |
601 | /* |
| 602 | * Clean buffers can be generally released to the kernel. | |
| 603 | * We leave the bp passively associated with the HAMMER | |
| 604 | * structure and use bioops to disconnect it later on | |
| 605 | * if the kernel wants to discard the buffer. | |
| ecca949a MD |
606 | * |
| 607 | * We can steal the structure's ownership of the bp. | |
| 10a5d1ba | 608 | */ |
| ecca949a | 609 | io->released = 1; |
| 9f5097dc | 610 | if (bp->b_flags & B_LOCKED) { |
| ecca949a MD |
611 | hammer_io_disassociate(iou); |
| 612 | /* return the bp */ | |
| 9f5097dc | 613 | } else { |
| cebe9493 | 614 | if (io->reclaim) { |
| ecca949a MD |
615 | hammer_io_disassociate(iou); |
| 616 | /* return the bp */ | |
| cebe9493 | 617 | } else { |
| ecca949a | 618 | /* return the bp (bp passively associated) */ |
| cebe9493 | 619 | } |
| 9f5097dc | 620 | } |
| 19b97e01 MD |
621 | } else { |
| 622 | /* | |
| af209b0f MD |
623 | * A released buffer is passively associate with our |
| 624 | * hammer_io structure. The kernel cannot destroy it | |
| 625 | * without making a bioops call. If the kernel (B_LOCKED) | |
| 626 | * or we (reclaim) requested that the buffer be destroyed | |
| 627 | * we destroy it, otherwise we do a quick get/release to | |
| 628 | * reset its position in the kernel's LRU list. | |
| 629 | * | |
| 630 | * Leaving the buffer passively associated allows us to | |
| 631 | * use the kernel's LRU buffer flushing mechanisms rather | |
| 632 | * then rolling our own. | |
| cb51be26 MD |
633 | * |
| 634 | * XXX there are two ways of doing this. We can re-acquire | |
| 635 | * and passively release to reset the LRU, or not. | |
| 19b97e01 | 636 | */ |
| af209b0f | 637 | if (io->running == 0) { |
| 19b97e01 | 638 | regetblk(bp); |
| cebe9493 | 639 | if ((bp->b_flags & B_LOCKED) || io->reclaim) { |
| ecca949a MD |
640 | hammer_io_disassociate(iou); |
| 641 | /* return the bp */ | |
| 9f5097dc | 642 | } else { |
| ecca949a | 643 | /* return the bp (bp passively associated) */ |
| 9f5097dc | 644 | } |
| ecca949a MD |
645 | } else { |
| 646 | /* | |
| 647 | * bp is left passively associated but we do not | |
| 648 | * try to reacquire it. Interactions with the io | |
| 649 | * structure will occur on completion of the bp's | |
| 650 | * I/O. | |
| 651 | */ | |
| 652 | bp = NULL; | |
| 19b97e01 | 653 | } |
| 66325755 | 654 | } |
| ecca949a | 655 | return(bp); |
| 66325755 MD |
656 | } |
| 657 | ||
| 658 | /* | |
| b33e2cc0 MD |
659 | * This routine is called with a locked IO when a flush is desired and |
| 660 | * no other references to the structure exists other then ours. This | |
| 661 | * routine is ONLY called when HAMMER believes it is safe to flush a | |
| 662 | * potentially modified buffer out. | |
| 77912481 MD |
663 | * |
| 664 | * The locked io or io reference prevents a flush from being initiated | |
| 665 | * by the kernel. | |
| fbc6e32a MD |
666 | */ |
| 667 | void | |
| 710733a6 | 668 | hammer_io_flush(struct hammer_io *io, int reclaim) |
| fbc6e32a MD |
669 | { |
| 670 | struct buf *bp; | |
| 77912481 | 671 | hammer_mount_t hmp; |
| fbc6e32a | 672 | |
| 055f5ff8 | 673 | /* |
| 10a5d1ba | 674 | * Degenerate case - nothing to flush if nothing is dirty. |
| 055f5ff8 | 675 | */ |
| b0aab9b9 | 676 | if (io->modified == 0) |
| 055f5ff8 MD |
677 | return; |
| 678 | ||
| 679 | KKASSERT(io->bp); | |
| 9f5097dc | 680 | KKASSERT(io->modify_refs <= 0); |
| 055f5ff8 | 681 | |
| b33e2cc0 | 682 | /* |
| 77062c8a MD |
683 | * Acquire ownership of the bp, particularly before we clear our |
| 684 | * modified flag. | |
| 685 | * | |
| 686 | * We are going to bawrite() this bp. Don't leave a window where | |
| 687 | * io->released is set, we actually own the bp rather then our | |
| 688 | * buffer. | |
| b0aab9b9 MD |
689 | * |
| 690 | * The io_token should not be required here as only | |
| 77062c8a | 691 | */ |
| 77912481 | 692 | hmp = io->hmp; |
| 77062c8a MD |
693 | bp = io->bp; |
| 694 | if (io->released) { | |
| 695 | regetblk(bp); | |
| 696 | /* BUF_KERNPROC(io->bp); */ | |
| 697 | /* io->released = 0; */ | |
| 698 | KKASSERT(io->released); | |
| 699 | KKASSERT(io->bp == bp); | |
| b0aab9b9 MD |
700 | } else { |
| 701 | io->released = 1; | |
| 77062c8a | 702 | } |
| 77062c8a | 703 | |
| 710733a6 MD |
704 | if (reclaim) { |
| 705 | io->reclaim = 1; | |
| 706 | if ((bp->b_flags & B_LOCKED) == 0) { | |
| 707 | bp->b_flags |= B_LOCKED; | |
| b0aab9b9 | 708 | atomic_add_int(&hammer_count_io_locked, 1); |
| 710733a6 MD |
709 | } |
| 710 | } | |
| 711 | ||
| 77062c8a | 712 | /* |
| 10a5d1ba MD |
713 | * Acquire exclusive access to the bp and then clear the modified |
| 714 | * state of the buffer prior to issuing I/O to interlock any | |
| 715 | * modifications made while the I/O is in progress. This shouldn't | |
| 716 | * happen anyway but losing data would be worse. The modified bit | |
| 717 | * will be rechecked after the IO completes. | |
| 718 | * | |
| 4a2796f3 MD |
719 | * NOTE: This call also finalizes the buffer's content (inval == 0). |
| 720 | * | |
| b33e2cc0 MD |
721 | * This is only legal when lock.refs == 1 (otherwise we might clear |
| 722 | * the modified bit while there are still users of the cluster | |
| 723 | * modifying the data). | |
| 724 | * | |
| b33e2cc0 MD |
725 | * Do this before potentially blocking so any attempt to modify the |
| 726 | * ondisk while we are blocked blocks waiting for us. | |
| 727 | */ | |
| 5c8d05e2 | 728 | hammer_ref(&io->lock); |
| 4a2796f3 | 729 | hammer_io_clear_modify(io, 0); |
| 250aec18 | 730 | hammer_rel(&io->lock); |
| bcac4bbb | 731 | |
| 6367d0f9 MD |
732 | if (hammer_debug_io & 0x0002) |
| 733 | kprintf("hammer io_write %016jx\n", bp->b_bio1.bio_offset); | |
| 734 | ||
| bcac4bbb | 735 | /* |
| 10a5d1ba | 736 | * Transfer ownership to the kernel and initiate I/O. |
| b0aab9b9 MD |
737 | * |
| 738 | * NOTE: We do not hold io_token so an atomic op is required to | |
| 739 | * update io_running_space. | |
| 10a5d1ba | 740 | */ |
| 055f5ff8 | 741 | io->running = 1; |
| 3583bbb4 MD |
742 | atomic_add_long(&hmp->io_running_space, io->bytes); |
| 743 | atomic_add_long(&hammer_count_io_running_write, io->bytes); | |
| 77912481 MD |
744 | lwkt_gettoken(&hmp->io_token); |
| 745 | TAILQ_INSERT_TAIL(&hmp->iorun_list, io, iorun_entry); | |
| 746 | lwkt_reltoken(&hmp->io_token); | |
| 9d4e78c7 | 747 | cluster_awrite(bp); |
| 748efb59 | 748 | hammer_io_flush_mark(io->volume); |
| fbc6e32a MD |
749 | } |
| 750 | ||
| 055f5ff8 MD |
751 | /************************************************************************ |
| 752 | * BUFFER DIRTYING * | |
| 753 | ************************************************************************ | |
| 754 | * | |
| 755 | * These routines deal with dependencies created when IO buffers get | |
| 756 | * modified. The caller must call hammer_modify_*() on a referenced | |
| 757 | * HAMMER structure prior to modifying its on-disk data. | |
| 0b075555 | 758 | * |
| 055f5ff8 MD |
759 | * Any intent to modify an IO buffer acquires the related bp and imposes |
| 760 | * various write ordering dependencies. | |
| 0b075555 | 761 | */ |
| 055f5ff8 MD |
762 | |
| 763 | /* | |
| 10a5d1ba MD |
764 | * Mark a HAMMER structure as undergoing modification. Meta-data buffers |
| 765 | * are locked until the flusher can deal with them, pure data buffers | |
| 766 | * can be written out. | |
| 77912481 MD |
767 | * |
| 768 | * The referenced io prevents races. | |
| 055f5ff8 | 769 | */ |
| 10a5d1ba | 770 | static |
| b58c6388 | 771 | void |
| 10a5d1ba | 772 | hammer_io_modify(hammer_io_t io, int count) |
| 0b075555 | 773 | { |
| 46fe7ae1 | 774 | /* |
| 9f5097dc MD |
775 | * io->modify_refs must be >= 0 |
| 776 | */ | |
| 777 | while (io->modify_refs < 0) { | |
| 778 | io->waitmod = 1; | |
| 779 | tsleep(io, 0, "hmrmod", 0); | |
| 780 | } | |
| 781 | ||
| 782 | /* | |
| 46fe7ae1 MD |
783 | * Shortcut if nothing to do. |
| 784 | */ | |
| 250aec18 | 785 | KKASSERT(hammer_isactive(&io->lock) && io->bp != NULL); |
| 10a5d1ba | 786 | io->modify_refs += count; |
| b58c6388 MD |
787 | if (io->modified && io->released == 0) |
| 788 | return; | |
| 46fe7ae1 | 789 | |
| 77912481 MD |
790 | /* |
| 791 | * NOTE: It is important not to set the modified bit | |
| 792 | * until after we have acquired the bp or we risk | |
| 793 | * racing against checkwrite. | |
| 794 | */ | |
| 46fe7ae1 | 795 | hammer_lock_ex(&io->lock); |
| 46fe7ae1 MD |
796 | if (io->released) { |
| 797 | regetblk(io->bp); | |
| 798 | BUF_KERNPROC(io->bp); | |
| 799 | io->released = 0; | |
| 77912481 MD |
800 | } |
| 801 | if (io->modified == 0) { | |
| 802 | hammer_io_set_modlist(io); | |
| 803 | io->modified = 1; | |
| 46fe7ae1 | 804 | } |
| 46fe7ae1 | 805 | hammer_unlock(&io->lock); |
| 055f5ff8 MD |
806 | } |
| 807 | ||
| 10a5d1ba MD |
808 | static __inline |
| 809 | void | |
| 810 | hammer_io_modify_done(hammer_io_t io) | |
| 811 | { | |
| 812 | KKASSERT(io->modify_refs > 0); | |
| 813 | --io->modify_refs; | |
| 9f5097dc MD |
814 | if (io->modify_refs == 0 && io->waitmod) { |
| 815 | io->waitmod = 0; | |
| 816 | wakeup(io); | |
| 817 | } | |
| 818 | } | |
| 819 | ||
| 77912481 MD |
820 | /* |
| 821 | * The write interlock blocks other threads trying to modify a buffer | |
| 822 | * (they block in hammer_io_modify()) after us, or blocks us while other | |
| 823 | * threads are in the middle of modifying a buffer. | |
| 824 | * | |
| 825 | * The caller also has a ref on the io, however if we are not careful | |
| 826 | * we will race bioops callbacks (checkwrite). To deal with this | |
| 827 | * we must at least acquire and release the io_token, and it is probably | |
| 828 | * better to hold it through the setting of modify_refs. | |
| 829 | */ | |
| 9f5097dc MD |
830 | void |
| 831 | hammer_io_write_interlock(hammer_io_t io) | |
| 832 | { | |
| 77912481 MD |
833 | hammer_mount_t hmp = io->hmp; |
| 834 | ||
| 835 | lwkt_gettoken(&hmp->io_token); | |
| 9f5097dc MD |
836 | while (io->modify_refs != 0) { |
| 837 | io->waitmod = 1; | |
| 838 | tsleep(io, 0, "hmrmod", 0); | |
| 839 | } | |
| 840 | io->modify_refs = -1; | |
| 77912481 | 841 | lwkt_reltoken(&hmp->io_token); |
| 9f5097dc MD |
842 | } |
| 843 | ||
| 844 | void | |
| 845 | hammer_io_done_interlock(hammer_io_t io) | |
| 846 | { | |
| 847 | KKASSERT(io->modify_refs == -1); | |
| 848 | io->modify_refs = 0; | |
| 849 | if (io->waitmod) { | |
| 850 | io->waitmod = 0; | |
| 851 | wakeup(io); | |
| 852 | } | |
| 10a5d1ba MD |
853 | } |
| 854 | ||
| 2f85fa4d MD |
855 | /* |
| 856 | * Caller intends to modify a volume's ondisk structure. | |
| 857 | * | |
| 858 | * This is only allowed if we are the flusher or we have a ref on the | |
| 859 | * sync_lock. | |
| 860 | */ | |
| 055f5ff8 | 861 | void |
| 36f82b23 MD |
862 | hammer_modify_volume(hammer_transaction_t trans, hammer_volume_t volume, |
| 863 | void *base, int len) | |
| 055f5ff8 | 864 | { |
| 2f85fa4d | 865 | KKASSERT (trans == NULL || trans->sync_lock_refs > 0); |
| 055f5ff8 | 866 | |
| 2f85fa4d | 867 | hammer_io_modify(&volume->io, 1); |
| 47197d71 MD |
868 | if (len) { |
| 869 | intptr_t rel_offset = (intptr_t)base - (intptr_t)volume->ondisk; | |
| 870 | KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0); | |
| 02428fb6 | 871 | hammer_generate_undo(trans, |
| 47197d71 MD |
872 | HAMMER_ENCODE_RAW_VOLUME(volume->vol_no, rel_offset), |
| 873 | base, len); | |
| 874 | } | |
| 0b075555 MD |
875 | } |
| 876 | ||
| 055f5ff8 | 877 | /* |
| 2f85fa4d MD |
878 | * Caller intends to modify a buffer's ondisk structure. |
| 879 | * | |
| 880 | * This is only allowed if we are the flusher or we have a ref on the | |
| 881 | * sync_lock. | |
| 055f5ff8 | 882 | */ |
| 0b075555 | 883 | void |
| 36f82b23 MD |
884 | hammer_modify_buffer(hammer_transaction_t trans, hammer_buffer_t buffer, |
| 885 | void *base, int len) | |
| 46fe7ae1 | 886 | { |
| 2f85fa4d MD |
887 | KKASSERT (trans == NULL || trans->sync_lock_refs > 0); |
| 888 | ||
| 10a5d1ba | 889 | hammer_io_modify(&buffer->io, 1); |
| 47197d71 MD |
890 | if (len) { |
| 891 | intptr_t rel_offset = (intptr_t)base - (intptr_t)buffer->ondisk; | |
| 892 | KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0); | |
| 02428fb6 | 893 | hammer_generate_undo(trans, |
| 34d829f7 | 894 | buffer->zone2_offset + rel_offset, |
| 47197d71 MD |
895 | base, len); |
| 896 | } | |
| 46fe7ae1 MD |
897 | } |
| 898 | ||
| 10a5d1ba MD |
899 | void |
| 900 | hammer_modify_volume_done(hammer_volume_t volume) | |
| 901 | { | |
| 902 | hammer_io_modify_done(&volume->io); | |
| 903 | } | |
| 904 | ||
| 905 | void | |
| 906 | hammer_modify_buffer_done(hammer_buffer_t buffer) | |
| 907 | { | |
| 908 | hammer_io_modify_done(&buffer->io); | |
| 909 | } | |
| 910 | ||
| 46fe7ae1 | 911 | /* |
| 4a2796f3 MD |
912 | * Mark an entity as not being dirty any more and finalize any |
| 913 | * delayed adjustments to the buffer. | |
| 914 | * | |
| 915 | * Delayed adjustments are an important performance enhancement, allowing | |
| 916 | * us to avoid recalculating B-Tree node CRCs over and over again when | |
| 917 | * making bulk-modifications to the B-Tree. | |
| 918 | * | |
| 919 | * If inval is non-zero delayed adjustments are ignored. | |
| 5c8d05e2 MD |
920 | * |
| 921 | * This routine may dereference related btree nodes and cause the | |
| 922 | * buffer to be dereferenced. The caller must own a reference on io. | |
| 61aeeb33 MD |
923 | */ |
| 924 | void | |
| 4a2796f3 | 925 | hammer_io_clear_modify(struct hammer_io *io, int inval) |
| 61aeeb33 | 926 | { |
| 77912481 MD |
927 | hammer_mount_t hmp; |
| 928 | ||
| 929 | /* | |
| 1afb73cf | 930 | * io_token is needed to avoid races on mod_root |
| 77912481 | 931 | */ |
| 4a2796f3 MD |
932 | if (io->modified == 0) |
| 933 | return; | |
| 77912481 MD |
934 | hmp = io->hmp; |
| 935 | lwkt_gettoken(&hmp->io_token); | |
| 936 | if (io->modified == 0) { | |
| 937 | lwkt_reltoken(&hmp->io_token); | |
| 938 | return; | |
| 939 | } | |
| 4a2796f3 MD |
940 | |
| 941 | /* | |
| 942 | * Take us off the mod-list and clear the modified bit. | |
| 943 | */ | |
| 1afb73cf MD |
944 | KKASSERT(io->mod_root != NULL); |
| 945 | if (io->mod_root == &io->hmp->volu_root || | |
| 946 | io->mod_root == &io->hmp->meta_root) { | |
| f5a07a7a | 947 | io->hmp->locked_dirty_space -= io->bytes; |
| 3583bbb4 | 948 | atomic_add_long(&hammer_count_dirtybufspace, -io->bytes); |
| 4a2796f3 | 949 | } |
| 1afb73cf MD |
950 | RB_REMOVE(hammer_mod_rb_tree, io->mod_root, io); |
| 951 | io->mod_root = NULL; | |
| 4a2796f3 MD |
952 | io->modified = 0; |
| 953 | ||
| 77912481 MD |
954 | lwkt_reltoken(&hmp->io_token); |
| 955 | ||
| 4a2796f3 MD |
956 | /* |
| 957 | * If this bit is not set there are no delayed adjustments. | |
| 958 | */ | |
| 959 | if (io->gencrc == 0) | |
| 960 | return; | |
| 961 | io->gencrc = 0; | |
| 962 | ||
| 963 | /* | |
| 964 | * Finalize requested CRCs. The NEEDSCRC flag also holds a reference | |
| 965 | * on the node (& underlying buffer). Release the node after clearing | |
| 966 | * the flag. | |
| 967 | */ | |
| 968 | if (io->type == HAMMER_STRUCTURE_META_BUFFER) { | |
| 969 | hammer_buffer_t buffer = (void *)io; | |
| 970 | hammer_node_t node; | |
| 971 | ||
| 972 | restart: | |
| 973 | TAILQ_FOREACH(node, &buffer->clist, entry) { | |
| 974 | if ((node->flags & HAMMER_NODE_NEEDSCRC) == 0) | |
| 975 | continue; | |
| 976 | node->flags &= ~HAMMER_NODE_NEEDSCRC; | |
| 977 | KKASSERT(node->ondisk); | |
| 978 | if (inval == 0) | |
| 979 | node->ondisk->crc = crc32(&node->ondisk->crc + 1, HAMMER_BTREE_CRCSIZE); | |
| 980 | hammer_rel_node(node); | |
| 981 | goto restart; | |
| 61aeeb33 | 982 | } |
| cebe9493 | 983 | } |
| 5c8d05e2 | 984 | /* caller must still have ref on io */ |
| 250aec18 | 985 | KKASSERT(hammer_isactive(&io->lock)); |
| cebe9493 MD |
986 | } |
| 987 | ||
| 988 | /* | |
| 989 | * Clear the IO's modify list. Even though the IO is no longer modified | |
| 1afb73cf | 990 | * it may still be on the lose_root. This routine is called just before |
| cebe9493 | 991 | * the governing hammer_buffer is destroyed. |
| b0aab9b9 | 992 | * |
| 1afb73cf | 993 | * mod_root requires io_token protection. |
| cebe9493 MD |
994 | */ |
| 995 | void | |
| 996 | hammer_io_clear_modlist(struct hammer_io *io) | |
| 997 | { | |
| b0aab9b9 MD |
998 | hammer_mount_t hmp = io->hmp; |
| 999 | ||
| 4a2796f3 | 1000 | KKASSERT(io->modified == 0); |
| 1afb73cf | 1001 | if (io->mod_root) { |
| b0aab9b9 | 1002 | lwkt_gettoken(&hmp->io_token); |
| 1afb73cf MD |
1003 | if (io->mod_root) { |
| 1004 | KKASSERT(io->mod_root == &io->hmp->lose_root); | |
| 1005 | RB_REMOVE(hammer_mod_rb_tree, io->mod_root, io); | |
| 1006 | io->mod_root = NULL; | |
| b0aab9b9 MD |
1007 | } |
| 1008 | lwkt_reltoken(&hmp->io_token); | |
| 61aeeb33 MD |
1009 | } |
| 1010 | } | |
| 1011 | ||
| cdb6e4e6 MD |
1012 | static void |
| 1013 | hammer_io_set_modlist(struct hammer_io *io) | |
| 1014 | { | |
| 1015 | struct hammer_mount *hmp = io->hmp; | |
| 1016 | ||
| 77912481 | 1017 | lwkt_gettoken(&hmp->io_token); |
| 1afb73cf | 1018 | KKASSERT(io->mod_root == NULL); |
| cdb6e4e6 MD |
1019 | |
| 1020 | switch(io->type) { | |
| 1021 | case HAMMER_STRUCTURE_VOLUME: | |
| 1afb73cf | 1022 | io->mod_root = &hmp->volu_root; |
| cdb6e4e6 | 1023 | hmp->locked_dirty_space += io->bytes; |
| 3583bbb4 | 1024 | atomic_add_long(&hammer_count_dirtybufspace, io->bytes); |
| cdb6e4e6 MD |
1025 | break; |
| 1026 | case HAMMER_STRUCTURE_META_BUFFER: | |
| 1afb73cf | 1027 | io->mod_root = &hmp->meta_root; |
| cdb6e4e6 | 1028 | hmp->locked_dirty_space += io->bytes; |
| 3583bbb4 | 1029 | atomic_add_long(&hammer_count_dirtybufspace, io->bytes); |
| cdb6e4e6 MD |
1030 | break; |
| 1031 | case HAMMER_STRUCTURE_UNDO_BUFFER: | |
| 1afb73cf | 1032 | io->mod_root = &hmp->undo_root; |
| cdb6e4e6 MD |
1033 | break; |
| 1034 | case HAMMER_STRUCTURE_DATA_BUFFER: | |
| 1afb73cf | 1035 | io->mod_root = &hmp->data_root; |
| cdb6e4e6 | 1036 | break; |
| eddadaee | 1037 | case HAMMER_STRUCTURE_DUMMY: |
| 1afb73cf MD |
1038 | panic("hammer_io_set_modlist: bad io type"); |
| 1039 | break; /* NOT REACHED */ | |
| 1040 | } | |
| 1041 | if (RB_INSERT(hammer_mod_rb_tree, io->mod_root, io)) { | |
| 1042 | panic("hammer_io_set_modlist: duplicate entry"); | |
| 1043 | /* NOT REACHED */ | |
| cdb6e4e6 | 1044 | } |
| 77912481 | 1045 | lwkt_reltoken(&hmp->io_token); |
| cdb6e4e6 MD |
1046 | } |
| 1047 | ||
| 055f5ff8 MD |
1048 | /************************************************************************ |
| 1049 | * HAMMER_BIOOPS * | |
| 1050 | ************************************************************************ | |
| 1051 | * | |
| 66325755 MD |
1052 | */ |
| 1053 | ||
/*
 * Pre-IO initiation kernel callback - cluster build only.
 *
 * bioops callback.  The callbacks normally hold io_token, but this one
 * is a no-op and therefore does not need it.
 */
static void
hammer_io_start(struct buf *bp)
{
	/* intentionally empty */
}
| 1064 | ||
| 055f5ff8 | 1065 | /* |
| 7bc5b8c2 | 1066 | * Post-IO completion kernel callback - MAY BE CALLED FROM INTERRUPT! |
| b33e2cc0 | 1067 | * |
| 77912481 MD |
1068 | * NOTE: HAMMER may modify a data buffer after we have initiated write |
| 1069 | * I/O. | |
| 1070 | * | |
| 1071 | * NOTE: MPSAFE callback | |
| b0aab9b9 MD |
1072 | * |
| 1073 | * bioops callback - hold io_token | |
| 055f5ff8 | 1074 | */ |
| 66325755 MD |
1075 | static void |
| 1076 | hammer_io_complete(struct buf *bp) | |
| 1077 | { | |
| 055f5ff8 | 1078 | union hammer_io_structure *iou = (void *)LIST_FIRST(&bp->b_dep); |
| ba298df1 | 1079 | struct hammer_mount *hmp = iou->io.hmp; |
| eddadaee | 1080 | struct hammer_io *ionext; |
| 055f5ff8 | 1081 | |
| b0aab9b9 MD |
1082 | lwkt_gettoken(&hmp->io_token); |
| 1083 | ||
| 055f5ff8 | 1084 | KKASSERT(iou->io.released == 1); |
| fbc6e32a | 1085 | |
| bf3b416b MD |
1086 | /* |
| 1087 | * Deal with people waiting for I/O to drain | |
| 1088 | */ | |
| f90dde4c | 1089 | if (iou->io.running) { |
| cdb6e4e6 MD |
1090 | /* |
| 1091 | * Deal with critical write errors. Once a critical error | |
| 1092 | * has been flagged in hmp the UNDO FIFO will not be updated. | |
| 1093 | * That way crash recover will give us a consistent | |
| 1094 | * filesystem. | |
| 1095 | * | |
| 1096 | * Because of this we can throw away failed UNDO buffers. If | |
| 1097 | * we throw away META or DATA buffers we risk corrupting | |
| 1098 | * the now read-only version of the filesystem visible to | |
| 1099 | * the user. Clear B_ERROR so the buffer is not re-dirtied | |
| 1100 | * by the kernel and ref the io so it doesn't get thrown | |
| 1101 | * away. | |
| 1102 | */ | |
| 1103 | if (bp->b_flags & B_ERROR) { | |
| 77912481 | 1104 | lwkt_gettoken(&hmp->fs_token); |
| ba298df1 | 1105 | hammer_critical_error(hmp, NULL, bp->b_error, |
| cdb6e4e6 | 1106 | "while flushing meta-data"); |
| 77912481 MD |
1107 | lwkt_reltoken(&hmp->fs_token); |
| 1108 | ||
| cdb6e4e6 MD |
1109 | switch(iou->io.type) { |
| 1110 | case HAMMER_STRUCTURE_UNDO_BUFFER: | |
| 1111 | break; | |
| 1112 | default: | |
| 1113 | if (iou->io.ioerror == 0) { | |
| 1114 | iou->io.ioerror = 1; | |
| cdb6e4e6 MD |
1115 | hammer_ref(&iou->io.lock); |
| 1116 | } | |
| 1117 | break; | |
| 1118 | } | |
| 1119 | bp->b_flags &= ~B_ERROR; | |
| 1120 | bundirty(bp); | |
| 1121 | #if 0 | |
| 1122 | hammer_io_set_modlist(&iou->io); | |
| 1123 | iou->io.modified = 1; | |
| 1124 | #endif | |
| 1125 | } | |
| ce0138a6 | 1126 | hammer_stats_disk_write += iou->io.bytes; |
| 3583bbb4 MD |
1127 | atomic_add_long(&hammer_count_io_running_write, -iou->io.bytes); |
| 1128 | atomic_add_long(&hmp->io_running_space, -iou->io.bytes); | |
| ba298df1 | 1129 | KKASSERT(hmp->io_running_space >= 0); |
| f90dde4c | 1130 | iou->io.running = 0; |
| eddadaee MD |
1131 | |
| 1132 | /* | |
| 1133 | * Remove from iorun list and wakeup any multi-io waiter(s). | |
| 1134 | */ | |
| ba298df1 | 1135 | if (TAILQ_FIRST(&hmp->iorun_list) == &iou->io) { |
| eddadaee MD |
1136 | ionext = TAILQ_NEXT(&iou->io, iorun_entry); |
| 1137 | if (ionext && ionext->type == HAMMER_STRUCTURE_DUMMY) | |
| 1138 | wakeup(ionext); | |
| 1139 | } | |
| ba298df1 | 1140 | TAILQ_REMOVE(&hmp->iorun_list, &iou->io, iorun_entry); |
| ce0138a6 MD |
1141 | } else { |
| 1142 | hammer_stats_disk_read += iou->io.bytes; | |
| f90dde4c MD |
1143 | } |
| 1144 | ||
| 055f5ff8 MD |
1145 | if (iou->io.waiting) { |
| 1146 | iou->io.waiting = 0; | |
| 1147 | wakeup(iou); | |
| 1148 | } | |
| 1149 | ||
| 1150 | /* | |
| bf3b416b | 1151 | * If B_LOCKED is set someone wanted to deallocate the bp at some |
| 250aec18 MD |
1152 | * point, try to do it now. The operation will fail if there are |
| 1153 | * refs or if hammer_io_deallocate() is unable to gain the | |
| 1154 | * interlock. | |
| 055f5ff8 | 1155 | */ |
| 250aec18 | 1156 | if (bp->b_flags & B_LOCKED) { |
| b0aab9b9 | 1157 | atomic_add_int(&hammer_count_io_locked, -1); |
| d5ef456e | 1158 | bp->b_flags &= ~B_LOCKED; |
| 055f5ff8 MD |
1159 | hammer_io_deallocate(bp); |
| 1160 | /* structure may be dead now */ | |
| 1161 | } | |
| b0aab9b9 | 1162 | lwkt_reltoken(&hmp->io_token); |
| 66325755 MD |
1163 | } |
| 1164 | ||
| 1165 | /* | |
| 1166 | * Callback from kernel when it wishes to deallocate a passively | |
| 10a5d1ba MD |
1167 | * associated structure. This mostly occurs with clean buffers |
| 1168 | * but it may be possible for a holding structure to be marked dirty | |
| 7bc5b8c2 | 1169 | * while its buffer is passively associated. The caller owns the bp. |
| 66325755 MD |
1170 | * |
| 1171 | * If we cannot disassociate we set B_LOCKED to prevent the buffer | |
| 1172 | * from getting reused. | |
| 46fe7ae1 MD |
1173 | * |
| 1174 | * WARNING: Because this can be called directly by getnewbuf we cannot | |
| 1175 | * recurse into the tree. If a bp cannot be immediately disassociated | |
| 1176 | * our only recourse is to set B_LOCKED. | |
| 7bc5b8c2 MD |
1177 | * |
| 1178 | * WARNING: This may be called from an interrupt via hammer_io_complete() | |
| b0aab9b9 MD |
1179 | * |
| 1180 | * bioops callback - hold io_token | |
| 66325755 MD |
1181 | */ |
| 1182 | static void | |
| 1183 | hammer_io_deallocate(struct buf *bp) | |
| 1184 | { | |
| 055f5ff8 | 1185 | hammer_io_structure_t iou = (void *)LIST_FIRST(&bp->b_dep); |
| b0aab9b9 MD |
1186 | hammer_mount_t hmp; |
| 1187 | ||
| 1188 | hmp = iou->io.hmp; | |
| 1189 | ||
| 1190 | lwkt_gettoken(&hmp->io_token); | |
| 66325755 | 1191 | |
| 055f5ff8 | 1192 | KKASSERT((bp->b_flags & B_LOCKED) == 0 && iou->io.running == 0); |
| 250aec18 MD |
1193 | if (hammer_try_interlock_norefs(&iou->io.lock) == 0) { |
| 1194 | /* | |
| 1195 | * We cannot safely disassociate a bp from a referenced | |
| 1196 | * or interlocked HAMMER structure. | |
| 1197 | */ | |
| 1198 | bp->b_flags |= B_LOCKED; | |
| b0aab9b9 | 1199 | atomic_add_int(&hammer_count_io_locked, 1); |
| 250aec18 | 1200 | } else if (iou->io.modified) { |
| 10a5d1ba MD |
1201 | /* |
| 1202 | * It is not legal to disassociate a modified buffer. This | |
| 1203 | * case really shouldn't ever occur. | |
| 1204 | */ | |
| 055f5ff8 | 1205 | bp->b_flags |= B_LOCKED; |
| b0aab9b9 | 1206 | atomic_add_int(&hammer_count_io_locked, 1); |
| 250aec18 | 1207 | hammer_put_interlock(&iou->io.lock, 0); |
| 055f5ff8 | 1208 | } else { |
| 10a5d1ba MD |
1209 | /* |
| 1210 | * Disassociate the BP. If the io has no refs left we | |
| b0aab9b9 MD |
1211 | * have to add it to the loose list. The kernel has |
| 1212 | * locked the buffer and therefore our io must be | |
| 1213 | * in a released state. | |
| 10a5d1ba | 1214 | */ |
| ecca949a MD |
1215 | hammer_io_disassociate(iou); |
| 1216 | if (iou->io.type != HAMMER_STRUCTURE_VOLUME) { | |
| 1217 | KKASSERT(iou->io.bp == NULL); | |
| 1afb73cf MD |
1218 | KKASSERT(iou->io.mod_root == NULL); |
| 1219 | iou->io.mod_root = &hmp->lose_root; | |
| 1220 | if (RB_INSERT(hammer_mod_rb_tree, iou->io.mod_root, | |
| 1221 | &iou->io)) { | |
| 1222 | panic("hammer_io_deallocate: duplicate entry"); | |
| 1223 | } | |
| 66325755 | 1224 | } |
| 250aec18 | 1225 | hammer_put_interlock(&iou->io.lock, 1); |
| 66325755 | 1226 | } |
| b0aab9b9 | 1227 | lwkt_reltoken(&hmp->io_token); |
| 66325755 MD |
1228 | } |
| 1229 | ||
/*
 * bioops fsync callback.  Nothing to do, always reports success (0).
 *
 * The bioops callbacks normally hold io_token; as a no-op this one
 * does not need it.
 */
static int
hammer_io_fsync(struct vnode *vp)
{
	return 0;
}
| 1239 | ||
/*
 * bioops sync callback.  Always reports success (0).
 *
 * NOTE: will not be called unless we tell the kernel about the
 *	 bioops.  Unused... we use the mount's VFS_SYNC instead.
 *
 * The bioops callbacks normally hold io_token; as a no-op this one
 * does not need it.
 */
static int
hammer_io_sync(struct mount *mp)
{
	return 0;
}
| 1252 | ||
/*
 * bioops movedeps callback.  Nothing to do.
 *
 * The bioops callbacks normally hold io_token; as a no-op this one
 * does not need it.
 */
static void
hammer_io_movedeps(struct buf *bp1, struct buf *bp2)
{
	/* intentionally empty */
}
| 1261 | ||
/*
 * I/O pre-check for reading and writing.  HAMMER only uses this for
 * B_CACHE buffers so checkread just shouldn't happen, but if it does
 * allow it.
 *
 * Writing is a different case.  We don't want the kernel to try to write
 * out a buffer that HAMMER may be modifying passively or which has a
 * dependency.  In addition, kernel-demanded writes can only proceed for
 * certain types of buffers (i.e. UNDO and DATA types).  Other dirty
 * buffer types can only be explicitly written by the flusher.
 *
 * checkwrite will only be called for bdwrite()n buffers.  If we return
 * success the kernel is guaranteed to initiate the buffer write.
 *
 * bioops callback.  The read check is a no-op that always returns 0,
 * so io_token is not needed here.
 */
static int
hammer_io_checkread(struct buf *bp)
{
	return 0;
}
| 1284 | ||
| b0aab9b9 | 1285 | /* |
| 77912481 MD |
1286 | * The kernel is asking us whether it can write out a dirty buffer or not. |
| 1287 | * | |
| b0aab9b9 MD |
1288 | * bioops callback - hold io_token |
| 1289 | */ | |
| 66325755 MD |
1290 | static int |
| 1291 | hammer_io_checkwrite(struct buf *bp) | |
| 1292 | { | |
| 10a5d1ba | 1293 | hammer_io_t io = (void *)LIST_FIRST(&bp->b_dep); |
| b0aab9b9 | 1294 | hammer_mount_t hmp = io->hmp; |
| 66325755 | 1295 | |
| 77062c8a MD |
1296 | /* |
| 1297 | * This shouldn't happen under normal operation. | |
| 1298 | */ | |
| b0aab9b9 | 1299 | lwkt_gettoken(&hmp->io_token); |
| 77062c8a MD |
1300 | if (io->type == HAMMER_STRUCTURE_VOLUME || |
| 1301 | io->type == HAMMER_STRUCTURE_META_BUFFER) { | |
| 1302 | if (!panicstr) | |
| 1303 | panic("hammer_io_checkwrite: illegal buffer"); | |
| a99b9ea2 MD |
1304 | if ((bp->b_flags & B_LOCKED) == 0) { |
| 1305 | bp->b_flags |= B_LOCKED; | |
| b0aab9b9 | 1306 | atomic_add_int(&hammer_count_io_locked, 1); |
| a99b9ea2 | 1307 | } |
| b0aab9b9 | 1308 | lwkt_reltoken(&hmp->io_token); |
| 77062c8a MD |
1309 | return(1); |
| 1310 | } | |
| c9b9e29d | 1311 | |
| 055f5ff8 | 1312 | /* |
| 77912481 MD |
1313 | * We have to be able to interlock the IO to safely modify any |
| 1314 | * of its fields without holding the fs_token. If we can't lock | |
| 1315 | * it then we are racing someone. | |
| 1316 | * | |
| 1317 | * Our ownership of the bp lock prevents the io from being ripped | |
| 1318 | * out from under us. | |
| 1319 | */ | |
| 1320 | if (hammer_try_interlock_norefs(&io->lock) == 0) { | |
| 1321 | bp->b_flags |= B_LOCKED; | |
| 1322 | atomic_add_int(&hammer_count_io_locked, 1); | |
| 1323 | lwkt_reltoken(&hmp->io_token); | |
| 1324 | return(1); | |
| 1325 | } | |
| 1326 | ||
| 1327 | /* | |
| 1328 | * The modified bit must be cleared prior to the initiation of | |
| 1329 | * any IO (returning 0 initiates the IO). Because this is a | |
| 1330 | * normal data buffer hammer_io_clear_modify() runs through a | |
| 1331 | * simple degenerate case. | |
| 1332 | * | |
| 1333 | * Return 0 will cause the kernel to initiate the IO, and we | |
| 1334 | * must normally clear the modified bit before we begin. If | |
| 1335 | * the io has modify_refs we do not clear the modified bit, | |
| 1336 | * otherwise we may miss changes. | |
| 5c8d05e2 MD |
1337 | * |
| 1338 | * Only data and undo buffers can reach here. These buffers do | |
| 1339 | * not have terminal crc functions but we temporarily reference | |
| 1340 | * the IO anyway, just in case. | |
| b33e2cc0 | 1341 | */ |
| 5c8d05e2 MD |
1342 | if (io->modify_refs == 0 && io->modified) { |
| 1343 | hammer_ref(&io->lock); | |
| 4a2796f3 | 1344 | hammer_io_clear_modify(io, 0); |
| 250aec18 | 1345 | hammer_rel(&io->lock); |
| 5c8d05e2 MD |
1346 | } else if (io->modified) { |
| 1347 | KKASSERT(io->type == HAMMER_STRUCTURE_DATA_BUFFER); | |
| 1348 | } | |
| f90dde4c MD |
1349 | |
| 1350 | /* | |
| 1351 | * The kernel is going to start the IO, set io->running. | |
| 1352 | */ | |
| 1353 | KKASSERT(io->running == 0); | |
| 1354 | io->running = 1; | |
| 3583bbb4 MD |
1355 | atomic_add_long(&io->hmp->io_running_space, io->bytes); |
| 1356 | atomic_add_long(&hammer_count_io_running_write, io->bytes); | |
| eddadaee | 1357 | TAILQ_INSERT_TAIL(&io->hmp->iorun_list, io, iorun_entry); |
| b0aab9b9 | 1358 | |
| 77912481 | 1359 | hammer_put_interlock(&io->lock, 1); |
| b0aab9b9 MD |
1360 | lwkt_reltoken(&hmp->io_token); |
| 1361 | ||
| 055f5ff8 | 1362 | return(0); |
| 66325755 MD |
1363 | } |
| 1364 | ||
/*
 * Return non-zero if we wish to delay the kernel's attempt to flush
 * this buffer to disk.  HAMMER never asks for a delay here, the result
 * is always 0.
 *
 * The bioops callbacks normally hold io_token; as a no-op this one
 * does not need it.
 */
static int
hammer_io_countdeps(struct buf *bp, int n)
{
	return 0;
}
| 1377 | ||
| 1378 | struct bio_ops hammer_bioops = { | |
| 1379 | .io_start = hammer_io_start, | |
| 1380 | .io_complete = hammer_io_complete, | |
| 1381 | .io_deallocate = hammer_io_deallocate, | |
| 1382 | .io_fsync = hammer_io_fsync, | |
| 1383 | .io_sync = hammer_io_sync, | |
| 1384 | .io_movedeps = hammer_io_movedeps, | |
| 1385 | .io_countdeps = hammer_io_countdeps, | |
| 1386 | .io_checkread = hammer_io_checkread, | |
| 1387 | .io_checkwrite = hammer_io_checkwrite, | |
| 1388 | }; | |
| 1389 | ||
| 47637bff MD |
1390 | /************************************************************************ |
| 1391 | * DIRECT IO OPS * | |
| 1392 | ************************************************************************ | |
| 1393 | * | |
| 1394 | * These functions operate directly on the buffer cache buffer associated | |
| 1395 | * with a front-end vnode rather than a back-end device vnode. | |
| 1396 | */ | |
| 1397 | ||
| 1398 | /* | |
| 1399 | * Read a buffer associated with a front-end vnode directly from the | |
| 1b0ab2c3 MD |
1400 | * disk media. The bio may be issued asynchronously. If leaf is non-NULL |
| 1401 | * we validate the CRC. | |
| a99b9ea2 | 1402 | * |
| 1b0ab2c3 MD |
1403 | * We must check for the presence of a HAMMER buffer to handle the case |
| 1404 | * where the reblocker has rewritten the data (which it does via the HAMMER | |
| 1405 | * buffer system, not via the high-level vnode buffer cache), but not yet | |
| 1406 | * committed the buffer to the media. | |
| 47637bff MD |
1407 | */ |
| 1408 | int | |
| 1b0ab2c3 MD |
1409 | hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio, |
| 1410 | hammer_btree_leaf_elm_t leaf) | |
| 47637bff | 1411 | { |
| 1b0ab2c3 | 1412 | hammer_off_t buf_offset; |
| 47637bff MD |
1413 | hammer_off_t zone2_offset; |
| 1414 | hammer_volume_t volume; | |
| 1415 | struct buf *bp; | |
| 1416 | struct bio *nbio; | |
| 1417 | int vol_no; | |
| 1418 | int error; | |
| 1419 | ||
| 1b0ab2c3 MD |
1420 | buf_offset = bio->bio_offset; |
| 1421 | KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) == | |
| 1422 | HAMMER_ZONE_LARGE_DATA); | |
| 1423 | ||
| 1424 | /* | |
| 1425 | * The buffer cache may have an aliased buffer (the reblocker can | |
| 1426 | * write them). If it does we have to sync any dirty data before | |
| 1427 | * we can build our direct-read. This is a non-critical code path. | |
| 1428 | */ | |
| 1429 | bp = bio->bio_buf; | |
| 1430 | hammer_sync_buffers(hmp, buf_offset, bp->b_bufsize); | |
| 4a2796f3 | 1431 | |
| 1b0ab2c3 MD |
1432 | /* |
| 1433 | * Resolve to a zone-2 offset. The conversion just requires | |
| 1434 | * munging the top 4 bits but we want to abstract it anyway | |
| 1435 | * so the blockmap code can verify the zone assignment. | |
| 1436 | */ | |
| 1437 | zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error); | |
| 1438 | if (error) | |
| 1439 | goto done; | |
| 43c665ae MD |
1440 | KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) == |
| 1441 | HAMMER_ZONE_RAW_BUFFER); | |
| 1442 | ||
| 1b0ab2c3 MD |
1443 | /* |
| 1444 | * Resolve volume and raw-offset for 3rd level bio. The | |
| 1445 | * offset will be specific to the volume. | |
| 1446 | */ | |
| 43c665ae MD |
1447 | vol_no = HAMMER_VOL_DECODE(zone2_offset); |
| 1448 | volume = hammer_get_volume(hmp, vol_no, &error); | |
| 1449 | if (error == 0 && zone2_offset >= volume->maxbuf_off) | |
| 1450 | error = EIO; | |
| 1451 | ||
| 47637bff | 1452 | if (error == 0) { |
| e469566b MD |
1453 | /* |
| 1454 | * 3rd level bio | |
| 1455 | */ | |
| 43c665ae MD |
1456 | nbio = push_bio(bio); |
| 1457 | nbio->bio_offset = volume->ondisk->vol_buf_beg + | |
| e469566b | 1458 | (zone2_offset & HAMMER_OFF_SHORT_MASK); |
| 1b0ab2c3 MD |
1459 | #if 0 |
| 1460 | /* | |
| 1461 | * XXX disabled - our CRC check doesn't work if the OS | |
| 1462 | * does bogus_page replacement on the direct-read. | |
| 1463 | */ | |
| 1464 | if (leaf && hammer_verify_data) { | |
| 1465 | nbio->bio_done = hammer_io_direct_read_complete; | |
| 1466 | nbio->bio_caller_info1.uvalue32 = leaf->data_crc; | |
| 1467 | } | |
| 1468 | #endif | |
| ce0138a6 | 1469 | hammer_stats_disk_read += bp->b_bufsize; |
| 43c665ae | 1470 | vn_strategy(volume->devvp, nbio); |
| 47637bff | 1471 | } |
| 43c665ae | 1472 | hammer_rel_volume(volume, 0); |
| 1b0ab2c3 | 1473 | done: |
| 47637bff | 1474 | if (error) { |
| cebe9493 | 1475 | kprintf("hammer_direct_read: failed @ %016llx\n", |
| 973c11b9 | 1476 | (long long)zone2_offset); |
| 47637bff MD |
1477 | bp->b_error = error; |
| 1478 | bp->b_flags |= B_ERROR; | |
| 1479 | biodone(bio); | |
| 1480 | } | |
| 1481 | return(error); | |
| 1482 | } | |
| 1483 | ||
| 9a98f3cc MD |
1484 | /* |
| 1485 | * This works similarly to hammer_io_direct_read() except instead of | |
| 1486 | * directly reading from the device into the bio we instead indirectly | |
| 1487 | * read through the device's buffer cache and then copy the data into | |
| 1488 | * the bio. | |
| 1489 | * | |
| 1490 | * If leaf is non-NULL and validation is enabled, the CRC will be checked. | |
| 1491 | * | |
| 1492 | * This routine also executes asynchronously. It allows hammer strategy | |
| 1493 | * calls to operate asynchronously when in double_buffer mode (in addition | |
| 1494 | * to operating asynchronously when in normal mode). | |
| 1495 | */ | |
| 1496 | int | |
| 1497 | hammer_io_indirect_read(hammer_mount_t hmp, struct bio *bio, | |
| 1498 | hammer_btree_leaf_elm_t leaf) | |
| 1499 | { | |
| 1500 | hammer_off_t buf_offset; | |
| 1501 | hammer_off_t zone2_offset; | |
| 1502 | hammer_volume_t volume; | |
| 1503 | struct buf *bp; | |
| 1504 | int vol_no; | |
| 1505 | int error; | |
| 1506 | ||
| 1507 | buf_offset = bio->bio_offset; | |
| 1508 | KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) == | |
| 1509 | HAMMER_ZONE_LARGE_DATA); | |
| 1510 | ||
| 1511 | /* | |
| 1512 | * The buffer cache may have an aliased buffer (the reblocker can | |
| 1513 | * write them). If it does we have to sync any dirty data before | |
| 1514 | * we can build our direct-read. This is a non-critical code path. | |
| 1515 | */ | |
| 1516 | bp = bio->bio_buf; | |
| 1517 | hammer_sync_buffers(hmp, buf_offset, bp->b_bufsize); | |
| 1518 | ||
| 1519 | /* | |
| 1520 | * Resolve to a zone-2 offset. The conversion just requires | |
| 1521 | * munging the top 4 bits but we want to abstract it anyway | |
| 1522 | * so the blockmap code can verify the zone assignment. | |
| 1523 | */ | |
| 1524 | zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error); | |
| 1525 | if (error) | |
| 1526 | goto done; | |
| 1527 | KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) == | |
| 1528 | HAMMER_ZONE_RAW_BUFFER); | |
| 1529 | ||
| 1530 | /* | |
| 1531 | * Resolve volume and raw-offset for 3rd level bio. The | |
| 1532 | * offset will be specific to the volume. | |
| 1533 | */ | |
| 1534 | vol_no = HAMMER_VOL_DECODE(zone2_offset); | |
| 1535 | volume = hammer_get_volume(hmp, vol_no, &error); | |
| 1536 | if (error == 0 && zone2_offset >= volume->maxbuf_off) | |
| 1537 | error = EIO; | |
| 1538 | ||
| 1539 | if (error == 0) { | |
| 1540 | /* | |
| 1541 | * Convert to the raw volume->devvp offset and acquire | |
| 1542 | * the buf, issuing async I/O if necessary. | |
| 1543 | */ | |
| 1544 | buf_offset = volume->ondisk->vol_buf_beg + | |
| 1545 | (zone2_offset & HAMMER_OFF_SHORT_MASK); | |
| 1546 | ||
| 1547 | if (leaf && hammer_verify_data) { | |
| 1548 | bio->bio_caller_info1.uvalue32 = leaf->data_crc; | |
| 1549 | bio->bio_caller_info2.index = 1; | |
| 1550 | } else { | |
| 1551 | bio->bio_caller_info2.index = 0; | |
| 1552 | } | |
| 1553 | breadcb(volume->devvp, buf_offset, bp->b_bufsize, | |
| 1554 | hammer_indirect_callback, bio); | |
| 1555 | } | |
| 1556 | hammer_rel_volume(volume, 0); | |
| 1557 | done: | |
| 1558 | if (error) { | |
| 1559 | kprintf("hammer_direct_read: failed @ %016llx\n", | |
| 1560 | (long long)zone2_offset); | |
| 1561 | bp->b_error = error; | |
| 1562 | bp->b_flags |= B_ERROR; | |
| 1563 | biodone(bio); | |
| 1564 | } | |
| 1565 | return(error); | |
| 1566 | } | |
| 1567 | ||
| 1568 | /* | |
| 1569 | * Indirect callback on completion. bio/bp specify the device-backed | |
| 1570 | * buffer. bio->bio_caller_info1.ptr holds obio. | |
| 1571 | * | |
| 1572 | * obio/obp is the original regular file buffer. obio->bio_caller_info* | |
| 1573 | * contains the crc specification. | |
| 1574 | * | |
| 1575 | * We are responsible for calling bpdone() and bqrelse() on bio/bp, and | |
| 1576 | * for calling biodone() on obio. | |
| 1577 | */ | |
| 1578 | static void | |
| 1579 | hammer_indirect_callback(struct bio *bio) | |
| 1580 | { | |
| 1581 | struct buf *bp = bio->bio_buf; | |
| 1582 | struct buf *obp; | |
| 1583 | struct bio *obio; | |
| 1584 | ||
| 1585 | /* | |
| 1586 | * If BIO_DONE is already set the device buffer was already | |
| 1587 | * fully valid (B_CACHE). If it is not set then I/O was issued | |
| 1588 | * and we have to run I/O completion as the last bio. | |
| 1589 | * | |
| 1590 | * Nobody is waiting for our device I/O to complete, we are | |
| 1591 | * responsible for bqrelse()ing it which means we also have to do | |
| 1592 | * the equivalent of biowait() and clear BIO_DONE (which breadcb() | |
| 1593 | * may have set). | |
| 1594 | * | |
| 1595 | * Any preexisting device buffer should match the requested size, | |
| 1596 | * but due to bigblock recycling and other factors there is some | |
| 1597 | * fragility there, so we assert that the device buffer covers | |
| 1598 | * the request. | |
| 1599 | */ | |
| 1600 | if ((bio->bio_flags & BIO_DONE) == 0) | |
| 1601 | bpdone(bp, 0); | |
| 1602 | bio->bio_flags &= ~(BIO_DONE | BIO_SYNC); | |
| 1603 | ||
| 1604 | obio = bio->bio_caller_info1.ptr; | |
| 1605 | obp = obio->bio_buf; | |
| 1606 | ||
| 1607 | if (bp->b_flags & B_ERROR) { | |
| 1608 | obp->b_flags |= B_ERROR; | |
| 1609 | obp->b_error = bp->b_error; | |
| 1610 | } else if (obio->bio_caller_info2.index && | |
| 1611 | obio->bio_caller_info1.uvalue32 != | |
| 1612 | crc32(bp->b_data, bp->b_bufsize)) { | |
| 1613 | obp->b_flags |= B_ERROR; | |
| 1614 | obp->b_error = EIO; | |
| 1615 | } else { | |
| 1616 | KKASSERT(bp->b_bufsize >= obp->b_bufsize); | |
| 1617 | bcopy(bp->b_data, obp->b_data, obp->b_bufsize); | |
| 1618 | obp->b_resid = 0; | |
| 1619 | obp->b_flags |= B_AGE; | |
| 1620 | } | |
| 1621 | biodone(obio); | |
| 1622 | bqrelse(bp); | |
| 1623 | } | |
| 1624 | ||
#if 0
/*
 * On completion of the BIO this callback must check the data CRC
 * and chain to the previous bio.
 *
 * The expected CRC is taken from nbio->bio_caller_info1.uvalue32.  On a
 * mismatch the buf is flagged B_ERROR/EIO before the previous bio is
 * completed.
 *
 * MPSAFE - since we do not modify any hammer_records we do not need
 *	    io_token.
 *
 * NOTE: MPSAFE callback
 *
 * NOTE: Currently compiled out together with its only user in
 *	 hammer_io_direct_read().
 */
static
void
hammer_io_direct_read_complete(struct bio *nbio)
{
	struct bio *obio;
	struct buf *bp;
	u_int32_t rec_crc = nbio->bio_caller_info1.uvalue32;

	bp = nbio->bio_buf;
	if (crc32(bp->b_data, bp->b_bufsize) != rec_crc) {
		/*
		 * Cast the offset so the argument matches %llx, as the
		 * other diagnostics in this file do.
		 */
		kprintf("HAMMER: data_crc error @%016llx/%d\n",
			(long long)nbio->bio_offset, bp->b_bufsize);
		if (hammer_debug_critical)
			Debugger("data_crc on read");
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
	}
	obio = pop_bio(nbio);
	biodone(obio);
}
#endif
| 1656 | ||
| 47637bff MD |
1657 | /* |
| 1658 | * Write a buffer associated with a front-end vnode directly to the | |
| 1659 | * disk media. The bio may be issued asynchronously. | |
| 1b0ab2c3 | 1660 | * |
| 77912481 | 1661 | * The BIO is associated with the specified record and RECG_DIRECT_IO |
| e469566b | 1662 | * is set. The recorded is added to its object. |
| 47637bff MD |
1663 | */ |
| 1664 | int | |
| 6362a262 MD |
1665 | hammer_io_direct_write(hammer_mount_t hmp, struct bio *bio, |
| 1666 | hammer_record_t record) | |
| 47637bff | 1667 | { |
| 1b0ab2c3 | 1668 | hammer_btree_leaf_elm_t leaf = &record->leaf; |
| 0832c9bb | 1669 | hammer_off_t buf_offset; |
| 47637bff MD |
1670 | hammer_off_t zone2_offset; |
| 1671 | hammer_volume_t volume; | |
| 0832c9bb | 1672 | hammer_buffer_t buffer; |
| 47637bff MD |
1673 | struct buf *bp; |
| 1674 | struct bio *nbio; | |
| 0832c9bb | 1675 | char *ptr; |
| 47637bff MD |
1676 | int vol_no; |
| 1677 | int error; | |
| 1678 | ||
| 0832c9bb MD |
1679 | buf_offset = leaf->data_offset; |
| 1680 | ||
| 1681 | KKASSERT(buf_offset > HAMMER_ZONE_BTREE); | |
| 47637bff MD |
1682 | KKASSERT(bio->bio_buf->b_cmd == BUF_CMD_WRITE); |
| 1683 | ||
| 6362a262 MD |
1684 | /* |
| 1685 | * Issue or execute the I/O. The new memory record must replace | |
| 1686 | * the old one before the I/O completes, otherwise a reaquisition of | |
| 1687 | * the buffer will load the old media data instead of the new. | |
| 1688 | */ | |
| 0832c9bb | 1689 | if ((buf_offset & HAMMER_BUFMASK) == 0 && |
| 4a2796f3 | 1690 | leaf->data_len >= HAMMER_BUFSIZE) { |
| 0832c9bb MD |
1691 | /* |
| 1692 | * We are using the vnode's bio to write directly to the | |
| 1693 | * media, any hammer_buffer at the same zone-X offset will | |
| 1694 | * now have stale data. | |
| 1695 | */ | |
| 1696 | zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error); | |
| 47637bff MD |
1697 | vol_no = HAMMER_VOL_DECODE(zone2_offset); |
| 1698 | volume = hammer_get_volume(hmp, vol_no, &error); | |
| 1699 | ||
| 1700 | if (error == 0 && zone2_offset >= volume->maxbuf_off) | |
| 1701 | error = EIO; | |
| 1702 | if (error == 0) { | |
| 0832c9bb | 1703 | bp = bio->bio_buf; |
| 4a2796f3 | 1704 | KKASSERT((bp->b_bufsize & HAMMER_BUFMASK) == 0); |
| e469566b | 1705 | /* |
| 4a2796f3 MD |
1706 | hammer_del_buffers(hmp, buf_offset, |
| 1707 | zone2_offset, bp->b_bufsize); | |
| e469566b | 1708 | */ |
| 1b0ab2c3 | 1709 | |
| 43c665ae MD |
1710 | /* |
| 1711 | * Second level bio - cached zone2 offset. | |
| 1b0ab2c3 MD |
1712 | * |
| 1713 | * (We can put our bio_done function in either the | |
| 1714 | * 2nd or 3rd level). | |
| 43c665ae | 1715 | */ |
| 47637bff | 1716 | nbio = push_bio(bio); |
| 43c665ae | 1717 | nbio->bio_offset = zone2_offset; |
| 1b0ab2c3 MD |
1718 | nbio->bio_done = hammer_io_direct_write_complete; |
| 1719 | nbio->bio_caller_info1.ptr = record; | |
| e469566b | 1720 | record->zone2_offset = zone2_offset; |
| 77912481 MD |
1721 | record->gflags |= HAMMER_RECG_DIRECT_IO | |
| 1722 | HAMMER_RECG_DIRECT_INVAL; | |
| 43c665ae MD |
1723 | |
| 1724 | /* | |
| 1725 | * Third level bio - raw offset specific to the | |
| 1726 | * correct volume. | |
| 1727 | */ | |
| 1728 | zone2_offset &= HAMMER_OFF_SHORT_MASK; | |
| 1729 | nbio = push_bio(nbio); | |
| 47637bff | 1730 | nbio->bio_offset = volume->ondisk->vol_buf_beg + |
| 0832c9bb | 1731 | zone2_offset; |
| ce0138a6 | 1732 | hammer_stats_disk_write += bp->b_bufsize; |
| 6362a262 | 1733 | hammer_ip_replace_bulk(hmp, record); |
| 47637bff | 1734 | vn_strategy(volume->devvp, nbio); |
| 748efb59 | 1735 | hammer_io_flush_mark(volume); |
| 47637bff MD |
1736 | } |
| 1737 | hammer_rel_volume(volume, 0); | |
| 0832c9bb | 1738 | } else { |
| 1b0ab2c3 MD |
1739 | /* |
| 1740 | * Must fit in a standard HAMMER buffer. In this case all | |
| 77912481 | 1741 | * consumers use the HAMMER buffer system and RECG_DIRECT_IO |
| 1b0ab2c3 MD |
1742 | * does not need to be set-up. |
| 1743 | */ | |
| 0832c9bb MD |
1744 | KKASSERT(((buf_offset ^ (buf_offset + leaf->data_len - 1)) & ~HAMMER_BUFMASK64) == 0); |
| 1745 | buffer = NULL; | |
| 1746 | ptr = hammer_bread(hmp, buf_offset, &error, &buffer); | |
| 1747 | if (error == 0) { | |
| 0832c9bb | 1748 | bp = bio->bio_buf; |
| 7bc5b8c2 | 1749 | bp->b_flags |= B_AGE; |
| 0832c9bb MD |
1750 | hammer_io_modify(&buffer->io, 1); |
| 1751 | bcopy(bp->b_data, ptr, leaf->data_len); | |
| 1752 | hammer_io_modify_done(&buffer->io); | |
| 7bc5b8c2 | 1753 | hammer_rel_buffer(buffer, 0); |
| 0832c9bb | 1754 | bp->b_resid = 0; |
| 6362a262 | 1755 | hammer_ip_replace_bulk(hmp, record); |
| 0832c9bb MD |
1756 | biodone(bio); |
| 1757 | } | |
| 47637bff | 1758 | } |
| 6362a262 | 1759 | if (error) { |
| e469566b | 1760 | /* |
| 6362a262 MD |
1761 | * Major suckage occured. Also note: The record was |
| 1762 | * never added to the tree so we do not have to worry | |
| 1763 | * about the backend. | |
| e469566b | 1764 | */ |
| cebe9493 | 1765 | kprintf("hammer_direct_write: failed @ %016llx\n", |
| 973c11b9 | 1766 | (long long)leaf->data_offset); |
| 47637bff MD |
1767 | bp = bio->bio_buf; |
| 1768 | bp->b_resid = 0; | |
| 1769 | bp->b_error = EIO; | |
| 1770 | bp->b_flags |= B_ERROR; | |
| 1771 | biodone(bio); | |
| e469566b MD |
1772 | record->flags |= HAMMER_RECF_DELETED_FE; |
| 1773 | hammer_rel_mem_record(record); | |
| 47637bff MD |
1774 | } |
| 1775 | return(error); | |
| 1776 | } | |
| 1777 | ||
| 43c665ae | 1778 | /* |
| 1b0ab2c3 MD |
1779 | * On completion of the BIO this callback must disconnect |
| 1780 | * it from the hammer_record and chain to the previous bio. | |
| cdb6e4e6 MD |
1781 | * |
| 1782 | * An I/O error forces the mount to read-only. Data buffers | |
| 1783 | * are not B_LOCKED like meta-data buffers are, so we have to | |
| 1784 | * throw the buffer away to prevent the kernel from retrying. | |
| 77912481 MD |
1785 | * |
| 1786 | * NOTE: MPSAFE callback, only modify fields we have explicit | |
| 1787 | * access to (the bp and the record->gflags). | |
| 1b0ab2c3 MD |
1788 | */ |
| 1789 | static | |
| 1790 | void | |
| 1791 | hammer_io_direct_write_complete(struct bio *nbio) | |
| 1792 | { | |
| 1793 | struct bio *obio; | |
| e469566b | 1794 | struct buf *bp; |
| b0aab9b9 MD |
1795 | hammer_record_t record; |
| 1796 | hammer_mount_t hmp; | |
| 1797 | ||
| 1798 | record = nbio->bio_caller_info1.ptr; | |
| 1799 | KKASSERT(record != NULL); | |
| 1800 | hmp = record->ip->hmp; | |
| 1801 | ||
| 1802 | lwkt_gettoken(&hmp->io_token); | |
| 1b0ab2c3 | 1803 | |
| e469566b | 1804 | bp = nbio->bio_buf; |
| 1b0ab2c3 | 1805 | obio = pop_bio(nbio); |
| e469566b | 1806 | if (bp->b_flags & B_ERROR) { |
| 77912481 | 1807 | lwkt_gettoken(&hmp->fs_token); |
| b0aab9b9 | 1808 | hammer_critical_error(hmp, record->ip, |
| e469566b | 1809 | bp->b_error, |
| cdb6e4e6 | 1810 | "while writing bulk data"); |
| 77912481 | 1811 | lwkt_reltoken(&hmp->fs_token); |
| e469566b | 1812 | bp->b_flags |= B_INVAL; |
| cdb6e4e6 | 1813 | } |
| 1b0ab2c3 | 1814 | biodone(obio); |
| e469566b | 1815 | |
| 77912481 MD |
1816 | KKASSERT(record->gflags & HAMMER_RECG_DIRECT_IO); |
| 1817 | if (record->gflags & HAMMER_RECG_DIRECT_WAIT) { | |
| 1818 | record->gflags &= ~(HAMMER_RECG_DIRECT_IO | | |
| 1819 | HAMMER_RECG_DIRECT_WAIT); | |
| de996e86 | 1820 | /* record can disappear once DIRECT_IO flag is cleared */ |
| 1b0ab2c3 | 1821 | wakeup(&record->flags); |
| de996e86 | 1822 | } else { |
| 77912481 | 1823 | record->gflags &= ~HAMMER_RECG_DIRECT_IO; |
| de996e86 | 1824 | /* record can disappear once DIRECT_IO flag is cleared */ |
| 1b0ab2c3 | 1825 | } |
| b0aab9b9 | 1826 | lwkt_reltoken(&hmp->io_token); |
| 1b0ab2c3 MD |
1827 | } |
| 1828 | ||
| 1829 | ||
| 1830 | /* | |
| 1831 | * This is called before a record is either committed to the B-Tree | |
| e469566b | 1832 | * or destroyed, to resolve any associated direct-IO. |
| 1b0ab2c3 | 1833 | * |
| e469566b MD |
1834 | * (1) We must wait for any direct-IO related to the record to complete. |
| 1835 | * | |
| 1836 | * (2) We must remove any buffer cache aliases for data accessed via | |
| 1837 | * leaf->data_offset or zone2_offset so non-direct-IO consumers | |
| 1838 | * (the mirroring and reblocking code) do not see stale data. | |
| 1b0ab2c3 MD |
1839 | */ |
| 1840 | void | |
| 1841 | hammer_io_direct_wait(hammer_record_t record) | |
| 1842 | { | |
| b0aab9b9 MD |
1843 | hammer_mount_t hmp = record->ip->hmp; |
| 1844 | ||
| e469566b MD |
1845 | /* |
| 1846 | * Wait for I/O to complete | |
| 1847 | */ | |
| 77912481 | 1848 | if (record->gflags & HAMMER_RECG_DIRECT_IO) { |
| b0aab9b9 | 1849 | lwkt_gettoken(&hmp->io_token); |
| 77912481 MD |
1850 | while (record->gflags & HAMMER_RECG_DIRECT_IO) { |
| 1851 | record->gflags |= HAMMER_RECG_DIRECT_WAIT; | |
| e469566b MD |
1852 | tsleep(&record->flags, 0, "hmdiow", 0); |
| 1853 | } | |
| b0aab9b9 | 1854 | lwkt_reltoken(&hmp->io_token); |
| e469566b MD |
1855 | } |
| 1856 | ||
| 1857 | /* | |
| 362ec2dc MD |
1858 | * Invalidate any related buffer cache aliases associated with the |
| 1859 | * backing device. This is needed because the buffer cache buffer | |
| 1860 | * for file data is associated with the file vnode, not the backing | |
| 1861 | * device vnode. | |
| 1862 | * | |
| 1863 | * XXX I do not think this case can occur any more now that | |
| 1864 | * reservations ensure that all such buffers are removed before | |
| 1865 | * an area can be reused. | |
| e469566b | 1866 | */ |
| 77912481 | 1867 | if (record->gflags & HAMMER_RECG_DIRECT_INVAL) { |
| e469566b | 1868 | KKASSERT(record->leaf.data_offset); |
| b0aab9b9 | 1869 | hammer_del_buffers(hmp, record->leaf.data_offset, |
| 362ec2dc MD |
1870 | record->zone2_offset, record->leaf.data_len, |
| 1871 | 1); | |
| 77912481 | 1872 | record->gflags &= ~HAMMER_RECG_DIRECT_INVAL; |
| 1b0ab2c3 | 1873 | } |
| 1b0ab2c3 MD |
1874 | } |
| 1875 | ||
| 1876 | /* | |
| 43c665ae MD |
1877 | * This is called to remove the second-level cached zone-2 offset from |
| 1878 | * frontend buffer cache buffers, now stale due to a data relocation. | |
| 1879 | * These offsets are generated by cluster_read() via VOP_BMAP, or directly | |
| 1880 | * by hammer_vop_strategy_read(). | |
| 1881 | * | |
| 1882 | * This is rather nasty because here we have something like the reblocker | |
| 1883 | * scanning the raw B-Tree with no held references on anything, really, | |
| 1884 | * other then a shared lock on the B-Tree node, and we have to access the | |
| 1885 | * frontend's buffer cache to check for and clean out the association. | |
| 1886 | * Specifically, if the reblocker is moving data on the disk, these cached | |
| 1887 | * offsets will become invalid. | |
| 1888 | * | |
| 1889 | * Only data record types associated with the large-data zone are subject | |
| 1890 | * to direct-io and need to be checked. | |
| 1891 | * | |
| 1892 | */ | |
| 1893 | void | |
| 1894 | hammer_io_direct_uncache(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf) | |
| 1895 | { | |
| 1896 | struct hammer_inode_info iinfo; | |
| 1897 | int zone; | |
| 1898 | ||
| 1899 | if (leaf->base.rec_type != HAMMER_RECTYPE_DATA) | |
| 1900 | return; | |
| 1901 | zone = HAMMER_ZONE_DECODE(leaf->data_offset); | |
| 1902 | if (zone != HAMMER_ZONE_LARGE_DATA_INDEX) | |
| 1903 | return; | |
| 1904 | iinfo.obj_id = leaf->base.obj_id; | |
| 1905 | iinfo.obj_asof = 0; /* unused */ | |
| 1906 | iinfo.obj_localization = leaf->base.localization & | |
| 5a930e66 | 1907 | HAMMER_LOCALIZE_PSEUDOFS_MASK; |
| 43c665ae MD |
1908 | iinfo.u.leaf = leaf; |
| 1909 | hammer_scan_inode_snapshots(hmp, &iinfo, | |
| 1910 | hammer_io_direct_uncache_callback, | |
| 1911 | leaf); | |
| 1912 | } | |
| 1913 | ||
| 1914 | static int | |
| 1915 | hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data) | |
| 1916 | { | |
| 1917 | hammer_inode_info_t iinfo = data; | |
| 1918 | hammer_off_t data_offset; | |
| 1919 | hammer_off_t file_offset; | |
| 1920 | struct vnode *vp; | |
| 1921 | struct buf *bp; | |
| 1922 | int blksize; | |
| 1923 | ||
| 1924 | if (ip->vp == NULL) | |
| 1925 | return(0); | |
| 1926 | data_offset = iinfo->u.leaf->data_offset; | |
| 1927 | file_offset = iinfo->u.leaf->base.key - iinfo->u.leaf->data_len; | |
| 1928 | blksize = iinfo->u.leaf->data_len; | |
| 1929 | KKASSERT((blksize & HAMMER_BUFMASK) == 0); | |
| 1930 | ||
| 9c90dba2 MD |
1931 | /* |
| 1932 | * Warning: FINDBLK_TEST return stable storage but not stable | |
| 1933 | * contents. It happens to be ok in this case. | |
| 1934 | */ | |
| 43c665ae MD |
1935 | hammer_ref(&ip->lock); |
| 1936 | if (hammer_get_vnode(ip, &vp) == 0) { | |
| b1c20cfa | 1937 | if ((bp = findblk(ip->vp, file_offset, FINDBLK_TEST)) != NULL && |
| 43c665ae MD |
1938 | bp->b_bio2.bio_offset != NOOFFSET) { |
| 1939 | bp = getblk(ip->vp, file_offset, blksize, 0, 0); | |
| 1940 | bp->b_bio2.bio_offset = NOOFFSET; | |
| 1941 | brelse(bp); | |
| 1942 | } | |
| 1943 | vput(vp); | |
| 1944 | } | |
| 1945 | hammer_rel_inode(ip, 0); | |
| 1946 | return(0); | |
| 1947 | } | |
| 47637bff | 1948 | |
| 748efb59 MD |
1949 | |
| 1950 | /* | |
| 1951 | * This function is called when writes may have occured on the volume, | |
| 1952 | * indicating that the device may be holding cached writes. | |
| 1953 | */ | |
| 1954 | static void | |
| 1955 | hammer_io_flush_mark(hammer_volume_t volume) | |
| 1956 | { | |
| 77912481 | 1957 | atomic_set_int(&volume->vol_flags, HAMMER_VOLF_NEEDFLUSH); |
| 748efb59 MD |
1958 | } |
| 1959 | ||
| 1960 | /* | |
| 1961 | * This function ensures that the device has flushed any cached writes out. | |
| 1962 | */ | |
| 1963 | void | |
| 1964 | hammer_io_flush_sync(hammer_mount_t hmp) | |
| 1965 | { | |
| 1966 | hammer_volume_t volume; | |
| 1967 | struct buf *bp_base = NULL; | |
| 1968 | struct buf *bp; | |
| 1969 | ||
| 1970 | RB_FOREACH(volume, hammer_vol_rb_tree, &hmp->rb_vols_root) { | |
| 1971 | if (volume->vol_flags & HAMMER_VOLF_NEEDFLUSH) { | |
| 77912481 MD |
1972 | atomic_clear_int(&volume->vol_flags, |
| 1973 | HAMMER_VOLF_NEEDFLUSH); | |
| 748efb59 MD |
1974 | bp = getpbuf(NULL); |
| 1975 | bp->b_bio1.bio_offset = 0; | |
| 1976 | bp->b_bufsize = 0; | |
| 1977 | bp->b_bcount = 0; | |
| 1978 | bp->b_cmd = BUF_CMD_FLUSH; | |
| 1979 | bp->b_bio1.bio_caller_info1.cluster_head = bp_base; | |
| ae8e83e6 MD |
1980 | bp->b_bio1.bio_done = biodone_sync; |
| 1981 | bp->b_bio1.bio_flags |= BIO_SYNC; | |
| 748efb59 MD |
1982 | bp_base = bp; |
| 1983 | vn_strategy(volume->devvp, &bp->b_bio1); | |
| 1984 | } | |
| 1985 | } | |
| 1986 | while ((bp = bp_base) != NULL) { | |
| 1987 | bp_base = bp->b_bio1.bio_caller_info1.cluster_head; | |
| ae8e83e6 | 1988 | biowait(&bp->b_bio1, "hmrFLS"); |
| 748efb59 MD |
1989 | relpbuf(bp, NULL); |
| 1990 | } | |
| 1991 | } | |
| ba298df1 MD |
1992 | |
| 1993 | /* | |
| 1994 | * Limit the amount of backlog which we allow to build up | |
| 1995 | */ | |
| 1996 | void | |
| 1997 | hammer_io_limit_backlog(hammer_mount_t hmp) | |
| 1998 | { | |
| 3038a8ca | 1999 | waitrunningbufspace(); |
| ba298df1 | 2000 | } |