| 1 | /* |
| 2 | * Copyright (c) 2013-2014 The DragonFly Project. All rights reserved. |
| 3 | * |
| 4 | * This code is derived from software contributed to The DragonFly Project |
| 5 | * by Matthew Dillon <dillon@dragonflybsd.org> |
| 6 | * |
| 7 | * Redistribution and use in source and binary forms, with or without |
| 8 | * modification, are permitted provided that the following conditions |
| 9 | * are met: |
| 10 | * |
| 11 | * 1. Redistributions of source code must retain the above copyright |
| 12 | * notice, this list of conditions and the following disclaimer. |
| 13 | * 2. Redistributions in binary form must reproduce the above copyright |
| 14 | * notice, this list of conditions and the following disclaimer in |
| 15 | * the documentation and/or other materials provided with the |
| 16 | * distribution. |
| 17 | * 3. Neither the name of The DragonFly Project nor the names of its |
| 18 | * contributors may be used to endorse or promote products derived |
| 19 | * from this software without specific, prior written permission. |
| 20 | * |
| 21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 22 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
| 24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
| 25 | * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
| 26 | * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| 27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| 28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED |
| 29 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| 30 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
| 31 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 32 | * SUCH DAMAGE. |
| 33 | */ |
| 34 | /* |
| 35 | * The cluster module collects multiple chains representing the same |
| 36 | * information into a single entity. It allows direct access to media |
| 37 | * data as long as it is not blockref array data. Meaning, basically, |
| 38 | * just inode and file data. |
| 39 | * |
| 40 | * This module also handles I/O dispatch, status rollup, and various |
| 41 | * mastership arrangements including quorum operations. It effectively |
| 42 | * presents one topology to the vnops layer. |
| 43 | * |
| 44 | * Many of the API calls mimic chain API calls but operate on clusters |
| 45 | * instead of chains. Please see hammer2_chain.c for more complete code |
| 46 | * documentation of the API functions. |
| 47 | */ |
| 48 | #include <sys/cdefs.h> |
| 49 | #include <sys/param.h> |
| 50 | #include <sys/systm.h> |
| 51 | #include <sys/types.h> |
| 52 | #include <sys/lock.h> |
| 53 | #include <sys/uuid.h> |
| 54 | |
| 55 | #include "hammer2.h" |
| 56 | |
| 57 | /* |
| 58 | * Returns TRUE if any chain in the cluster needs to be resized. |
| 59 | */ |
| 60 | int |
| 61 | hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes) |
| 62 | { |
| 63 | hammer2_chain_t *chain; |
| 64 | int i; |
| 65 | |
| 66 | for (i = 0; i < cluster->nchains; ++i) { |
| 67 | chain = cluster->array[i]; |
| 68 | if (chain && chain->bytes != bytes) |
| 69 | return 1; |
| 70 | } |
| 71 | return 0; |
| 72 | } |
| 73 | |
| 74 | uint8_t |
| 75 | hammer2_cluster_type(hammer2_cluster_t *cluster) |
| 76 | { |
| 77 | return(cluster->focus->bref.type); |
| 78 | } |
| 79 | |
| 80 | int |
| 81 | hammer2_cluster_modified(hammer2_cluster_t *cluster) |
| 82 | { |
| 83 | return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0); |
| 84 | } |
| 85 | |
| 86 | /* |
| 87 | * Return a bref representative of the cluster. Any data offset is removed |
| 88 | * (since it would only be applicable to a particular chain in the cluster). |
| 89 | * |
| 90 | * However, the radix portion of data_off is used for many purposes and will |
| 91 | * be retained. |
| 92 | */ |
| 93 | void |
| 94 | hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref) |
| 95 | { |
| 96 | *bref = cluster->focus->bref; |
| 97 | bref->data_off &= HAMMER2_OFF_MASK_RADIX; |
| 98 | } |
| 99 | |
| 100 | void |
| 101 | hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags) |
| 102 | { |
| 103 | hammer2_chain_t *chain; |
| 104 | int i; |
| 105 | |
| 106 | for (i = 0; i < cluster->nchains; ++i) { |
| 107 | chain = cluster->array[i]; |
| 108 | if (chain) |
| 109 | atomic_set_int(&chain->flags, flags); |
| 110 | } |
| 111 | } |
| 112 | |
| 113 | void |
| 114 | hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster) |
| 115 | { |
| 116 | hammer2_chain_t *chain; |
| 117 | int i; |
| 118 | |
| 119 | for (i = 0; i < cluster->nchains; ++i) { |
| 120 | chain = cluster->array[i]; |
| 121 | if (chain) |
| 122 | hammer2_chain_setflush(trans, chain); |
| 123 | } |
| 124 | } |
| 125 | |
| 126 | void |
| 127 | hammer2_cluster_setmethod_check(hammer2_trans_t *trans, |
| 128 | hammer2_cluster_t *cluster, |
| 129 | int check_algo) |
| 130 | { |
| 131 | hammer2_chain_t *chain; |
| 132 | int i; |
| 133 | |
| 134 | for (i = 0; i < cluster->nchains; ++i) { |
| 135 | chain = cluster->array[i]; |
| 136 | if (chain) { |
| 137 | KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED); |
| 138 | chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1); |
| 139 | chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo); |
| 140 | } |
| 141 | } |
| 142 | } |
| 143 | |
| 144 | /* |
| 145 | * Create a cluster with one ref from the specified chain. The chain |
| 146 | * is not further referenced. The caller typically supplies a locked |
| 147 | * chain and transfers ownership to the cluster. |
| 148 | */ |
| 149 | hammer2_cluster_t * |
| 150 | hammer2_cluster_from_chain(hammer2_chain_t *chain) |
| 151 | { |
| 152 | hammer2_cluster_t *cluster; |
| 153 | |
| 154 | cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); |
| 155 | cluster->array[0] = chain; |
| 156 | cluster->nchains = 1; |
| 157 | cluster->focus = chain; |
| 158 | cluster->pmp = chain->pmp; |
| 159 | cluster->refs = 1; |
| 160 | |
| 161 | return cluster; |
| 162 | } |
| 163 | |
| 164 | /* |
| 165 | * Allocates a cluster and its underlying chain structures. The underlying |
| 166 | * chains will be locked. The cluster and underlying chains will have one |
| 167 | * ref. |
| 168 | */ |
| 169 | hammer2_cluster_t * |
| 170 | hammer2_cluster_alloc(hammer2_pfsmount_t *pmp, |
| 171 | hammer2_trans_t *trans, hammer2_blockref_t *bref) |
| 172 | { |
| 173 | hammer2_cluster_t *cluster; |
| 174 | hammer2_cluster_t *rcluster; |
| 175 | hammer2_chain_t *chain; |
| 176 | #if 0 |
| 177 | u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); |
| 178 | #endif |
| 179 | int i; |
| 180 | |
| 181 | KKASSERT(pmp != NULL); |
| 182 | |
| 183 | /* |
| 184 | * Construct the appropriate system structure. |
| 185 | */ |
| 186 | switch(bref->type) { |
| 187 | case HAMMER2_BREF_TYPE_INODE: |
| 188 | case HAMMER2_BREF_TYPE_INDIRECT: |
| 189 | case HAMMER2_BREF_TYPE_FREEMAP_NODE: |
| 190 | case HAMMER2_BREF_TYPE_DATA: |
| 191 | case HAMMER2_BREF_TYPE_FREEMAP_LEAF: |
| 192 | /* |
| 193 | * Chain's are really only associated with the hmp but we |
| 194 | * maintain a pmp association for per-mount memory tracking |
| 195 | * purposes. The pmp can be NULL. |
| 196 | */ |
| 197 | break; |
| 198 | case HAMMER2_BREF_TYPE_VOLUME: |
| 199 | case HAMMER2_BREF_TYPE_FREEMAP: |
| 200 | chain = NULL; |
| 201 | panic("hammer2_cluster_alloc volume type illegal for op"); |
| 202 | default: |
| 203 | chain = NULL; |
| 204 | panic("hammer2_cluster_alloc: unrecognized blockref type: %d", |
| 205 | bref->type); |
| 206 | } |
| 207 | |
| 208 | cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); |
| 209 | cluster->refs = 1; |
| 210 | |
| 211 | rcluster = &pmp->iroot->cluster; |
| 212 | for (i = 0; i < rcluster->nchains; ++i) { |
| 213 | chain = hammer2_chain_alloc(rcluster->array[i]->hmp, |
| 214 | pmp, trans, bref); |
| 215 | #if 0 |
| 216 | chain->hmp = rcluster->array[i]->hmp; |
| 217 | chain->bref = *bref; |
| 218 | chain->bytes = bytes; |
| 219 | chain->refs = 1; |
| 220 | chain->flags = HAMMER2_CHAIN_ALLOCATED; |
| 221 | #endif |
| 222 | |
| 223 | /* |
| 224 | * NOTE: When loading a chain from backing store or creating a |
| 225 | * snapshot, trans will be NULL and the caller is |
| 226 | * responsible for setting these fields. |
| 227 | */ |
| 228 | cluster->array[i] = chain; |
| 229 | } |
| 230 | cluster->nchains = i; |
| 231 | cluster->pmp = pmp; |
| 232 | cluster->focus = cluster->array[0]; |
| 233 | |
| 234 | return (cluster); |
| 235 | } |
| 236 | |
| 237 | /* |
| 238 | * Add a reference to a cluster. |
| 239 | * |
| 240 | * We must also ref the underlying chains in order to allow ref/unlock |
| 241 | * sequences to later re-lock. |
| 242 | */ |
| 243 | void |
| 244 | hammer2_cluster_ref(hammer2_cluster_t *cluster) |
| 245 | { |
| 246 | hammer2_chain_t *chain; |
| 247 | int i; |
| 248 | |
| 249 | atomic_add_int(&cluster->refs, 1); |
| 250 | for (i = 0; i < cluster->nchains; ++i) { |
| 251 | chain = cluster->array[i]; |
| 252 | if (chain) |
| 253 | hammer2_chain_ref(chain); |
| 254 | } |
| 255 | } |
| 256 | |
/*
 * Drop the caller's reference to the cluster.  When the ref count drops to
 * zero this function frees the cluster and drops all underlying chains.
 *
 * In-progress read I/Os are typically detached from the cluster once the
 * first one returns (the remaining stay attached to the DIOs but are then
 * ignored and drop naturally).
 */
void
hammer2_cluster_drop(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain) {
			hammer2_chain_drop(chain);
			/*
			 * On the last cluster ref, NULL-out the entry so a
			 * stale chain pointer cannot be reused after the
			 * structure is freed below.
			 */
			if (cluster->refs == 1)
				cluster->array[i] = NULL;
		}
	}
	/* drop our ref; free the cluster on the 1->0 transition */
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;
		kfree(cluster, M_HAMMER2);
		/* cluster = NULL; safety */
	}
}
| 286 | |
/*
 * Briefly sleep on the cluster's focus chain.  The 1-tick timeout makes
 * this a polling delay rather than an event wait; no matching wakeup is
 * required.
 */
void
hammer2_cluster_wait(hammer2_cluster_t *cluster)
{
	tsleep(cluster->focus, 0, "h2clcw", 1);
}
| 292 | |
| 293 | /* |
| 294 | * Lock and ref a cluster. This adds a ref to the cluster and its chains |
| 295 | * and then locks them. |
| 296 | */ |
| 297 | int |
| 298 | hammer2_cluster_lock(hammer2_cluster_t *cluster, int how) |
| 299 | { |
| 300 | hammer2_chain_t *chain; |
| 301 | int i; |
| 302 | int error; |
| 303 | |
| 304 | error = 0; |
| 305 | atomic_add_int(&cluster->refs, 1); |
| 306 | for (i = 0; i < cluster->nchains; ++i) { |
| 307 | chain = cluster->array[i]; |
| 308 | if (chain) { |
| 309 | error = hammer2_chain_lock(chain, how); |
| 310 | if (error) { |
| 311 | while (--i >= 0) |
| 312 | hammer2_chain_unlock(cluster->array[i]); |
| 313 | atomic_add_int(&cluster->refs, -1); |
| 314 | break; |
| 315 | } |
| 316 | } |
| 317 | } |
| 318 | return error; |
| 319 | } |
| 320 | |
/*
 * Replace the contents of dst with src, adding a reference to src's chains.
 * dst is assumed to already have a ref and any chains present in dst are
 * assumed to be locked and will be unlocked.
 *
 * If the chains in src are locked, only one of (src) or (dst) should be
 * considered locked by the caller after return, not both.
 */
void
hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(dst->refs == 1);
	dst->focus = NULL;

	/*
	 * Transfer src's chains into dst slot by slot, unlocking whatever
	 * dst previously held in each slot.  The first chain transferred
	 * becomes dst's focus.
	 *
	 * NOTE(review): slots where src has no chain leave dst's old chain
	 *		 in place (and still locked) — confirm intended.
	 */
	for (i = 0; i < src->nchains; ++i) {
		chain = src->array[i];
		if (chain) {
			hammer2_chain_ref(chain);
			if (i < dst->nchains && dst->array[i])
				hammer2_chain_unlock(dst->array[i]);
			dst->array[i] = chain;
			if (dst->focus == NULL)
				dst->focus = chain;
		}
	}
	/*
	 * If dst was wider than src, unlock and clear the leftover chains.
	 */
	while (i < dst->nchains) {
		chain = dst->array[i];
		if (chain) {
			hammer2_chain_unlock(chain);
			dst->array[i] = NULL;
		}
		++i;
	}
	dst->nchains = src->nchains;
}
| 359 | |
/*
 * Replace the contents of the locked destination with the contents of the
 * locked source.  Destination must have one ref.
 *
 * Returns with the destination still with one ref and the copied chains
 * with an additional lock (representing their state on the destination).
 * The original chains associated with the destination are unlocked.
 */
void
hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(dst->refs == 1);

	dst->focus = NULL;
	/*
	 * Copy src's chains into dst slot by slot.  Each copied chain gains
	 * an extra lock representing its state on dst; whatever dst held in
	 * that slot is unlocked.  The first chain copied becomes the focus.
	 */
	for (i = 0; i < src->nchains; ++i) {
		chain = src->array[i];
		if (chain) {
			hammer2_chain_lock(chain, 0);
			if (i < dst->nchains && dst->array[i])
				hammer2_chain_unlock(dst->array[i]);
			dst->array[i] = src->array[i];
			if (dst->focus == NULL)
				dst->focus = chain;
		}
	}
	/*
	 * If dst was wider than src, unlock and clear the leftover chains.
	 */
	while (i < dst->nchains) {
		chain = dst->array[i];
		if (chain) {
			hammer2_chain_unlock(chain);
			dst->array[i] = NULL;
		}
		++i;
	}
	dst->nchains = src->nchains;
}
| 398 | |
| 399 | /* |
| 400 |  * Copy a cluster, returning a ref'd cluster.  All underlying chains |
| 401 | * are also ref'd, but not locked. |
| 402 | * |
| 403 | * If HAMMER2_CLUSTER_COPY_CHAINS is specified, the chains are copied |
| 404 | * to the new cluster and a reference is nominally added to them and to |
| 405 | * the cluster. The cluster will have 1 ref. |
| 406 | * |
| 407 | * If HAMMER2_CLUSTER_COPY_NOREF is specified along with CHAINS, the chains |
| 408 | * are copied but no additional references are made and the cluster will have |
| 409 | * 0 refs. Callers must ref the cluster and the chains before dropping it |
| 410 | * (typically by locking it). |
| 411 | * |
| 412 | * If flags are passed as 0 the copy is setup as if it contained the chains |
| 413 | * but the chains will not be copied over, and the cluster will have 0 refs. |
| 414 | * Callers must ref the cluster before dropping it (typically by locking it). |
| 415 | */ |
| 416 | hammer2_cluster_t * |
| 417 | hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags) |
| 418 | { |
| 419 | hammer2_pfsmount_t *pmp = ocluster->pmp; |
| 420 | hammer2_cluster_t *ncluster; |
| 421 | hammer2_chain_t *chain; |
| 422 | int i; |
| 423 | |
| 424 | ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO); |
| 425 | ncluster->pmp = pmp; |
| 426 | ncluster->nchains = ocluster->nchains; |
| 427 | ncluster->refs = (copy_flags & HAMMER2_CLUSTER_COPY_NOREF) ? 0 : 1; |
| 428 | if ((copy_flags & HAMMER2_CLUSTER_COPY_NOCHAINS) == 0) { |
| 429 | ncluster->focus = ocluster->focus; |
| 430 | for (i = 0; i < ocluster->nchains; ++i) { |
| 431 | chain = ocluster->array[i]; |
| 432 | ncluster->array[i] = chain; |
| 433 | if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0 && |
| 434 | chain) { |
| 435 | hammer2_chain_ref(chain); |
| 436 | } |
| 437 | } |
| 438 | } |
| 439 | return (ncluster); |
| 440 | } |
| 441 | |
/*
 * Unlock and deref a cluster.  The cluster is destroyed if this is the
 * last ref.
 */
void
hammer2_cluster_unlock(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain) {
			hammer2_chain_unlock(chain);
			/*
			 * On the last cluster ref, NULL-out the entry so a
			 * stale chain pointer cannot be reused after the
			 * structure is freed below.
			 */
			if (cluster->refs == 1)
				cluster->array[i] = NULL;	/* safety */
		}
	}
	/* drop our ref; free the cluster on the 1->0 transition */
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;
		kfree(cluster, M_HAMMER2);
		/* cluster = NULL; safety */
	}
}
| 467 | |
| 468 | /* |
| 469 | * Resize the cluster's physical storage allocation in-place. This may |
| 470 | * replace the cluster's chains. |
| 471 | */ |
| 472 | void |
| 473 | hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip, |
| 474 | hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, |
| 475 | int nradix, int flags) |
| 476 | { |
| 477 | int i; |
| 478 | |
| 479 | KKASSERT(cparent->pmp == cluster->pmp); /* can be NULL */ |
| 480 | KKASSERT(cparent->nchains == cluster->nchains); |
| 481 | |
| 482 | cluster->focus = NULL; |
| 483 | for (i = 0; i < cluster->nchains; ++i) { |
| 484 | if (cluster->array[i]) { |
| 485 | KKASSERT(cparent->array[i]); |
| 486 | hammer2_chain_resize(trans, ip, |
| 487 | cparent->array[i], |
| 488 | cluster->array[i], |
| 489 | nradix, flags); |
| 490 | if (cluster->focus == NULL) |
| 491 | cluster->focus = cluster->array[i]; |
| 492 | } |
| 493 | } |
| 494 | } |
| 495 | |
| 496 | /* |
| 497 | * Set an inode's cluster modified, marking the related chains RW and |
| 498 | * duplicating them if necessary. |
| 499 | * |
| 500 | * The passed-in chain is a localized copy of the chain previously acquired |
| 501 | * when the inode was locked (and possilby replaced in the mean time), and |
| 502 | * must also be updated. In fact, we update it first and then synchronize |
| 503 | * the inode's cluster cache. |
| 504 | */ |
| 505 | hammer2_inode_data_t * |
| 506 | hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip, |
| 507 | hammer2_cluster_t *cluster, int flags) |
| 508 | { |
| 509 | atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED); |
| 510 | hammer2_cluster_modify(trans, cluster, flags); |
| 511 | |
| 512 | hammer2_inode_repoint(ip, NULL, cluster); |
| 513 | if (ip->vp) |
| 514 | vsetisdirty(ip->vp); |
| 515 | return (&hammer2_cluster_wdata(cluster)->ipdata); |
| 516 | } |
| 517 | |
| 518 | /* |
| 519 | * Adjust the cluster's chains to allow modification and adjust the |
| 520 | * focus. Data will be accessible on return. |
| 521 | */ |
| 522 | void |
| 523 | hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster, |
| 524 | int flags) |
| 525 | { |
| 526 | int i; |
| 527 | |
| 528 | cluster->focus = NULL; |
| 529 | for (i = 0; i < cluster->nchains; ++i) { |
| 530 | if (cluster->array[i]) { |
| 531 | hammer2_chain_modify(trans, cluster->array[i], flags); |
| 532 | if (cluster->focus == NULL) |
| 533 | cluster->focus = cluster->array[i]; |
| 534 | } |
| 535 | } |
| 536 | } |
| 537 | |
/*
 * Synchronize modifications from the focus to other chains in a cluster.
 * Convenient because nominal API users can just modify the contents of the
 * focus (at least for non-blockref data).
 *
 * Nominal front-end operations only edit non-block-table data in a single
 * chain.  This code copies such modifications to the other chains in the
 * cluster.  Blocktable modifications are handled on a chain-by-chain basis
 * by both the frontend and the backend and will explode in fireworks if
 * blindly copied.
 */
void
hammer2_cluster_modsync(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *focus;
	hammer2_chain_t *scan;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	int i;

	focus = cluster->focus;
	KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);

	for (i = 0; i < cluster->nchains; ++i) {
		scan = cluster->array[i];
		if (scan == NULL || scan == focus)
			continue;
		/* targets must be modified and identically shaped to focus */
		KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
		KKASSERT(focus->bytes == scan->bytes &&
			 focus->bref.type == scan->bref.type);
		switch(focus->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			ripdata = &focus->data->ipdata;
			wipdata = &scan->data->ipdata;
			if ((ripdata->op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				/*
				 * Copy only the portion of the inode data
				 * before the u union; u is not blindly
				 * copyable here.
				 */
				bcopy(ripdata, wipdata,
				      offsetof(hammer2_inode_data_t, u));
				break;
			}
			/* fall through - DIRECTDATA inode copies all bytes */
		case HAMMER2_BREF_TYPE_DATA:
			bcopy(focus->data, scan->data, focus->bytes);
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		case HAMMER2_BREF_TYPE_FREEMAP:
		case HAMMER2_BREF_TYPE_VOLUME:
			panic("hammer2_cluster_modsync: illegal node type");
			/* NOT REACHED */
			break;
		default:
			panic("hammer2_cluster_modsync: unknown node type");
			break;
		}
	}
}
| 595 | |
| 596 | /* |
| 597 | * Lookup initialization/completion API |
| 598 | */ |
| 599 | hammer2_cluster_t * |
| 600 | hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags) |
| 601 | { |
| 602 | hammer2_cluster_t *cluster; |
| 603 | int i; |
| 604 | |
| 605 | cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); |
| 606 | cluster->pmp = cparent->pmp; /* can be NULL */ |
| 607 | /* cluster->focus = NULL; already null */ |
| 608 | |
| 609 | for (i = 0; i < cparent->nchains; ++i) { |
| 610 | cluster->array[i] = cparent->array[i]; |
| 611 | if (cluster->focus == NULL) |
| 612 | cluster->focus = cluster->array[i]; |
| 613 | } |
| 614 | cluster->nchains = cparent->nchains; |
| 615 | |
| 616 | /* |
| 617 | * Independently lock (this will also give cluster 1 ref) |
| 618 | */ |
| 619 | if (flags & HAMMER2_LOOKUP_SHARED) { |
| 620 | hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS | |
| 621 | HAMMER2_RESOLVE_SHARED); |
| 622 | } else { |
| 623 | hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS); |
| 624 | } |
| 625 | return (cluster); |
| 626 | } |
| 627 | |
| 628 | void |
| 629 | hammer2_cluster_lookup_done(hammer2_cluster_t *cparent) |
| 630 | { |
| 631 | if (cparent) |
| 632 | hammer2_cluster_unlock(cparent); |
| 633 | } |
| 634 | |
/*
 * Locate first match or overlap under parent, return a new cluster.
 * Returns NULL (after dropping the new cluster) when no element matched
 * in any of the parent's chains.
 */
hammer2_cluster_t *
hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
		       hammer2_key_t key_beg, hammer2_key_t key_end,
		       int flags, int *ddflagp)
{
	hammer2_pfsmount_t *pmp;
	hammer2_cluster_t *cluster;
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	hammer2_key_t bref_key;
	int bref_keybits;
	int null_count;
	int ddflag;
	int i;
	uint8_t bref_type;
	u_int bytes;

	pmp = cparent->pmp;				/* can be NULL */
	key_accum = *key_nextp;
	null_count = 0;
	bref_type = 0;
	bref_key = 0;
	bref_keybits = 0;
	bytes = 0;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->pmp = pmp;				/* can be NULL */
	cluster->refs = 1;
	/* cluster->focus = NULL; already null */
	cparent->focus = NULL;
	*ddflagp = 0;

	/*
	 * Run the lookup independently against each of the parent's chains.
	 * The first non-NULL result becomes the focus; every subsequent
	 * result is asserted to agree with the focus on type, key, keybits,
	 * size, and ddflag.
	 */
	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		if (cparent->array[i] == NULL) {
			++null_count;
			continue;
		}
		chain = hammer2_chain_lookup(&cparent->array[i], &key_next,
					     key_beg, key_end,
					     &cparent->cache_index[i],
					     flags, &ddflag);
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i];
		cluster->array[i] = chain;
		if (chain == NULL) {
			++null_count;
		} else {
			if (cluster->focus == NULL) {
				/* first hit defines the expected identity */
				bref_type = chain->bref.type;
				bref_key = chain->bref.key;
				bref_keybits = chain->bref.keybits;
				bytes = chain->bytes;
				*ddflagp = ddflag;
				cluster->focus = chain;
			}
			KKASSERT(bref_type == chain->bref.type);
			KKASSERT(bref_key == chain->bref.key);
			KKASSERT(bref_keybits == chain->bref.keybits);
			KKASSERT(bytes == chain->bytes);
			KKASSERT(*ddflagp == ddflag);
		}
		/* accumulate the lowest next-key across all chains */
		if (key_accum > key_next)
			key_accum = key_next;
	}
	*key_nextp = key_accum;
	cluster->nchains = i;

	/*
	 * Every chain came up empty: no match, drop the new cluster.
	 */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}

	return (cluster);
}
| 714 | |
/*
 * Locate next match or overlap under parent, replace cluster.  The
 * chains in the passed-in cluster are consumed by the per-chain
 * iteration.  Returns NULL (after dropping the cluster) when the
 * iteration is exhausted on every element.
 */
hammer2_cluster_t *
hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	int null_count;
	int i;

	key_accum = *key_nextp;
	null_count = 0;
	cluster->focus = NULL;
	cparent->focus = NULL;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		chain = cluster->array[i];
		if (chain == NULL) {
			/* this element was already exhausted */
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i];
			++null_count;
			continue;
		}
		if (cparent->array[i] == NULL) {
			/*
			 * Parent element missing; dispose of the chain
			 * according to the lock mode used by the lookup.
			 *
			 * NOTE(review): cluster->array[i] retains the
			 *		 disposed chain pointer here —
			 *		 confirm intended.
			 */
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}
		chain = hammer2_chain_next(&cparent->array[i], chain,
					   &key_next, key_beg, key_end,
					   &cparent->cache_index[i], flags);
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i];
		cluster->array[i] = chain;
		if (chain == NULL) {
			++null_count;
		} else if (cluster->focus == NULL) {
			cluster->focus = chain;
		}
		/* accumulate the lowest next-key across all chains */
		if (key_accum > key_next)
			key_accum = key_next;
	}

	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}
| 772 | |
| 773 | #if 0 |
/*
 * XXX initial NULL cluster needs reworking (pass **clusterp ?)
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_chain = (parent, NULL) and
 * next_chain = (parent, chain).
 *
 * The passed-in parent must be locked and its data resolved.  The returned
 * chain will be locked.  Pass chain == NULL to acquire the first sub-chain
 * under parent and then iterate with the passed-in chain (which this
 * function will unlock).
 *
 * (This function is currently compiled out via #if 0.)
 */
hammer2_cluster_t *
hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     int flags)
{
	hammer2_chain_t *chain;
	int null_count;
	int i;

	null_count = 0;

	for (i = 0; i < cparent->nchains; ++i) {
		chain = cluster->array[i];
		if (chain == NULL) {
			++null_count;
			continue;
		}
		if (cparent->array[i] == NULL) {
			/* dispose per the lookup's lock mode */
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}

		chain = hammer2_chain_scan(cparent->array[i], chain,
					   &cparent->cache_index[i], flags);
		cluster->array[i] = chain;
		if (chain == NULL)
			++null_count;
	}

	/* exhausted on all elements: drop the cluster and return NULL */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}
| 824 | |
| 825 | #endif |
| 826 | |
| 827 | /* |
| 828 | * Create a new cluster using the specified key |
| 829 | */ |
| 830 | int |
| 831 | hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent, |
| 832 | hammer2_cluster_t **clusterp, |
| 833 | hammer2_key_t key, int keybits, |
| 834 | int type, size_t bytes, int flags) |
| 835 | { |
| 836 | hammer2_cluster_t *cluster; |
| 837 | hammer2_pfsmount_t *pmp; |
| 838 | int error; |
| 839 | int i; |
| 840 | |
| 841 | pmp = trans->pmp; /* can be NULL */ |
| 842 | |
| 843 | if ((cluster = *clusterp) == NULL) { |
| 844 | cluster = kmalloc(sizeof(*cluster), M_HAMMER2, |
| 845 | M_WAITOK | M_ZERO); |
| 846 | cluster->pmp = pmp; /* can be NULL */ |
| 847 | cluster->refs = 1; |
| 848 | } |
| 849 | cluster->focus = NULL; |
| 850 | cparent->focus = NULL; |
| 851 | |
| 852 | /* |
| 853 | * NOTE: cluster->array[] entries can initially be NULL. If |
| 854 | * *clusterp is supplied, skip NULL entries, otherwise |
| 855 | * create new chains. |
| 856 | */ |
| 857 | for (i = 0; i < cparent->nchains; ++i) { |
| 858 | if (*clusterp && cluster->array[i] == NULL) { |
| 859 | if (cparent->focus == NULL) |
| 860 | cparent->focus = cparent->array[i]; |
| 861 | continue; |
| 862 | } |
| 863 | error = hammer2_chain_create(trans, &cparent->array[i], |
| 864 | &cluster->array[i], pmp, |
| 865 | key, keybits, |
| 866 | type, bytes, flags); |
| 867 | KKASSERT(error == 0); |
| 868 | if (cparent->focus == NULL) |
| 869 | cparent->focus = cparent->array[i]; |
| 870 | if (cluster->focus == NULL) |
| 871 | cluster->focus = cluster->array[i]; |
| 872 | } |
| 873 | cluster->nchains = i; |
| 874 | *clusterp = cluster; |
| 875 | |
| 876 | return error; |
| 877 | } |
| 878 | |
| 879 | /* |
| 880 | * Rename a cluster to a new parent. |
| 881 | * |
| 882 | * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields |
| 883 | * are used from a passed-in non-NULL bref pointer. All other fields |
| 884 | * are extracted from the original chain for each chain in the |
| 885 | * iteration. |
| 886 | */ |
| 887 | void |
| 888 | hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref, |
| 889 | hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, |
| 890 | int flags) |
| 891 | { |
| 892 | hammer2_chain_t *chain; |
| 893 | hammer2_blockref_t xbref; |
| 894 | int i; |
| 895 | |
| 896 | cluster->focus = NULL; |
| 897 | cparent->focus = NULL; |
| 898 | |
| 899 | for (i = 0; i < cluster->nchains; ++i) { |
| 900 | chain = cluster->array[i]; |
| 901 | if (chain) { |
| 902 | if (bref) { |
| 903 | xbref = chain->bref; |
| 904 | xbref.key = bref->key; |
| 905 | xbref.keybits = bref->keybits; |
| 906 | hammer2_chain_rename(trans, &xbref, |
| 907 | &cparent->array[i], |
| 908 | chain, flags); |
| 909 | } else { |
| 910 | hammer2_chain_rename(trans, NULL, |
| 911 | &cparent->array[i], |
| 912 | chain, flags); |
| 913 | } |
| 914 | cluster->array[i] = chain; |
| 915 | if (cluster->focus == NULL) |
| 916 | cluster->focus = chain; |
| 917 | if (cparent->focus == NULL) |
| 918 | cparent->focus = cparent->array[i]; |
| 919 | } else { |
| 920 | if (cparent->focus == NULL) |
| 921 | cparent->focus = cparent->array[i]; |
| 922 | } |
| 923 | } |
| 924 | } |
| 925 | |
| 926 | /* |
| 927 | * Mark a cluster deleted |
| 928 | */ |
| 929 | void |
| 930 | hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent, |
| 931 | hammer2_cluster_t *cluster, int flags) |
| 932 | { |
| 933 | hammer2_chain_t *chain; |
| 934 | hammer2_chain_t *parent; |
| 935 | int i; |
| 936 | |
| 937 | if (cparent == NULL) { |
| 938 | kprintf("cparent is NULL\n"); |
| 939 | return; |
| 940 | } |
| 941 | |
| 942 | for (i = 0; i < cluster->nchains; ++i) { |
| 943 | parent = (i < cparent->nchains) ? cparent->array[i] : NULL; |
| 944 | chain = cluster->array[i]; |
| 945 | if (chain == NULL) |
| 946 | continue; |
| 947 | if (chain->parent != parent) { |
| 948 | kprintf("hammer2_cluster_delete: parent " |
| 949 | "mismatch chain=%p parent=%p against=%p\n", |
| 950 | chain, chain->parent, parent); |
| 951 | } else { |
| 952 | hammer2_chain_delete(trans, parent, chain, flags); |
| 953 | } |
| 954 | } |
| 955 | } |
| 956 | |
| 957 | /* |
| 958 | * Create a snapshot of the specified {parent, ochain} with the specified |
| 959 | * label. The originating hammer2_inode must be exclusively locked for |
| 960 | * safety. |
| 961 | * |
| 962 | * The ioctl code has already synced the filesystem. |
| 963 | */ |
| 964 | int |
| 965 | hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster, |
| 966 | hammer2_ioc_pfs_t *pfs) |
| 967 | { |
| 968 | hammer2_mount_t *hmp; |
| 969 | hammer2_cluster_t *ncluster; |
| 970 | const hammer2_inode_data_t *ripdata; |
| 971 | hammer2_inode_data_t *wipdata; |
| 972 | hammer2_inode_t *nip; |
| 973 | size_t name_len; |
| 974 | hammer2_key_t lhc; |
| 975 | struct vattr vat; |
| 976 | uuid_t opfs_clid; |
| 977 | int error; |
| 978 | int i; |
| 979 | |
| 980 | kprintf("snapshot %s\n", pfs->name); |
| 981 | |
| 982 | name_len = strlen(pfs->name); |
| 983 | lhc = hammer2_dirhash(pfs->name, name_len); |
| 984 | |
| 985 | /* |
| 986 | * Get the clid |
| 987 | */ |
| 988 | ripdata = &hammer2_cluster_rdata(ocluster)->ipdata; |
| 989 | opfs_clid = ripdata->pfs_clid; |
| 990 | hmp = ocluster->focus->hmp; |
| 991 | |
| 992 | /* |
| 993 | * Create the snapshot directory under the super-root |
| 994 | * |
| 995 | * Set PFS type, generate a unique filesystem id, and generate |
| 996 | * a cluster id. Use the same clid when snapshotting a PFS root, |
| 997 | * which theoretically allows the snapshot to be used as part of |
| 998 | * the same cluster (perhaps as a cache). |
| 999 | * |
| 1000 | * Copy the (flushed) blockref array. Theoretically we could use |
| 1001 | * chain_duplicate() but it becomes difficult to disentangle |
| 1002 | * the shared core so for now just brute-force it. |
| 1003 | */ |
| 1004 | VATTR_NULL(&vat); |
| 1005 | vat.va_type = VDIR; |
| 1006 | vat.va_mode = 0755; |
| 1007 | ncluster = NULL; |
| 1008 | nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat, |
| 1009 | proc0.p_ucred, pfs->name, name_len, |
| 1010 | &ncluster, &error); |
| 1011 | |
| 1012 | if (nip) { |
| 1013 | wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0); |
| 1014 | wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT; |
| 1015 | kern_uuidgen(&wipdata->pfs_fsid, 1); |
| 1016 | if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY) |
| 1017 | wipdata->pfs_clid = opfs_clid; |
| 1018 | else |
| 1019 | kern_uuidgen(&wipdata->pfs_clid, 1); |
| 1020 | |
| 1021 | for (i = 0; i < ncluster->nchains; ++i) { |
| 1022 | if (ncluster->array[i]) { |
| 1023 | ncluster->array[i]->bref.flags |= |
| 1024 | HAMMER2_BREF_FLAG_PFSROOT; |
| 1025 | } |
| 1026 | } |
| 1027 | #if 0 |
| 1028 | /* XXX can't set this unless we do an explicit flush, which |
| 1029 | we also need a pmp assigned to do, else the flush code |
| 1030 | won't flush ncluster because it thinks it is crossing a |
| 1031 | flush boundary */ |
| 1032 | hammer2_cluster_set_chainflags(ncluster, |
| 1033 | HAMMER2_CHAIN_PFSBOUNDARY); |
| 1034 | #endif |
| 1035 | |
| 1036 | /* XXX hack blockset copy */ |
| 1037 | /* XXX doesn't work with real cluster */ |
| 1038 | KKASSERT(ocluster->nchains == 1); |
| 1039 | wipdata->u.blockset = ripdata->u.blockset; |
| 1040 | hammer2_cluster_modsync(ncluster); |
| 1041 | for (i = 0; i < ncluster->nchains; ++i) { |
| 1042 | if (ncluster->array[i]) |
| 1043 | hammer2_flush(trans, ncluster->array[i]); |
| 1044 | } |
| 1045 | hammer2_inode_unlock_ex(nip, ncluster); |
| 1046 | } |
| 1047 | return (error); |
| 1048 | } |
| 1049 | |
| 1050 | /* |
| 1051 | * Return locked parent cluster given a locked child. The child remains |
| 1052 | * locked on return. |
| 1053 | */ |
| 1054 | hammer2_cluster_t * |
| 1055 | hammer2_cluster_parent(hammer2_cluster_t *cluster) |
| 1056 | { |
| 1057 | hammer2_cluster_t *cparent; |
| 1058 | int i; |
| 1059 | |
| 1060 | cparent = hammer2_cluster_copy(cluster, HAMMER2_CLUSTER_COPY_NOCHAINS); |
| 1061 | for (i = 0; i < cluster->nchains; ++i) { |
| 1062 | hammer2_chain_t *chain; |
| 1063 | hammer2_chain_t *rchain; |
| 1064 | |
| 1065 | chain = cluster->array[i]; |
| 1066 | if (chain == NULL) |
| 1067 | continue; |
| 1068 | hammer2_chain_ref(chain); |
| 1069 | while ((rchain = chain->parent) != NULL) { |
| 1070 | hammer2_chain_ref(rchain); |
| 1071 | hammer2_chain_unlock(chain); |
| 1072 | hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS); |
| 1073 | hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS); |
| 1074 | hammer2_chain_drop(rchain); |
| 1075 | if (chain->parent == rchain) |
| 1076 | break; |
| 1077 | hammer2_chain_unlock(rchain); |
| 1078 | } |
| 1079 | hammer2_chain_drop(chain); |
| 1080 | cparent->array[i] = rchain; |
| 1081 | } |
| 1082 | return cparent; |
| 1083 | } |
| 1084 | |
| 1085 | /************************************************************************ |
| 1086 | * CLUSTER I/O * |
| 1087 | ************************************************************************ |
| 1088 | * |
| 1089 | * |
| 1090 | * WARNING! blockref[] array data is not universal. These functions should |
| 1091 | * only be used to access universal data. |
| 1092 | * |
| 1093 | * NOTE! The rdata call will wait for at least one of the chain I/Os to |
| 1094 | * complete if necessary. The I/O's should have already been |
| 1095 | * initiated by the cluster_lock/chain_lock operation. |
| 1096 | * |
| 1097 | * The cluster must already be in a modified state before wdata |
| 1098 | * is called. The data will already be available for this case. |
| 1099 | */ |
| 1100 | const hammer2_media_data_t * |
| 1101 | hammer2_cluster_rdata(hammer2_cluster_t *cluster) |
| 1102 | { |
| 1103 | return(cluster->focus->data); |
| 1104 | } |
| 1105 | |
| 1106 | hammer2_media_data_t * |
| 1107 | hammer2_cluster_wdata(hammer2_cluster_t *cluster) |
| 1108 | { |
| 1109 | KKASSERT(hammer2_cluster_modified(cluster)); |
| 1110 | return(cluster->focus->data); |
| 1111 | } |
| 1112 | |
| 1113 | /* |
| 1114 | * Load async into independent buffer - used to load logical buffers from |
| 1115 | * underlying device data. The callback is made for the first validated |
| 1116 | * data found, or NULL if no valid data is available. |
| 1117 | * |
| 1118 | * NOTE! The cluster structure is either unique or serialized (e.g. embedded |
| 1119 | * in the inode with an exclusive lock held), the chain structure may be |
| 1120 | * shared. |
| 1121 | */ |
| 1122 | void |
| 1123 | hammer2_cluster_load_async(hammer2_cluster_t *cluster, |
| 1124 | void (*callback)(hammer2_iocb_t *iocb), void *ptr) |
| 1125 | { |
| 1126 | hammer2_chain_t *chain; |
| 1127 | hammer2_iocb_t *iocb; |
| 1128 | hammer2_mount_t *hmp; |
| 1129 | hammer2_blockref_t *bref; |
| 1130 | int i; |
| 1131 | |
| 1132 | /* |
| 1133 | * Try to find a chain whos data is already resolved. If none can |
| 1134 | * be found, start with the first chain. |
| 1135 | */ |
| 1136 | chain = NULL; |
| 1137 | for (i = 0; i < cluster->nchains; ++i) { |
| 1138 | chain = cluster->array[i]; |
| 1139 | if (chain && chain->data) |
| 1140 | break; |
| 1141 | } |
| 1142 | if (i == cluster->nchains) { |
| 1143 | chain = cluster->array[0]; |
| 1144 | i = 0; |
| 1145 | } |
| 1146 | |
| 1147 | iocb = &cluster->iocb; |
| 1148 | iocb->callback = callback; |
| 1149 | iocb->dio = NULL; /* for already-validated case */ |
| 1150 | iocb->cluster = cluster; |
| 1151 | iocb->chain = chain; |
| 1152 | iocb->ptr = ptr; |
| 1153 | iocb->lbase = (off_t)i; |
| 1154 | iocb->flags = 0; |
| 1155 | iocb->error = 0; |
| 1156 | |
| 1157 | /* |
| 1158 | * Data already validated |
| 1159 | */ |
| 1160 | if (chain->data) { |
| 1161 | callback(iocb); |
| 1162 | return; |
| 1163 | } |
| 1164 | |
| 1165 | /* |
| 1166 | * We must resolve to a device buffer, either by issuing I/O or |
| 1167 | * by creating a zero-fill element. We do not mark the buffer |
| 1168 | * dirty when creating a zero-fill element (the hammer2_chain_modify() |
| 1169 | * API must still be used to do that). |
| 1170 | * |
| 1171 | * The device buffer is variable-sized in powers of 2 down |
| 1172 | * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage |
| 1173 | * chunk always contains buffers of the same size. (XXX) |
| 1174 | * |
| 1175 | * The minimum physical IO size may be larger than the variable |
| 1176 | * block size. |
| 1177 | */ |
| 1178 | bref = &chain->bref; |
| 1179 | hmp = chain->hmp; |
| 1180 | |
| 1181 | #if 0 |
| 1182 | /* handled by callback? <- TODO XXX even needed for loads? */ |
| 1183 | /* |
| 1184 | * The getblk() optimization for a 100% overwrite can only be used |
| 1185 | * if the physical block size matches the request. |
| 1186 | */ |
| 1187 | if ((chain->flags & HAMMER2_CHAIN_INITIAL) && |
| 1188 | chain->bytes == hammer2_devblksize(chain->bytes)) { |
| 1189 | error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio); |
| 1190 | KKASSERT(error == 0); |
| 1191 | iocb->dio = dio; |
| 1192 | callback(iocb); |
| 1193 | return; |
| 1194 | } |
| 1195 | #endif |
| 1196 | |
| 1197 | /* |
| 1198 | * Otherwise issue a read |
| 1199 | */ |
| 1200 | hammer2_adjreadcounter(&chain->bref, chain->bytes); |
| 1201 | hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb); |
| 1202 | } |
| 1203 | |
| 1204 | /************************************************************************ |
| 1205 | * NODE FAILURES * |
| 1206 | ************************************************************************ |
| 1207 | * |
| 1208 | * A node failure can occur for numerous reasons. |
| 1209 | * |
| 1210 | * - A read I/O may fail |
| 1211 | * - A write I/O may fail |
| 1212 | * - An unexpected chain might be found (or be missing) |
| 1213 | * - A node might disconnect temporarily and reconnect later |
| 1214 | * (for example, a USB stick could get pulled, or a node might |
| 1215 | * be programmatically disconnected). |
| 1216 | * - A node might run out of space during a modifying operation. |
| 1217 | * |
| 1218 | * When a read failure or an unexpected chain state is found, the chain and |
| 1219 | * parent chain at the failure point for the nodes involved (the nodes |
| 1220 | * which we determine to be in error) are flagged as failed and removed |
| 1221 | * from the cluster. The node itself is allowed to remain active. The |
| 1222 | * highest common point (usually a parent chain) is queued to the |
| 1223 | * resynchronization thread for action. |
| 1224 | * |
| 1225 | * When a write I/O fails or a node runs out of space, we first adjust |
| 1226 | * as if a read failure occurs but we further disable flushes on the |
| 1227 | * ENTIRE node. Concurrent modifying transactions are allowed to complete |
| 1228 | * but any new modifying transactions will automatically remove the node |
| 1229 | * from consideration in all related cluster structures and not generate |
| 1230 | * any new modified chains. The ROOT chain for the failed node(s) is queued |
| 1231 | * to the resynchronization thread for action. |
| 1232 | * |
| 1233 | * A temporary disconnect is handled as if a write failure occurred. |
| 1234 | * |
| 1235 | * Any of these failures might or might not stall related high level VNOPS, |
| 1236 | * depending on what has failed, what nodes remain, the type of cluster, |
| 1237 | * and the operating state of the cluster. |
| 1238 | * |
| 1239 | * FLUSH ON WRITE-DISABLED NODES |
| 1240 | * |
| 1241 | * A flush on a write-disabled node is not allowed to write anything because |
| 1242 | * we cannot safely update the mirror_tid anywhere on the failed node. The |
| 1243 | * synchronization thread uses mirror_tid to calculate incremental resyncs. |
| 1244 | * Dirty meta-data related to the failed node is thrown away. |
| 1245 | * |
| 1246 | * Dirty buffer cache buffers and inodes are only thrown away if they can be |
| 1247 | * retired... that is, if the filesystem still has enough nodes to complete |
| 1248 | * the operation. |
| 1249 | */ |
| 1250 | |
| 1251 | /************************************************************************ |
| 1252 | * SYNCHRONIZATION THREAD * |
| 1253 | ************************************************************************ |
| 1254 | * |
| 1255 | * This thread is responsible for [re]synchronizing the cluster representing |
| 1256 | * a PFS. Any out-of-sync or failed node starts this thread on a |
| 1257 | * node-by-node basis when the failure is detected. |
| 1258 | * |
| 1259 | * Clusters needing resynchronization are queued at the highest point |
| 1260 | * where the parent on the failed node is still valid, or a special |
| 1261 | * incremental scan from the ROOT is queued if no parent exists. This |
| 1262 | * thread is also responsible for waiting for reconnections of the failed |
| 1263 | * node if the cause was due to a disconnect, and waiting for space to be |
| 1264 | * freed up if the cause was due to running out of space. |
| 1265 | * |
| 1266 | * If the cause is due to a node running out of space, this thread will also |
| 1267 | * remove older (unlocked) snapshots to make new space, recover space, and |
| 1268 | * then start resynchronization. |
| 1269 | * |
| 1270 | * Each resynchronization pass virtually snapshots the PFS on the good nodes |
| 1271 | * and synchronizes using that snapshot against the target node. This |
 * ensures a consistent chain topology and also avoids interference between
| 1273 | * the resynchronization thread and frontend operations. |
| 1274 | * |
| 1275 | * Since these are per-node threads it is possible to resynchronize several |
| 1276 | * nodes at once. |
| 1277 | */ |