2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * The cluster module collects multiple chains representing the same
36 * information from different nodes into a single entity. It allows direct
37 * access to media data as long as it is not blockref array data (which
38 * will obviously have to be different at each node).
40 * This module also handles I/O dispatch, status rollup, and various
41 * mastership arrangements including quorum operations. It effectively
42 * presents one topology to the vnops layer.
44 * Many of the API calls mimic chain API calls but operate on clusters
45 * instead of chains. Please see hammer2_chain.c for more complete code
46 * documentation of the API functions.
48 * WARNING! This module is *extremely* complex. It must issue asynchronous
49 * locks and I/O, do quorum and/or master-slave processing, and
50 * it must operate properly even if some nodes are broken (which
51 * can also mean indefinite locks).
55 * Cluster operations can be broken down into three pieces:
57 * (1) Chain locking and data retrieval.
58 * hammer2_cluster_lock()
59 * hammer2_cluster_parent()
61 * - Most complex functions, quorum management on transaction ids.
63 * - Locking and data accesses must be internally asynchronous.
65 * - Validate and manage cache coherency primitives (cache state
66 * is stored in chain topologies but must be validated by these
69 * (2) Lookups and Scans
70 * hammer2_cluster_lookup()
71 * hammer2_cluster_next()
73 * - Depend on locking & data retrieval functions, but still complex.
75 * - Must do quorum management on transaction ids.
77 * - Lookup and Iteration ops must be internally asynchronous.
79 * (3) Modifying Operations
80 * hammer2_cluster_create()
81 * hammer2_cluster_rename()
82 * hammer2_cluster_delete()
83 * hammer2_cluster_modify()
84 * hammer2_cluster_modsync()
86 * - Can usually punt on failures, operation continues unless quorum
87 * is lost. If quorum is lost, must wait for resynchronization
88 * (depending on the management mode).
90 * - Must disconnect node on failures (also not flush), remount, and
93 * - Network links (via kdmsg) are relatively easy to issue as the
94 * complex underworkings of hammer2_chain.c don't have to be messed
95 * with (the protocol is at a higher level than block-level).
97 * - Multiple local disk nodes (i.e. block devices) are another matter.
98 * Chain operations have to be dispatched to per-node threads (xN)
99 * because we can't asynchronize potentially very complex chain
100 * operations in hammer2_chain.c (it would be a huge mess).
102 * (these threads are also used to terminate incoming kdmsg ops from
105 * - Single-node filesystems do not use threads and will simply call
106 * hammer2_chain.c functions directly. This short-cut is handled
107 * at the base of each cluster function.
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
119 * Returns non-zero if any chain in the cluster needs to be resized.
120 * Errored elements are not used in the calculation.
123 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
125 hammer2_chain_t *chain;
/* Caller must hold the cluster lock before asking about storage size. */
128 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
129 for (i = 0; i < cluster->nchains; ++i) {
/* Only focus-modifiable (FEMOD) elements participate in the check. */
130 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
132 chain = cluster->array[i].chain;
/* Any modifiable chain whose allocation differs needs a resize. */
137 if (chain->bytes != bytes)
144 * Returns the bref type of the cluster's focus.
146 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
147 * The cluster must be locked.
150 hammer2_cluster_type(hammer2_cluster_t *cluster)
152 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
/* Only a non-errored cluster has a trustworthy focus bref. */
153 if (cluster->error == 0)
154 return(cluster->focus->bref.type);
159 * Returns non-zero if the cluster's focus is flagged as being modified.
161 * If the cluster is errored, returns 0.
164 hammer2_cluster_modified(hammer2_cluster_t *cluster)
166 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
/* Report the focus chain's MODIFIED flag; errored clusters report 0. */
167 if (cluster->error == 0)
168 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
173 * Returns the bref of the cluster's focus, sans any data-offset information
174 * (since offset information is per-node and wouldn't be useful).
176 * Callers use this function to access modify_tid, mirror_tid, type,
179 * If the cluster is errored, returns an empty bref.
180 * The cluster must be locked.
183 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
185 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
/* Copy the focus bref out by value for the caller. */
186 if (cluster->error == 0) {
187 *bref = cluster->focus->bref;
/* Errored cluster: hand back an all-zero (empty) bref instead. */
190 bzero(bref, sizeof(*bref));
195 * Flag the cluster for flushing recursively up to the root. Despite the
196 * work it does, this is relatively benign. It just makes sure that the
197 * flusher has top-down visibility to this cluster.
199 * Errored chains are not flagged for flushing.
201 * The cluster should probably be locked.
204 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
206 hammer2_chain_t *chain;
209 for (i = 0; i < cluster->nchains; ++i) {
/* Non-FEMOD elements are skipped; they are not ours to flush. */
210 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
212 chain = cluster->array[i].chain;
/* Propagate the flush flag up the chain's topology. */
217 hammer2_chain_setflush(trans, chain);
222 * Set the check mode for the cluster.
223 * Errored elements of the cluster are ignored.
225 * The cluster must be locked and modified.
228 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
229 hammer2_cluster_t *cluster,
232 hammer2_chain_t *chain;
235 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
236 for (i = 0; i < cluster->nchains; ++i) {
/* Non-modifiable elements fall out of sync; mark them invalid. */
237 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
238 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
241 chain = cluster->array[i].chain;
/* Chain must already be in the modified state before changing methods. */
246 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
/* Clear the old check-algorithm bits, then encode the new one. */
247 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
248 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
253 * Create a degenerate cluster with one ref from a single locked chain.
254 * The returned cluster will be focused on the chain and inherit its
257 * The chain's lock and reference are transferred to the new cluster, so
258 * the caller should not try to unlock the chain separately.
263 hammer2_cluster_from_chain(hammer2_chain_t *chain)
265 hammer2_cluster_t *cluster;
267 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
/* Single-element cluster: slot 0 holds the chain and is modifiable. */
268 cluster->array[0].chain = chain;
269 cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
270 cluster->nchains = 1;
271 cluster->focus = chain;
272 cluster->focus_index = 0;
273 cluster->pmp = chain->pmp;
/* Cluster inherits the chain's error state. */
275 cluster->error = chain->error;
/* Degenerate case is trivially "synchronized" in every respect. */
276 cluster->flags = HAMMER2_CLUSTER_LOCKED |
277 HAMMER2_CLUSTER_WRHARD |
278 HAMMER2_CLUSTER_RDHARD |
279 HAMMER2_CLUSTER_MSYNCED |
280 HAMMER2_CLUSTER_SSYNCED;
286 * Add a reference to a cluster and its underlying chains.
288 * We must also ref the underlying chains in order to allow ref/unlock
289 * sequences to later re-lock.
292 hammer2_cluster_ref(hammer2_cluster_t *cluster)
/* Bump the cluster refcount atomically (no lock required). */
294 atomic_add_int(&cluster->refs, 1);
298 * Drop the caller's reference to the cluster. When the ref count drops to
299 * zero this function frees the cluster and drops all underlying chains.
301 * In-progress read I/Os are typically detached from the cluster once the
302 * first one returns (the remaining stay attached to the DIOs but are then
303 * ignored and drop naturally).
306 hammer2_cluster_drop(hammer2_cluster_t *cluster)
308 hammer2_chain_t *chain;
311 KKASSERT(cluster->refs > 0);
/* 1->0 transition: we own the final ref and must tear down. */
312 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
313 cluster->focus = NULL; /* safety XXX chg to assert */
314 cluster->focus_index = 0;
/* Drop every underlying chain ref held by the cluster. */
316 for (i = 0; i < cluster->nchains; ++i) {
317 chain = cluster->array[i].chain;
319 hammer2_chain_drop(chain);
320 cluster->array[i].chain = NULL; /* safety */
323 cluster->nchains = 0; /* safety */
325 kfree(cluster, M_HAMMER2);
326 /* cluster is invalid */
331 hammer2_cluster_wait(hammer2_cluster_t *cluster)
/* Short (1 tick) sleep keyed on the focus chain; crude polling wait. */
333 tsleep(cluster->focus, 0, "h2clcw", 1);
337 * Lock a cluster. Cluster must already be referenced. Focus is maintained.
339 * WARNING! This function expects the caller to handle resolution of the
340 * cluster. We never re-resolve the cluster in this function,
341 * because it might be used to temporarily unlock/relock a cparent
342 * in an iteration or recursion, and the cparent's elements do not
346 hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how)
348 hammer2_chain_t *chain;
351 /* cannot be on inode-embedded cluster template, must be on copy */
352 KKASSERT(cluster->refs > 0);
353 KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
/* Double-locking a cluster is a hard programming error. */
354 if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
355 panic("hammer2_cluster_lock: cluster %p already locked!\n",
358 atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
361 * Lock chains and resolve state.
/* NOTE(review): chains at index 'idx' are presumably skipped — confirm. */
363 for (i = 0; i < cluster->nchains; ++i) {
366 chain = cluster->array[i].chain;
369 hammer2_chain_lock(chain, how);
374 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
/* -1 means "no exception": lock every element of the cluster. */
376 hammer2_cluster_lock_except(cluster, -1, how);
380 * Calculate the clustering state for the cluster and set its focus.
381 * This routine must be called with care. For example, it should not
382 * normally be called after relocking a non-leaf cluster because parent
383 * clusters help iterations and each element might be at a slightly different
384 * indirect node (each node's topology is independently indexed).
386 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
387 * operations. Typically this is only set on a quorum of MASTERs or
388 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER
389 * is present, this bit is *not* set on a quorum of MASTERs. The
390 * synchronization code ignores this bit, but all hammer2_cluster_*() calls
391 * that create/modify/delete elements use it.
393 * The chains making up the cluster may be narrowed down based on quorum
394 * acceptability, and if RESOLVE_RDONLY is specified the chains can be
395 * narrowed down to a single chain as long as the entire subtopology is known
396 * to be intact. So, for example, we can narrow a read-only op to a single
397 * fast SLAVE but if we focus a CACHE chain we must still retain at least
398 * a SLAVE to ensure that the subtopology can be accessed.
400 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
401 * to be maintained once the topology is validated as-of the top level of
404 * If a failure occurs the operation must be aborted by higher-level code and
408 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
410 hammer2_chain_t *chain;
411 hammer2_chain_t *focus;
413 hammer2_tid_t quorum_tid;
414 hammer2_tid_t last_best_quorum_tid;
426 cluster->focus = NULL;
/* A quorum is a strict majority of configured masters. */
439 KKASSERT(pmp != NULL || cluster->nchains == 0);
440 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
/*
 * Pass 1: tally element types and accumulate soft/degenerate flags.
 */
446 * NOTE: A NULL chain is not necessarily an error, it could be
447 * e.g. a lookup failure or the end of an iteration.
450 for (i = 0; i < cluster->nchains; ++i) {
451 chain = cluster->array[i].chain;
452 if (chain && chain->error) {
453 if (cluster->focus == NULL || cluster->focus == chain) {
454 /* error will be overridden by valid focus */
455 cluster->error = chain->error;
459 * Must count total masters and slaves whether the
460 * chain is errored or not.
462 switch (cluster->pmp->pfs_types[i]) {
463 case HAMMER2_PFSTYPE_MASTER:
466 case HAMMER2_PFSTYPE_SLAVE:
472 switch (cluster->pmp->pfs_types[i]) {
473 case HAMMER2_PFSTYPE_MASTER:
476 case HAMMER2_PFSTYPE_SLAVE:
479 case HAMMER2_PFSTYPE_SOFT_MASTER:
480 nflags |= HAMMER2_CLUSTER_WRSOFT;
481 nflags |= HAMMER2_CLUSTER_RDSOFT;
484 case HAMMER2_PFSTYPE_SOFT_SLAVE:
485 nflags |= HAMMER2_CLUSTER_RDSOFT;
487 case HAMMER2_PFSTYPE_SUPROOT:
489 * Degenerate cluster representing the super-root
490 * topology on a single device. Fake stuff so
491 * cluster ops work as expected.
493 nflags |= HAMMER2_CLUSTER_WRHARD;
494 nflags |= HAMMER2_CLUSTER_RDHARD;
495 cluster->focus_index = i;
496 cluster->focus = chain;
497 cluster->error = chain ? chain->error : 0;
507 * Resolve masters. Calculate nmasters for the highest matching
508 * TID, if a quorum cannot be attained try the next lower matching
509 * TID until we exhaust TIDs.
511 * NOTE: A NULL chain is not necessarily an error, it could be
512 * e.g. a lookup failure or the end of an iteration.
515 last_best_quorum_tid = HAMMER2_TID_MAX;
516 quorum_tid = 0; /* fix gcc warning */
/*
 * Pass 2: descend through candidate modify_tids until enough masters
 * agree on one (nmasters >= nquorum) or TID space is exhausted.
 */
518 while (nmasters < nquorum && last_best_quorum_tid != 0) {
522 for (i = 0; i < cluster->nchains; ++i) {
523 if (cluster->pmp->pfs_types[i] !=
524 HAMMER2_PFSTYPE_MASTER) {
527 chain = cluster->array[i].chain;
529 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
531 * Invalid as in unsynchronized, cannot be
532 * used to calculate the quorum.
534 } else if (chain == NULL && quorum_tid == 0) {
536 * NULL chain on master matches NULL chains
540 } else if (quorum_tid < last_best_quorum_tid &&
542 (quorum_tid < chain->bref.modify_tid ||
545 * Better TID located, reset nmasters count.
548 quorum_tid = chain->bref.modify_tid;
550 quorum_tid == chain->bref.modify_tid) {
552 * TID matches current collection.
557 if (nmasters >= nquorum)
559 last_best_quorum_tid = quorum_tid;
/*
 * Pass 3: pick the focus (soft master > soft slave preference rules
 * apply below) and set FEMOD on the writable element set.
 */
565 * NOTE: A NULL chain is not necessarily an error, it could be
566 * e.g. a lookup failure or the end of an iteration.
569 for (i = 0; i < cluster->nchains; ++i) {
570 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
571 chain = cluster->array[i].chain;
572 if (chain && chain->error) {
573 if (cluster->focus == NULL || cluster->focus == chain) {
574 /* error will be overridden by valid focus */
575 cluster->error = chain->error;
580 switch (cluster->pmp->pfs_types[i]) {
581 case HAMMER2_PFSTYPE_MASTER:
583 * We must have enough up-to-date masters to reach
584 * a quorum and the master modify_tid must match
585 * the quorum's modify_tid.
587 * Do not select an errored or out-of-sync master.
589 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
590 nflags |= HAMMER2_CLUSTER_UNHARD;
591 } else if (nmasters >= nquorum &&
592 (chain == NULL || chain->error == 0) &&
593 ((chain == NULL && quorum_tid == 0) ||
594 (chain != NULL && quorum_tid ==
595 chain->bref.modify_tid))) {
596 nflags |= HAMMER2_CLUSTER_WRHARD;
597 nflags |= HAMMER2_CLUSTER_RDHARD;
599 cluster->array[i].flags |=
/* A master focus displaces a previously chosen slave focus. */
602 if (cluster->focus == NULL ||
603 focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
604 focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
605 cluster->focus_index = i;
606 cluster->focus = chain; /* NULL ok */
607 cluster->error = chain ? chain->error :
610 } else if (chain == NULL || chain->error == 0) {
611 nflags |= HAMMER2_CLUSTER_UNHARD;
614 case HAMMER2_PFSTYPE_SLAVE:
616 * We must have enough up-to-date masters to reach
617 * a quorum and the slave modify_tid must match the
618 * quorum's modify_tid.
620 * Do not select an errored slave.
622 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
623 nflags |= HAMMER2_CLUSTER_UNHARD;
624 } else if (nmasters >= nquorum &&
625 (chain == NULL || chain->error == 0) &&
626 ((chain == NULL && quorum_tid == 0) ||
627 (chain && quorum_tid ==
628 chain->bref.modify_tid))) {
630 nflags |= HAMMER2_CLUSTER_RDHARD;
632 /* XXX optimize for RESOLVE_RDONLY */
633 if (cluster->focus == NULL) {
634 focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
635 cluster->focus_index = i;
636 cluster->focus = chain; /* NULL ok */
637 cluster->error = chain ? chain->error :
641 } else if (chain == NULL || chain->error == 0) {
642 nflags |= HAMMER2_CLUSTER_UNSOFT;
645 case HAMMER2_PFSTYPE_SOFT_MASTER:
647 * Directly mounted soft master always wins. There
648 * should be only one.
650 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
651 cluster->focus_index = i;
652 cluster->focus = chain;
653 cluster->error = chain ? chain->error : 0;
654 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
655 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
657 case HAMMER2_PFSTYPE_SOFT_SLAVE:
659 * Directly mounted soft slave always wins. There
660 * should be only one.
662 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
663 if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
664 cluster->focus_index = i;
665 cluster->focus = chain;
666 cluster->error = chain ? chain->error : 0;
667 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
670 case HAMMER2_PFSTYPE_SUPROOT:
672 * spmp (degenerate case)
675 cluster->focus_index = i;
676 cluster->focus = chain;
677 cluster->error = chain ? chain->error : 0;
678 focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
679 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
687 * Focus now set, adjust ddflag. Skip this pass if the focus
688 * is bad or if we are at the PFS root (the bref won't match at
689 * the PFS root, obviously).
691 focus = cluster->focus;
694 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
699 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
/*
 * Pass 4: cross-check surviving elements against the focus bref;
 * any mismatch invalidates that element.
 */
705 * Validate the elements that were not marked invalid. They should
708 for (i = 0; i < cluster->nchains; ++i) {
711 chain = cluster->array[i].chain;
717 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
720 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
721 if (chain->bref.type != focus->bref.type ||
722 chain->bref.key != focus->bref.key ||
723 chain->bref.keybits != focus->bref.keybits ||
724 chain->bref.modify_tid != focus->bref.modify_tid ||
725 chain->bytes != focus->bytes ||
726 ddflag != cluster->ddflag) {
727 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
728 if (hammer2_debug & 1)
729 kprintf("cluster_resolve: matching modify_tid failed "
730 "bref test: idx=%d type=%02x/%02x "
731 "key=%016jx/%d-%016jx/%d "
732 "mod=%016jx/%016jx bytes=%u/%u\n",
734 chain->bref.type, focus->bref.type,
735 chain->bref.key, chain->bref.keybits,
736 focus->bref.key, focus->bref.keybits,
737 chain->bref.modify_tid, focus->bref.modify_tid,
738 chain->bytes, focus->bytes);
739 if (hammer2_debug & 0x4000)
740 panic("cluster_resolve");
741 /* flag issue and force resync? */
747 nflags |= HAMMER2_CLUSTER_NOSOFT;
749 nflags |= HAMMER2_CLUSTER_NOHARD;
752 * Set SSYNCED or MSYNCED for slaves and masters respectively if
753 * all available nodes (even if 0 are available) are fully
754 * synchronized. This is used by the synchronization thread to
755 * determine if there is work it could potentially accomplish.
757 if (nslaves == ttlslaves)
758 nflags |= HAMMER2_CLUSTER_SSYNCED;
759 if (nmasters == ttlmasters)
760 nflags |= HAMMER2_CLUSTER_MSYNCED;
763 * Determine if the cluster was successfully locked for the
764 * requested operation and generate an error code. The cluster
765 * will not be locked (or ref'd) if an error is returned.
767 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
768 * to determine if reading or writing is possible. If writing, the
769 * cluster still requires a call to hammer2_cluster_modify() first.
/* Publish the computed flags; clear stale Z-flags not recomputed above. */
771 atomic_set_int(&cluster->flags, nflags);
772 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
776 * This is used by the sync thread to force non-NULL elements of a copy
777 * of the pmp->iroot cluster to be good which is required to prime the
781 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
785 for (i = 0; i < cluster->nchains; ++i) {
/* Clear INVALID on every non-NULL element, forcing it "good". */
786 if (cluster->array[i].chain)
787 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
792 * Copy a cluster, returned a ref'd cluster. All underlying chains
793 * are also ref'd, but not locked. Focus state is also copied.
795 * Original cluster does not have to be locked but usually is.
796 * New cluster will not be flagged as locked.
798 * Callers using this function to initialize a new cluster from an inode
799 * generally lock and resolve the resulting cluster.
801 * Callers which use this function to save/restore a cluster structure
802 * generally retain the focus state and do not re-resolve it. Caller should
803 * not try to re-resolve internal (cparent) node state during an iteration
804 * as the individual tracking elements of cparent in an iteration may not
805 * match even though they are correct.
808 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
810 hammer2_pfs_t *pmp = ocluster->pmp;
811 hammer2_cluster_t *ncluster;
812 hammer2_chain_t *chain;
815 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
817 ncluster->nchains = ocluster->nchains;
/* Copy the element array; each copied chain gains a ref (not a lock). */
820 for (i = 0; i < ocluster->nchains; ++i) {
821 chain = ocluster->array[i].chain;
822 ncluster->array[i].chain = chain;
823 ncluster->array[i].flags = ocluster->array[i].flags;
825 hammer2_chain_ref(chain);
/* Preserve focus state; new copy is never born locked or inode-embedded. */
827 ncluster->focus_index = ocluster->focus_index;
828 ncluster->focus = ocluster->focus;
829 ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED |
830 HAMMER2_CLUSTER_INODE);
836 * Unlock a cluster. Refcount and focus is maintained.
839 hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx)
841 hammer2_chain_t *chain;
/* Complain loudly (then assert) if the cluster was not locked. */
844 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
845 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
848 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
849 KKASSERT(cluster->refs > 0);
850 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
/* NOTE(review): element at 'idx' is presumably skipped — confirm. */
852 for (i = 0; i < cluster->nchains; ++i) {
855 chain = cluster->array[i].chain;
857 hammer2_chain_unlock(chain);
862 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
/* -1 means "no exception": unlock every element of the cluster. */
864 hammer2_cluster_unlock_except(cluster, -1);
868 * Resize the cluster's physical storage allocation in-place. This may
869 * replace the cluster's chains.
872 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
873 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
874 int nradix, int flags)
876 hammer2_chain_t *chain;
/* Parent and child clusters must belong to the same PFS and line up 1:1. */
879 KKASSERT(cparent->pmp == cluster->pmp); /* can be NULL */
880 KKASSERT(cparent->nchains == cluster->nchains);
882 for (i = 0; i < cluster->nchains; ++i) {
/* Non-modifiable elements fall out of sync; mark them invalid. */
883 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
884 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
887 chain = cluster->array[i].chain;
889 KKASSERT(cparent->array[i].chain);
/* Per-node resize to 2^nradix bytes via the chain layer. */
890 hammer2_chain_resize(trans, ip,
891 cparent->array[i].chain, chain,
898 * Set an inode's cluster modified, marking the related chains RW and
899 * duplicating them if necessary.
901 * The passed-in chain is a localized copy of the chain previously acquired
902 * when the inode was locked (and possibly replaced in the meantime), and
903 * must also be updated. In fact, we update it first and then synchronize
904 * the inode's cluster cache.
906 hammer2_inode_data_t *
907 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
908 hammer2_cluster_t *cluster, int flags)
/* Flag the inode dirty, then make the cluster's chains writable. */
910 atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
911 hammer2_cluster_modify(trans, cluster, flags);
/* Re-point the inode's cached cluster at the (possibly replaced) chains. */
913 hammer2_inode_repoint(ip, NULL, cluster);
/* Return writable inode data from the cluster focus. */
916 return (&hammer2_cluster_wdata(cluster)->ipdata);
920 * Adjust the cluster's chains to allow modification and adjust the
921 * focus. Data will be accessible on return.
923 * If our focused master errors on modify, re-resolve the cluster to
924 * try to select a different master.
927 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
930 hammer2_chain_t *chain;
935 for (i = 0; i < cluster->nchains; ++i) {
/* Non-modifiable elements fall out of sync; mark them invalid. */
936 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
937 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
940 chain = cluster->array[i].chain;
945 hammer2_chain_modify(trans, chain, flags);
/* If the focus itself errored, surface the error on the cluster. */
946 if (cluster->focus == chain && chain->error) {
947 cluster->error = chain->error;
/* Re-resolve so a different, healthy master can become the focus. */
952 hammer2_cluster_resolve(cluster);
956 * Synchronize modifications from the focus to other chains in a cluster.
957 * Convenient because nominal API users can just modify the contents of the
958 * focus (at least for non-blockref data).
960 * Nominal front-end operations only edit non-block-table data in a single
961 * chain. This code copies such modifications to the other chains in the
962 * cluster. Blocktable modifications are handled on a chain-by-chain basis
963 * by both the frontend and the backend and will explode in fireworks if
967 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
969 hammer2_chain_t *focus;
970 hammer2_chain_t *scan;
971 const hammer2_inode_data_t *ripdata;
972 hammer2_inode_data_t *wipdata;
975 focus = cluster->focus;
/* Focus is the source of truth and must itself be modified. */
976 KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
978 for (i = 0; i < cluster->nchains; ++i) {
/* Only FEMOD targets receive the copy; skip the focus itself. */
979 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
981 scan = cluster->array[i].chain;
982 if (scan == NULL || scan == focus)
/* Target must already be modified and structurally identical. */
986 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
987 KKASSERT(focus->bytes == scan->bytes &&
988 focus->bref.type == scan->bref.type);
989 switch(focus->bref.type) {
990 case HAMMER2_BREF_TYPE_INODE:
991 ripdata = &focus->data->ipdata;
992 wipdata = &scan->data->ipdata;
/* Without DIRECTDATA only the meta portion (up to u) is copied. */
993 if ((ripdata->meta.op_flags &
994 HAMMER2_OPFLAG_DIRECTDATA) == 0) {
995 bcopy(ripdata, wipdata,
996 offsetof(hammer2_inode_data_t, u));
999 /* fall through to full copy */
1000 case HAMMER2_BREF_TYPE_DATA:
1001 bcopy(focus->data, scan->data, focus->bytes);
/* Block-table and volume types are never front-end synced. */
1003 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1004 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1005 case HAMMER2_BREF_TYPE_FREEMAP:
1006 case HAMMER2_BREF_TYPE_VOLUME:
1007 panic("hammer2_cluster_modsync: illegal node type");
1011 panic("hammer2_cluster_modsync: unknown node type");
1018 * Lookup initialization/completion API. Returns a locked, fully resolved
1019 * cluster with one ref.
1022 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
1024 hammer2_cluster_t *cluster;
/* Work on a ref'd copy so the caller's cluster is left untouched. */
1026 cluster = hammer2_cluster_copy(cparent);
/* Shared vs exclusive lock selected by the lookup flags. */
1027 if (flags & HAMMER2_LOOKUP_SHARED) {
1028 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
1029 HAMMER2_RESOLVE_SHARED);
1031 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
/* Re-resolve focus/quorum state after taking the locks. */
1033 hammer2_cluster_resolve(cluster);
1039 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
/* Release the lock and the ref acquired by lookup_init. */
1042 hammer2_cluster_unlock(cparent);
1043 hammer2_cluster_drop(cparent);
1048 * Locate first match or overlap under parent, return a new, locked, resolved
1049 * cluster with one ref.
1051 * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1054 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
1055 hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1058 hammer2_cluster_t *cluster;
1059 hammer2_chain_t *chain;
1060 hammer2_key_t key_accum;
1061 hammer2_key_t key_next;
1066 KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1068 pmp = cparent->pmp; /* can be NULL */
/* key_accum tracks the minimum next-key across all per-node lookups. */
1069 key_accum = *key_nextp;
1071 if (flags & HAMMER2_LOOKUP_SHARED)
1072 rflags = HAMMER2_RESOLVE_SHARED;
1076 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
1077 cluster->pmp = pmp; /* can be NULL */
1079 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1080 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1083 * Iterating earlier cluster elements with later elements still
1084 * locked is a problem, so we have to unlock the parent and then
/* Parent is relocked element-by-element below; keep flag consistent. */
1087 hammer2_cluster_unlock(cparent);
1088 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1091 * Pass-1, issue lookups.
1093 for (i = 0; i < cparent->nchains; ++i) {
1094 cluster->array[i].flags = cparent->array[i].flags;
1095 key_next = *key_nextp;
1098 * Always relock the parent as we go.
1100 if (cparent->array[i].chain) {
1101 hammer2_chain_lock(cparent->array[i].chain, rflags);
1105 * Nothing to base the lookup, or parent was not synchronized.
1107 if (cparent->array[i].chain == NULL ||
1108 (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
/* Per-node chain lookup; NULL result counts toward null_count. */
1113 chain = hammer2_chain_lookup(&cparent->array[i].chain,
1116 &cparent->array[i].cache_index,
1118 cluster->array[i].chain = chain;
1119 if (chain == NULL) {
1122 if (key_accum > key_next)
1123 key_accum = key_next;
1129 cluster->nchains = i;
1130 *key_nextp = key_accum;
1133 * The cluster must be resolved, out of sync elements may be present.
1135 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1137 if (null_count != i)
1138 hammer2_cluster_resolve(cluster);
/* All-NULL result (or focus-less without ALLNODES) means lookup failure. */
1139 if (null_count == i ||
1140 (cluster->focus == NULL &&
1141 (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1142 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1143 hammer2_cluster_unlock(cluster);
1144 hammer2_cluster_drop(cluster);
1152 * Locate next match or overlap under parent, replace the passed-in cluster.
1153 * The returned cluster is a new, locked, resolved cluster with one ref.
1155 * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1158 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1159 hammer2_key_t *key_nextp,
1160 hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1162 hammer2_chain_t *ochain;
1163 hammer2_chain_t *nchain;
1164 hammer2_key_t key_accum;
1165 hammer2_key_t key_next;
1172 KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
/* key_accum tracks the minimum next-key across all per-node iterations. */
1174 key_accum = *key_nextp;
/* Remember prior focus indices so focus can be restored as we advance. */
1176 parent_index = cparent->focus_index; /* save prior focus */
1177 cluster_index = cluster->focus_index;
1178 if (flags & HAMMER2_LOOKUP_SHARED)
1179 rflags = HAMMER2_RESOLVE_SHARED;
1183 cluster->focus = NULL; /* XXX needed any more? */
1184 /*cparent->focus = NULL;*/
1185 cluster->focus_index = 0; /* XXX needed any more? */
1186 /*cparent->focus_index = 0;*/
1188 cluster->ddflag = 0;
1191 * The parent is always locked on entry, the iterator may be locked
1192 * depending on flags.
1194 * We must temporarily unlock the passed-in clusters to avoid a
1195 * deadlock between elements of the cluster with other threads.
1196 * We will fixup the lock in the loop.
1198 * Note that this will clear the focus.
1200 * Reflag the clusters as locked, because we will relock them
1203 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1204 hammer2_cluster_unlock(cluster);
1205 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1207 hammer2_cluster_unlock(cparent);
1208 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1210 for (i = 0; i < cparent->nchains; ++i) {
1211 key_next = *key_nextp;
1212 ochain = cluster->array[i].chain;
1215 * Always relock the parent as we go.
1217 if (cparent->array[i].chain)
1218 hammer2_chain_lock(cparent->array[i].chain, rflags);
1221 * Nothing to iterate from. These cases can occur under
1222 * normal operations. For example, during synchronization
1223 * a slave might reach the end of its scan while records
1224 * are still left on the master(s).
1226 if (ochain == NULL) {
/* Invalid parent or element: drop the old chain and skip this node. */
1230 if (cparent->array[i].chain == NULL ||
1231 (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1232 (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
1233 /* ochain has not yet been relocked */
1234 hammer2_chain_drop(ochain);
1235 cluster->array[i].chain = NULL;
1241 * Relock the child if necessary. Parent and child will then
1242 * be locked as expected by hammer2_chain_next() and flags.
1244 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1245 hammer2_chain_lock(ochain, rflags);
1246 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1247 &key_next, key_beg, key_end,
1248 &cparent->array[i].cache_index,
1250 /* ochain now invalid but can still be used for focus check */
1251 if (parent_index == i) {
1252 cparent->focus_index = i;
1253 cparent->focus = cparent->array[i].chain;
1256 cluster->array[i].chain = nchain;
1257 if (nchain == NULL) {
1260 if (key_accum > key_next)
1261 key_accum = key_next;
1267 cluster->nchains = i;
1268 *key_nextp = key_accum;
1271 * The cluster must be resolved, out of sync elements may be present.
1273 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1275 if (null_count != i)
1276 hammer2_cluster_resolve(cluster);
/* All-NULL result (or focus-less without ALLNODES) ends the iteration. */
1277 if (null_count == i ||
1278 (cluster->focus == NULL &&
1279 (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1280 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1281 hammer2_cluster_unlock(cluster);
1282 hammer2_cluster_drop(cluster);
1289 * Advance just one chain in the cluster and recalculate the invalid bit.
1290 * The cluster index is allowed to be flagged invalid on input and is
1291 * recalculated on return.
1293 * (used during synchronization to advance past a chain being deleted).
1295 * The chain being advanced must not be the focus and the clusters in
1296 * question must have already passed normal cluster_lookup/cluster_next
1299 * The cluster always remains intact on return, so void function.
/*
 * Advance just element (i) of the cluster, leaving all other elements
 * alone.  The advanced chain must not be the cluster's focus; the new
 * chain is compared against the focus and the element's INVALID flag
 * is recalculated accordingly.
 *
 * NOTE(review): elided listing -- the trailing parameters (presumably
 * the element index i and flags) and some statements are not visible.
 */
1302 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1303 hammer2_cluster_t *cluster,
1304 hammer2_key_t *key_nextp,
1305 hammer2_key_t key_beg,
1306 hammer2_key_t key_end,
1309 hammer2_chain_t *ochain;
1310 hammer2_chain_t *nchain;
1311 hammer2_chain_t *focus;
1312 hammer2_key_t key_accum;
1313 hammer2_key_t key_next;
1316 key_accum = *key_nextp;
1317 key_next = *key_nextp;
1318 ochain = cluster->array[i].chain;
/* the focus itself must never be single-stepped */
1321 KKASSERT(ochain != cluster->focus);
1323 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1324 &key_next, key_beg, key_end,
1325 &cparent->array[i].cache_index,
1327 /* ochain now invalid */
/* chain_next may have replaced the parent chain; refresh parent focus */
1328 if (cparent->focus_index == i)
1329 cparent->focus = cparent->array[i].chain;
1332 * Install nchain. Note that nchain can be NULL, and can also
1333 * be in an unlocked state depending on flags.
1335 cluster->array[i].chain = nchain;
/* start out valid; re-flagged below if it disagrees with the focus */
1336 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1338 if (key_accum > key_next)
1339 key_accum = key_next;
1341 focus = cluster->focus;
1347 if (nchain == focus) /* ASSERTED NOT TRUE */
/* ddflag mirrors "bref is an inode" for comparison with cluster->ddflag */
1350 ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
/* any divergence from the focus invalidates this element */
1351 if (nchain->bref.type != focus->bref.type ||
1352 nchain->bref.key != focus->bref.key ||
1353 nchain->bref.keybits != focus->bref.keybits ||
1354 nchain->bref.modify_tid != focus->bref.modify_tid ||
1355 nchain->bytes != focus->bytes ||
1356 ddflag != cluster->ddflag) {
1357 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1361 *key_nextp = key_accum;
1364 * For now don't re-resolve cluster->flags.
1366 hammer2_cluster_resolve(cluster);
1371 * Create a new cluster using the specified key
/*
 * Create a new cluster (or fill in *clusterp if supplied) under
 * cparent at (key, keybits), creating one chain per parent node via
 * hammer2_chain_create().  Elements whose parent node (or existing
 * element) lacks HAMMER2_CITEM_FEMOD are marked INVALID instead.
 *
 * NOTE(review): elided listing -- kmalloc flags, refs initialization
 * and parts of the loop structure are not visible here.
 */
1374 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1375 hammer2_cluster_t **clusterp,
1376 hammer2_key_t key, int keybits,
1377 int type, size_t bytes, int flags)
1379 hammer2_cluster_t *cluster;
1384 pmp = trans->pmp; /* can be NULL */
/* allocate a fresh cluster if the caller did not supply one */
1386 if ((cluster = *clusterp) == NULL) {
1387 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1389 cluster->pmp = pmp; /* can be NULL */
1391 cluster->flags = HAMMER2_CLUSTER_LOCKED;
1393 cluster->focus_index = 0;
1394 cluster->focus = NULL;
1397 * NOTE: cluster->array[] entries can initially be NULL. If
1398 * *clusterp is supplied, skip NULL entries, otherwise
1399 * create new chains.
1401 for (i = 0; i < cparent->nchains; ++i) {
/* non-modifiable parent element: flag ours invalid, do not create */
1402 if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1403 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1407 if ((cluster->array[i].flags &
1408 HAMMER2_CITEM_FEMOD) == 0) {
1409 cluster->array[i].flags |=
1410 HAMMER2_CITEM_INVALID;
1413 if (cluster->array[i].chain == NULL)
1416 error = hammer2_chain_create(trans, &cparent->array[i].chain,
1417 &cluster->array[i].chain, pmp,
1419 type, bytes, flags);
/* chain_create may have replaced the parent chain; refresh its focus */
1420 if (cparent->focus_index == i)
1421 cparent->focus = cparent->array[i].chain;
1422 KKASSERT(error == 0);
/* first created element becomes the focus by default ... */
1423 if (cluster->focus == NULL) {
1424 cluster->focus_index = i;
1425 cluster->focus = cluster->array[i].chain;
/* ... but prefer the element matching the parent's focus */
1427 if (cparent->focus == cparent->array[i].chain) {
1428 cluster->focus_index = i;
1429 cluster->focus = cluster->array[i].chain;
1432 cluster->nchains = i;
1433 *clusterp = cluster;
1434 hammer2_cluster_resolve(cluster);
1440 * Rename a cluster to a new parent.
1442 * WARNING! Any passed-in bref is probably from hammer2_cluster_bref(),
1443 * So the data_off field is not relevant. Only the key and
/*
 * Rename the cluster to a new parent: each FEMOD element is re-keyed
 * and re-inserted via hammer2_chain_rename(); non-FEMOD elements are
 * flagged INVALID and skipped.
 *
 * NOTE(review): elided listing -- the condition selecting the xbref
 * path vs the NULL-bref path (presumably "if (bref)") and the trailing
 * hammer2_chain_rename() arguments are not visible; confirm against
 * the full source.
 */
1447 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
1448 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1451 hammer2_chain_t *chain;
1452 hammer2_blockref_t xbref;
1456 cluster->focus = NULL;
1457 cparent->focus = NULL;
1458 cluster->focus_index = 0;
1459 cparent->focus_index = 0;
1462 for (i = 0; i < cluster->nchains; ++i) {
1463 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1464 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1467 chain = cluster->array[i].chain;
/* substitute the caller's key/keybits into a copy of the chain's bref */
1470 xbref = chain->bref;
1471 xbref.key = bref->key;
1472 xbref.keybits = bref->keybits;
1473 hammer2_chain_rename(trans, &xbref,
1474 &cparent->array[i].chain,
1477 hammer2_chain_rename(trans, NULL,
1478 &cparent->array[i].chain,
/* chain_rename may have replaced the parent chain; refresh its focus */
1481 if (cparent->focus_index == i)
1482 cparent->focus = cparent->array[i].chain;
1483 KKASSERT(cluster->array[i].chain == chain); /*remove*/
1489 * Mark a cluster deleted
/*
 * Mark every FEMOD element of the cluster deleted via
 * hammer2_chain_delete(); non-FEMOD elements are flagged INVALID and
 * skipped.  A parent/child mismatch is reported via kprintf() but,
 * per the visible code, is not treated as fatal here.
 */
1492 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1493 hammer2_cluster_t *cluster, int flags)
1495 hammer2_chain_t *chain;
1496 hammer2_chain_t *parent;
/* defensive: deletion requires a parent cluster */
1499 if (cparent == NULL) {
1500 kprintf("cparent is NULL\n");
1504 for (i = 0; i < cluster->nchains; ++i) {
1505 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1506 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1509 parent = cparent->array[i].chain;
1510 chain = cluster->array[i].chain;
/* sanity check: the chain should hang off the matching parent element */
1513 if (chain->parent != parent) {
1514 kprintf("hammer2_cluster_delete: parent "
1515 "mismatch chain=%p parent=%p against=%p\n",
1516 chain, chain->parent, parent);
1518 hammer2_chain_delete(trans, parent, chain, flags);
1524 * Create a snapshot of the specified {parent, ochain} with the specified
1525 * label. The originating hammer2_inode must be exclusively locked for
1528 * The ioctl code has already synced the filesystem.
/*
 * Create a snapshot of ocluster under the super-root using the label
 * in pfs->name: a new PFS-root inode is created, typed MASTER with
 * subtype SNAPSHOT, given a fresh fsid, and the (flushed) blockref
 * set of the origin is brute-force copied into it.  The cluster id
 * is inherited when snapshotting a PFS boundary, otherwise freshly
 * generated.
 *
 * NOTE(review): elided listing -- locals (hmp, opfs_clid, lhc, vat,
 * name_len, error, ...), the vattr setup and the error/return paths
 * are not visible here.
 */
1531 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1532 hammer2_ioc_pfs_t *pfs)
1535 hammer2_cluster_t *ncluster;
1536 const hammer2_inode_data_t *ripdata;
1537 hammer2_inode_data_t *wipdata;
1538 hammer2_chain_t *nchain;
1539 hammer2_inode_t *nip;
1549 kprintf("snapshot %s\n", pfs->name);
/* hash the label the same way a directory entry would be keyed */
1551 name_len = strlen(pfs->name);
1552 lhc = hammer2_dirhash(pfs->name, name_len);
1557 ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1559 opfs_clid = ripdata->meta.pfs_clid;
1561 hmp = ocluster->focus->hmp; /* XXX find synchronized local disk */
1564 * Create the snapshot directory under the super-root
1566 * Set PFS type, generate a unique filesystem id, and generate
1567 * a cluster id. Use the same clid when snapshotting a PFS root,
1568 * which theoretically allows the snapshot to be used as part of
1569 * the same cluster (perhaps as a cache).
1571 * Copy the (flushed) blockref array. Theoretically we could use
1572 * chain_duplicate() but it becomes difficult to disentangle
1573 * the shared core so for now just brute-force it.
1579 nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1580 proc0.p_ucred, pfs->name, name_len,
1582 HAMMER2_INSERT_PFSROOT, &error);
1585 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1586 wipdata->meta.pfs_type = HAMMER2_PFSTYPE_MASTER;
1587 wipdata->meta.pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1588 wipdata->meta.op_flags |= HAMMER2_OPFLAG_PFSROOT;
1589 kern_uuidgen(&wipdata->meta.pfs_fsid, 1);
1592 * Give the snapshot its own private cluster. As a snapshot
1593 * no further synchronization with the original cluster will
1597 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1598 wipdata->meta.pfs_clid = opfs_clid;
1600 kern_uuidgen(&wipdata->meta.pfs_clid, 1);
1602 kern_uuidgen(&wipdata->meta.pfs_clid, 1);
/* flag each (modifiable) element's bref as a PFS root */
1604 for (i = 0; i < ncluster->nchains; ++i) {
1605 if ((ncluster->array[i].flags &
1606 HAMMER2_CITEM_FEMOD) == 0) {
1607 ncluster->array[i].flags |=
1608 HAMMER2_CITEM_INVALID;
1611 nchain = ncluster->array[i].chain;
1613 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1616 /* XXX hack blockset copy */
1617 /* XXX doesn't work with real cluster */
1618 KKASSERT(ocluster->nchains == 1);
1619 wipdata->u.blockset = ripdata->u.blockset;
1620 hammer2_cluster_modsync(ncluster);
/* flush the new PFS root chains to media */
1621 for (i = 0; i < ncluster->nchains; ++i) {
1622 nchain = ncluster->array[i].chain;
1624 hammer2_flush(trans, nchain, 1);
1626 hammer2_inode_unlock(nip, ncluster);
1632 * Return locked parent cluster given a locked child. The child remains
1633 * locked on return. The new parent's focus follows the child's focus
1634 * and the parent is always resolved.
1636 * We must temporarily unlock the passed-in cluster to avoid a deadlock
1637 * between elements of the cluster.
1639 * We must not try to hammer2_cluster_resolve() cparent. The individual
1640 * parent chains for the nodes are the correct parents for the cluster but
1641 * do not necessarily match, so resolve would likely implode.
/*
 * Resolve the locked parent cluster of 'cluster' (locking contract is
 * described in the comment block above this function).
 */
1644 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1646 hammer2_cluster_t *cparent;
/* copy first, then drop the child's lock to avoid intra-cluster deadlock */
1649 cparent = hammer2_cluster_copy(cluster);
1650 hammer2_cluster_unlock(cluster);
1652 for (i = 0; i < cparent->nchains; ++i) {
1653 hammer2_chain_t *chain;
1654 hammer2_chain_t *rchain;
1657 * Calculate parent for each element. Old chain has an extra
1658 * ref for cparent but the lock remains with cluster.
1660 chain = cparent->array[i].chain;
/*
 * ref+lock the presumptive parent, then re-check chain->parent to
 * close the race against concurrent reparenting; retry on a miss.
 */
1663 while ((rchain = chain->parent) != NULL) {
1664 hammer2_chain_ref(rchain);
1665 hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1666 if (chain->parent == rchain)
1668 hammer2_chain_unlock(rchain);
1669 hammer2_chain_drop(rchain);
/* install the parent (NULL at the root) and drop the extra child ref */
1671 cparent->array[i].chain = rchain;
1672 hammer2_chain_drop(chain);
1674 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1675 /* hammer2_cluster_resolve(cparent); */
1676 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1681 /************************************************************************
1683 ************************************************************************
1686 * WARNING! blockref[] array data is not universal. These functions should
1687 * only be used to access universal data.
1689 * NOTE! The rdata call will wait for at least one of the chain I/Os to
1690 * complete if necessary. The I/O's should have already been
1691 * initiated by the cluster_lock/chain_lock operation.
1693 * The cluster must already be in a modified state before wdata
1694 * is called. The data will already be available for this case.
/* Return read-only media data via the cluster's focus chain. */
1696 const hammer2_media_data_t *
1697 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1699 return(cluster->focus->data);
/*
 * Return writable media data via the focus chain; the cluster must
 * already be in a modified state (asserted).
 */
1702 hammer2_media_data_t *
1703 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1705 KKASSERT(hammer2_cluster_modified(cluster));
1706 return(cluster->focus->data);
1710 * Load cluster data asynchronously with callback.
1712 * The callback is made for the first validated data found, or NULL
1713 * if no valid data is available.
1715 * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1716 * in the inode with an exclusive lock held), the chain structure may be
/*
 * Initiate asynchronous I/O for the cluster's focus chain, invoking
 * (*callback)(iocb) when data is available.  The iocb embedded in the
 * cluster is primed with the callback, chain and focus index; the
 * already-validated and zero-fill (hammer2_io_new) short-cuts avoid a
 * media read, otherwise hammer2_io_getblk() issues the read.
 *
 * NOTE(review): elided listing -- locals (hmp, dio, error, ...), the
 * already-validated fast path, and the io_new completion path are not
 * fully visible here; the function may also continue past this chunk.
 */
1720 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1721 void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1723 hammer2_chain_t *chain;
1724 hammer2_iocb_t *iocb;
1726 hammer2_blockref_t *bref;
1729 i = cluster->focus_index;
1730 chain = cluster->focus;
/* prime the iocb embedded in the cluster */
1732 iocb = &cluster->iocb;
1733 iocb->callback = callback;
1734 iocb->dio = NULL; /* for already-validated case */
1735 iocb->cluster = cluster;
1736 iocb->chain = chain;
/* note: lbase carries the focus index here, not a byte offset */
1738 iocb->lbase = (off_t)i;
1743 * Data already validated
1751 * We must resolve to a device buffer, either by issuing I/O or
1752 * by creating a zero-fill element. We do not mark the buffer
1753 * dirty when creating a zero-fill element (the hammer2_chain_modify()
1754 * API must still be used to do that).
1756 * The device buffer is variable-sized in powers of 2 down
1757 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage
1758 * chunk always contains buffers of the same size. (XXX)
1760 * The minimum physical IO size may be larger than the variable
1763 * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
1764 * matches hammer2_devblksize()? Or does the freemap's
1765 * pre-zeroing handle the case for us?
1767 bref = &chain->bref;
1771 /* handled by callback? <- TODO XXX even needed for loads? */
1773 * The getblk() optimization for a 100% overwrite can only be used
1774 * if the physical block size matches the request.
1776 if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1777 chain->bytes == hammer2_devblksize(chain->bytes)) {
1778 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1779 KKASSERT(error == 0);
1787 * Otherwise issue a read
1789 hammer2_adjreadcounter(&chain->bref, chain->bytes);
1790 hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);