2 * Copyright (c) 2013-2018 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * The cluster module collects multiple chains representing the same
36 * information from different nodes into a single entity. It allows direct
37 * access to media data as long as it is not blockref array data (which
38 * will obviously have to be different at each node).
40 * This module also handles I/O dispatch, status rollup, and various
41 * mastership arrangements including quorum operations. It effectively
42 * presents one topology to the vnops layer.
44 * Many of the API calls mimic chain API calls but operate on clusters
45 * instead of chains. Please see hammer2_chain.c for more complete code
46 * documentation of the API functions.
48 * WARNING! This module is *extremely* complex. It must issue asynchronous
49 * locks and I/O, do quorum and/or master-slave processing, and
50 * it must operate properly even if some nodes are broken (which
51 * can also mean indefinite locks).
55 * Cluster operations can be broken down into three pieces:
57 * (1) Chain locking and data retrieval.
59 * - Most complex functions, quorum management on transaction ids.
61 * - Locking and data accesses must be internally asynchronous.
63 * - Validate and manage cache coherency primitives (cache state
64 * is stored in chain topologies but must be validated by these
67 * (2) Lookups and Scans
68 * hammer2_cluster_lookup()
69 * hammer2_cluster_next()
71 * - Depend on locking & data retrieval functions, but still complex.
73 * - Must do quorum management on transaction ids.
75 * - Lookup and Iteration ops must be internally asynchronous.
77 * (3) Modifying Operations
78 * hammer2_cluster_create()
80 * - Can usually punt on failures, operation continues unless quorum
81 * is lost. If quorum is lost, must wait for resynchronization
82 * (depending on the management mode).
84 * - Must disconnect node on failures (also not flush), remount, and
87 * - Network links (via kdmsg) are relatively easy to issue as the
88 * complex underworkings of hammer2_chain.c don't have to messed
89 * with (the protocol is at a higher level than block-level).
91 * - Multiple local disk nodes (i.e. block devices) are another matter.
92 * Chain operations have to be dispatched to per-node threads (xN)
93 * because we can't asynchronize potentially very complex chain
94 * operations in hammer2_chain.c (it would be a huge mess).
96 * (these threads are also used to terminate incoming kdmsg ops from
99 * - Single-node filesystems do not use threads and will simply call
100 * hammer2_chain.c functions directly. This short-cut is handled
101 * at the base of each cluster function.
103 #include <sys/cdefs.h>
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/types.h>
107 #include <sys/lock.h>
108 #include <sys/uuid.h>
113 * Returns the bref type of the cluster's foucs.
115 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
116 * The cluster must be locked.
119 hammer2_cluster_type(hammer2_cluster_t *cluster)
121 if (cluster->error == 0) {
122 KKASSERT(cluster->focus != NULL);
123 return(cluster->focus->bref.type);
129 * Returns the bref of the cluster's focus, sans any data-offset information
130 * (since offset information is per-node and wouldn't be useful).
132 * Callers use this function to access modify_tid, mirror_tid, type,
135 * If the cluster is errored, returns an empty bref.
136 * The cluster must be locked.
139 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
141 if (cluster->error == 0) {
142 KKASSERT(cluster->focus != NULL);
143 *bref = cluster->focus->bref;
146 bzero(bref, sizeof(*bref));
151 * Create a degenerate cluster with one ref from a single locked chain.
152 * The returned cluster will be focused on the chain and inherit its
155 * The chain's lock and reference are transfered to the new cluster, so
156 * the caller should not try to unlock the chain separately.
161 hammer2_dummy_xop_from_chain(hammer2_xop_head_t *xop, hammer2_chain_t *chain)
163 hammer2_cluster_t *cluster;
165 bzero(xop, sizeof(*xop));
167 cluster = &xop->cluster;
168 cluster->array[0].chain = chain;
169 cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
170 cluster->nchains = 1;
171 cluster->focus = chain;
172 cluster->focus_index = 0;
173 cluster->pmp = chain->pmp;
175 cluster->error = chain->error;
176 cluster->flags = HAMMER2_CLUSTER_LOCKED |
177 HAMMER2_CLUSTER_WRHARD |
178 HAMMER2_CLUSTER_RDHARD |
179 HAMMER2_CLUSTER_MSYNCED |
180 HAMMER2_CLUSTER_SSYNCED;
184 * Add a reference to a cluster and its underlying chains.
186 * We must also ref the underlying chains in order to allow ref/unlock
187 * sequences to later re-lock.
190 hammer2_cluster_ref(hammer2_cluster_t *cluster)
192 atomic_add_int(&cluster->refs, 1);
196 * Drop the caller's reference to the cluster. When the ref count drops to
197 * zero this function frees the cluster and drops all underlying chains.
199 * In-progress read I/Os are typically detached from the cluster once the
200 * first one returns (the remaining stay attached to the DIOs but are then
201 * ignored and drop naturally).
204 hammer2_cluster_drop(hammer2_cluster_t *cluster)
206 hammer2_chain_t *chain;
209 KKASSERT(cluster->refs > 0);
210 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
211 cluster->focus = NULL; /* safety XXX chg to assert */
212 cluster->focus_index = 0;
214 for (i = 0; i < cluster->nchains; ++i) {
215 chain = cluster->array[i].chain;
217 hammer2_chain_drop(chain);
218 cluster->array[i].chain = NULL; /* safety */
221 cluster->nchains = 0; /* safety */
223 kfree(cluster, M_HAMMER2);
224 /* cluster is invalid */
229 * Lock a cluster. Cluster must already be referenced. Focus is maintained.
231 * WARNING! This function expects the caller to handle resolution of the
232 * cluster. We never re-resolve the cluster in this function,
233 * because it might be used to temporarily unlock/relock a cparent
234 * in an iteration or recursrion, and the cparents elements do not
238 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
240 hammer2_chain_t *chain;
243 /* cannot be on inode-embedded cluster template, must be on copy */
244 KKASSERT(cluster->refs > 0);
245 KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
246 if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
247 panic("hammer2_cluster_lock: cluster %p already locked!\n",
250 atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
253 * Lock chains and resolve state.
255 for (i = 0; i < cluster->nchains; ++i) {
256 chain = cluster->array[i].chain;
259 hammer2_chain_lock(chain, how);
264 * Calculate the clustering state for the cluster and set its focus.
265 * This routine must be called with care. For example, it should not
266 * normally be called after relocking a non-leaf cluster because parent
267 * clusters help iterations and each element might be at a slightly different
268 * indirect node (each node's topology is independently indexed).
270 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
271 * operations. Typically this is only set on a quorum of MASTERs or
272 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER
273 * is present, this bit is *not* set on a quorum of MASTERs. The
274 * synchronization code ignores this bit, but all hammer2_cluster_*() calls
275 * that create/modify/delete elements use it.
277 * The chains making up the cluster may be narrowed down based on quorum
278 * acceptability, and if RESOLVE_RDONLY is specified the chains can be
279 * narrowed down to a single chain as long as the entire subtopology is known
280 * to be intact. So, for example, we can narrow a read-only op to a single
281 * fast SLAVE but if we focus a CACHE chain we must still retain at least
282 * a SLAVE to ensure that the subtopology can be accessed.
284 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
285 * to be maintained once the topology is validated as-of the top level of
288 * If a failure occurs the operation must be aborted by higher-level code and
/*
 * NOTE(review): this listing appears truncated — declaration, brace and
 * other short lines are missing throughout.  Code lines below are kept
 * byte-identical; only annotations have been added.  Recover the missing
 * lines from revision control before compiling.
 */
292 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
294 hammer2_chain_t *chain;
295 hammer2_chain_t *focus;
297 hammer2_tid_t quorum_tid;
298 hammer2_tid_t last_best_quorum_tid;
310 cluster->focus = NULL;
/* quorum requires a strict majority of configured masters */
323 KKASSERT(pmp != NULL || cluster->nchains == 0);
324 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
330 * NOTE: A NULL chain is not necessarily an error, it could be
331 * e.g. a lookup failure or the end of an iteration.
/* Pass 1: tally masters/slaves (errored or not) and set per-type flags */
334 for (i = 0; i < cluster->nchains; ++i) {
335 chain = cluster->array[i].chain;
336 if (chain && chain->error) {
337 if (cluster->focus == NULL || cluster->focus == chain) {
338 /* error will be overridden by valid focus */
339 cluster->error = chain->error;
343 * Must count total masters and slaves whether the
344 * chain is errored or not.
346 switch (cluster->pmp->pfs_types[i]) {
347 case HAMMER2_PFSTYPE_SUPROOT:
348 case HAMMER2_PFSTYPE_MASTER:
351 case HAMMER2_PFSTYPE_SLAVE:
357 switch (cluster->pmp->pfs_types[i]) {
358 case HAMMER2_PFSTYPE_MASTER:
361 case HAMMER2_PFSTYPE_SLAVE:
364 case HAMMER2_PFSTYPE_SOFT_MASTER:
365 nflags |= HAMMER2_CLUSTER_WRSOFT;
366 nflags |= HAMMER2_CLUSTER_RDSOFT;
369 case HAMMER2_PFSTYPE_SOFT_SLAVE:
370 nflags |= HAMMER2_CLUSTER_RDSOFT;
372 case HAMMER2_PFSTYPE_SUPROOT:
374 * Degenerate cluster representing the super-root
375 * topology on a single device. Fake stuff so
376 * cluster ops work as expected.
378 nflags |= HAMMER2_CLUSTER_WRHARD;
379 nflags |= HAMMER2_CLUSTER_RDHARD;
380 cluster->focus_index = i;
381 cluster->focus = chain;
382 cluster->error = chain ? chain->error : 0;
393 * Resolve masters. Calculate nmasters for the highest matching
394 * TID, if a quorum cannot be attained try the next lower matching
395 * TID until we exhaust TIDs.
397 * NOTE: A NULL chain is not necessarily an error, it could be
398 * e.g. a lookup failure or the end of an iteration.
/* Pass 2: descend through candidate modify_tids until a quorum agrees */
401 last_best_quorum_tid = HAMMER2_TID_MAX;
402 quorum_tid = 0; /* fix gcc warning */
404 while (nmasters < nquorum && last_best_quorum_tid != 0) {
408 for (i = 0; i < cluster->nchains; ++i) {
409 switch (cluster->pmp->pfs_types[i]) {
410 case HAMMER2_PFSTYPE_SUPROOT:
411 case HAMMER2_PFSTYPE_MASTER:
416 chain = cluster->array[i].chain;
418 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
420 * Invalid as in unsynchronized, cannot be
421 * used to calculate the quorum.
423 } else if (chain == NULL && quorum_tid == 0) {
425 * NULL chain on master matches NULL chains
429 } else if (quorum_tid < last_best_quorum_tid &&
431 (quorum_tid < chain->bref.modify_tid ||
434 * Better TID located, reset nmasters count.
437 quorum_tid = chain->bref.modify_tid;
439 quorum_tid == chain->bref.modify_tid) {
441 * TID matches current collection.
446 if (nmasters >= nquorum)
448 last_best_quorum_tid = quorum_tid;
454 * NOTE: A NULL chain is not necessarily an error, it could be
455 * e.g. a lookup failure or the end of an iteration.
/* Pass 3: mark FEMOD-eligible elements and select the focus */
458 for (i = 0; i < cluster->nchains; ++i) {
459 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
460 chain = cluster->array[i].chain;
461 if (chain && chain->error) {
462 if (cluster->focus == NULL || cluster->focus == chain) {
463 /* error will be overridden by valid focus */
464 cluster->error = chain->error;
469 switch (cluster->pmp->pfs_types[i]) {
470 case HAMMER2_PFSTYPE_MASTER:
472 * We must have enough up-to-date masters to reach
473 * a quorum and the master modify_tid must match
474 * the quorum's modify_tid.
476 * Do not select an errored or out-of-sync master.
478 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
479 nflags |= HAMMER2_CLUSTER_UNHARD;
480 } else if (nmasters >= nquorum &&
481 (chain == NULL || chain->error == 0) &&
482 ((chain == NULL && quorum_tid == 0) ||
483 (chain != NULL && quorum_tid ==
484 chain->bref.modify_tid))) {
485 nflags |= HAMMER2_CLUSTER_WRHARD;
486 nflags |= HAMMER2_CLUSTER_RDHARD;
488 cluster->array[i].flags |=
/* a quorum master takes the focus over any earlier slave focus */
491 if (cluster->focus == NULL ||
492 focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
493 focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
494 cluster->focus_index = i;
495 cluster->focus = chain; /* NULL ok */
496 cluster->error = chain ? chain->error :
499 } else if (chain == NULL || chain->error == 0) {
500 nflags |= HAMMER2_CLUSTER_UNHARD;
503 case HAMMER2_PFSTYPE_SLAVE:
505 * We must have enough up-to-date masters to reach
506 * a quorum and the slave modify_tid must match the
507 * quorum's modify_tid.
509 * Do not select an errored slave.
511 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
512 nflags |= HAMMER2_CLUSTER_UNHARD;
513 } else if (nmasters >= nquorum &&
514 (chain == NULL || chain->error == 0) &&
515 ((chain == NULL && quorum_tid == 0) ||
516 (chain && quorum_tid ==
517 chain->bref.modify_tid))) {
519 nflags |= HAMMER2_CLUSTER_RDHARD;
521 /* XXX optimize for RESOLVE_RDONLY */
522 if (cluster->focus == NULL) {
523 focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
524 cluster->focus_index = i;
525 cluster->focus = chain; /* NULL ok */
526 cluster->error = chain ? chain->error :
530 } else if (chain == NULL || chain->error == 0) {
531 nflags |= HAMMER2_CLUSTER_UNSOFT;
534 case HAMMER2_PFSTYPE_SOFT_MASTER:
536 * Directly mounted soft master always wins. There
537 * should be only one.
539 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
540 cluster->focus_index = i;
541 cluster->focus = chain;
542 cluster->error = chain ? chain->error : 0;
543 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
544 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
546 case HAMMER2_PFSTYPE_SOFT_SLAVE:
548 * Directly mounted soft slave always wins. There
549 * should be only one.
551 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
552 if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
553 cluster->focus_index = i;
554 cluster->focus = chain;
555 cluster->error = chain ? chain->error : 0;
556 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
559 case HAMMER2_PFSTYPE_SUPROOT:
561 * spmp (degenerate case)
564 cluster->focus_index = i;
565 cluster->focus = chain;
566 cluster->error = chain ? chain->error : 0;
567 focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
568 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
576 * Focus now set, adjust ddflag. Skip this pass if the focus
577 * is bad or if we are at the PFS root (the bref won't match at
578 * the PFS root, obviously).
580 focus = cluster->focus;
583 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
588 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
594 * Validate the elements that were not marked invalid. They should
/* Pass 4: cross-check each valid element's bref against the focus */
597 for (i = 0; i < cluster->nchains; ++i) {
600 chain = cluster->array[i].chain;
606 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
609 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
610 if (chain->bref.type != focus->bref.type ||
611 chain->bref.key != focus->bref.key ||
612 chain->bref.keybits != focus->bref.keybits ||
613 chain->bref.modify_tid != focus->bref.modify_tid ||
614 chain->bytes != focus->bytes ||
615 ddflag != cluster->ddflag) {
616 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
617 if (hammer2_debug & 1)
618 kprintf("cluster_resolve: matching modify_tid failed "
619 "bref test: idx=%d type=%02x/%02x "
620 "key=%016jx/%d-%016jx/%d "
621 "mod=%016jx/%016jx bytes=%u/%u\n",
623 chain->bref.type, focus->bref.type,
624 chain->bref.key, chain->bref.keybits,
625 focus->bref.key, focus->bref.keybits,
626 chain->bref.modify_tid, focus->bref.modify_tid,
627 chain->bytes, focus->bytes);
628 if (hammer2_debug & 0x4000)
629 panic("cluster_resolve");
630 /* flag issue and force resync? */
636 nflags |= HAMMER2_CLUSTER_NOSOFT;
638 nflags |= HAMMER2_CLUSTER_NOHARD;
641 * Set SSYNCED or MSYNCED for slaves and masters respectively if
642 * all available nodes (even if 0 are available) are fully
643 * synchronized. This is used by the synchronization thread to
644 * determine if there is work it could potentially accomplish.
646 if (nslaves == ttlslaves)
647 nflags |= HAMMER2_CLUSTER_SSYNCED;
648 if (nmasters == ttlmasters)
649 nflags |= HAMMER2_CLUSTER_MSYNCED;
652 * Determine if the cluster was successfully locked for the
653 * requested operation and generate an error code. The cluster
654 * will not be locked (or ref'd) if an error is returned.
/* publish computed state; clear any stale Z-flags not recomputed above */
656 atomic_set_int(&cluster->flags, nflags);
657 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
661 * This is used by the XOPS subsystem to calculate the state of
662 * the collection and tell hammer2_xop_collect() what to do with it.
663 * The collection can be in various states of desynchronization, the
664 * caller specifically wants to resolve the passed-in key.
667 * 0 - Quorum agreement, key is valid
669 * ENOENT - Quorum agreement, end of scan
671 * ESRCH - Quorum agreement, key is INVALID (caller should
674 * EIO - Quorum agreement but all elements had errors.
676 * EDEADLK - No quorum agreement possible for key, a repair
677 * may be needed. Caller has to decide what to do,
678 * possibly iterating the key or generating an EIO.
680 * EINPROGRESS - No quorum agreement yet, but agreement is still
681 * possible if caller waits for more responses. Caller
682 * should not iterate key.
684 * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
686 * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
/*
 * NOTE(review): this listing appears truncated — declaration, brace and
 * other short lines are missing throughout.  Code lines below are kept
 * byte-identical; only annotations have been added.  Recover the missing
 * lines from revision control before compiling.
 */
689 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
691 hammer2_chain_t *chain;
692 hammer2_chain_t *focus;
694 hammer2_tid_t quorum_tid;
695 hammer2_tid_t last_best_quorum_tid;
700 int nmasters_keymatch;
703 int umasters; /* unknown masters (still in progress) */
709 cluster->focus = NULL;
/* quorum requires a strict majority of configured masters */
712 KKASSERT(pmp != NULL || cluster->nchains == 0);
717 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
726 * NOTE: A NULL chain is not necessarily an error, it could be
727 * e.g. a lookup failure or the end of an iteration.
/* Pass 1: reset per-element flags, tally masters/slaves, set type flags */
730 for (i = 0; i < cluster->nchains; ++i) {
731 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
732 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
734 chain = cluster->array[i].chain;
735 error = cluster->array[i].error;
736 if (chain && error) {
737 if (cluster->focus == NULL || cluster->focus == chain) {
738 /* error will be overridden by valid focus */
743 * Must count total masters and slaves whether the
744 * chain is errored or not.
746 switch (cluster->pmp->pfs_types[i]) {
747 case HAMMER2_PFSTYPE_SUPROOT:
748 case HAMMER2_PFSTYPE_MASTER:
751 case HAMMER2_PFSTYPE_SLAVE:
757 switch (cluster->pmp->pfs_types[i]) {
758 case HAMMER2_PFSTYPE_MASTER:
761 case HAMMER2_PFSTYPE_SLAVE:
764 case HAMMER2_PFSTYPE_SOFT_MASTER:
765 nflags |= HAMMER2_CLUSTER_WRSOFT;
766 nflags |= HAMMER2_CLUSTER_RDSOFT;
769 case HAMMER2_PFSTYPE_SOFT_SLAVE:
770 nflags |= HAMMER2_CLUSTER_RDSOFT;
772 case HAMMER2_PFSTYPE_SUPROOT:
774 * Degenerate cluster representing the super-root
775 * topology on a single device. Fake stuff so
776 * cluster ops work as expected.
779 nflags |= HAMMER2_CLUSTER_WRHARD;
780 nflags |= HAMMER2_CLUSTER_RDHARD;
781 cluster->focus_index = i;
782 cluster->focus = chain;
783 cluster->error = error;
793 * Resolve nmasters - master nodes fully match
795 * Resolve umasters - master nodes operation still
798 * Resolve nmasters_keymatch - master nodes match the passed-in
799 * key and may or may not match
800 * the quorum-agreed tid.
802 * The quorum-agreed TID is the highest matching TID.
/* Pass 2: descend through candidate modify_tids until a quorum agrees */
804 last_best_quorum_tid = HAMMER2_TID_MAX;
807 nmasters_keymatch = 0;
808 quorum_tid = 0; /* fix gcc warning */
810 while (nmasters < nquorum && last_best_quorum_tid != 0) {
813 nmasters_keymatch = 0;
816 for (i = 0; i < cluster->nchains; ++i) {
817 /* XXX SOFT smpresent handling */
818 switch(cluster->pmp->pfs_types[i]) {
819 case HAMMER2_PFSTYPE_MASTER:
820 case HAMMER2_PFSTYPE_SUPROOT:
826 chain = cluster->array[i].chain;
827 error = cluster->array[i].error;
830 * Skip elements still in progress. umasters keeps
831 * track of masters that might still be in-progress.
833 if (chain == NULL && (cluster->array[i].flags &
834 HAMMER2_CITEM_NULL) == 0) {
842 if (flags & HAMMER2_CHECK_NULL) {
846 if (cluster->error == 0)
847 cluster->error = error;
/* key == (hammer2_key_t)-1 acts as a wildcard match */
850 (key == (hammer2_key_t)-1 ||
851 chain->bref.key == key)) {
854 if (chain->bref.modify_tid <
855 last_best_quorum_tid &&
856 quorum_tid < chain->bref.modify_tid) {
858 * Select new TID as master if better
859 * than any found so far in this loop,
860 * as long as it does not reach the
861 * best tid found in the previous loop.
864 quorum_tid = chain->bref.modify_tid;
866 if (quorum_tid == chain->bref.modify_tid) {
868 * TID matches current collection.
870 * (error handled in next pass)
873 if (chain->error == 0) {
874 cluster->focus = chain;
875 cluster->focus_index = i;
880 if (nmasters >= nquorum)
882 last_best_quorum_tid = quorum_tid;
886 kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
887 nmasters, nquorum, nmasters_keymatch, umasters);
891 * Early return if we do not have enough masters.
/* EINPROGRESS if pending masters could still reach quorum, else
 * ESRCH (keys disagree) or EDEADLK (keys agree, tids do not) */
893 if (nmasters < nquorum) {
894 if (nmasters + umasters >= nquorum)
895 return HAMMER2_ERROR_EINPROGRESS;
896 if (nmasters_keymatch < nquorum)
897 return HAMMER2_ERROR_ESRCH;
898 return HAMMER2_ERROR_EDEADLK;
902 * Validated end of scan.
904 if (flags & HAMMER2_CHECK_NULL) {
905 if (cluster->error == 0)
906 cluster->error = HAMMER2_ERROR_ENOENT;
907 return cluster->error;
911 * If we have a NULL focus at this point the agreeing quorum all
914 if (cluster->focus == NULL)
915 return HAMMER2_ERROR_EIO;
920 * We have quorum agreement, validate elements, not end of scan.
/* Pass 3: mark agreeing elements valid and accumulate flags */
925 for (i = 0; i < cluster->nchains; ++i) {
926 chain = cluster->array[i].chain;
927 error = cluster->array[i].error;
929 chain->bref.key != key ||
930 chain->bref.modify_tid != quorum_tid) {
937 * XXX for now, cumulative error.
939 if (cluster->error == 0)
940 cluster->error = error;
942 switch (cluster->pmp->pfs_types[i]) {
943 case HAMMER2_PFSTYPE_MASTER:
944 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
945 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
946 nflags |= HAMMER2_CLUSTER_WRHARD;
947 nflags |= HAMMER2_CLUSTER_RDHARD;
949 case HAMMER2_PFSTYPE_SLAVE:
951 * We must have enough up-to-date masters to reach
952 * a quorum and the slave modify_tid must match the
953 * quorum's modify_tid.
955 * Do not select an errored slave.
957 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
958 nflags |= HAMMER2_CLUSTER_RDHARD;
961 case HAMMER2_PFSTYPE_SOFT_MASTER:
963 * Directly mounted soft master always wins. There
964 * should be only one.
966 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
967 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
969 case HAMMER2_PFSTYPE_SOFT_SLAVE:
971 * Directly mounted soft slave always wins. There
972 * should be only one.
976 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
978 case HAMMER2_PFSTYPE_SUPROOT:
980 * spmp (degenerate case)
982 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
983 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
984 nflags |= HAMMER2_CLUSTER_WRHARD;
985 nflags |= HAMMER2_CLUSTER_RDHARD;
993 * Focus now set, adjust ddflag. Skip this pass if the focus
994 * is bad or if we are at the PFS root (the bref won't match at
995 * the PFS root, obviously).
997 * focus is probably not locked and it isn't safe to test its
998 * content (e.g. focus->data, focus->dio, other content). We
999 * do not synchronize the dio to the cpu here. In fact, in numerous
1000 * situations the frontend doesn't even need to access its dio/data,
1001 * so synchronizing it here would be wasteful.
1003 focus = cluster->focus;
1006 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
1008 cluster->ddflag = 0;
1011 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1017 * Validate the elements that were not marked invalid. They should
/* Pass 4: cross-check each valid element's bref against the focus */
1020 for (i = 0; i < cluster->nchains; ++i) {
1023 chain = cluster->array[i].chain;
1029 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1032 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1033 if (chain->bref.type != focus->bref.type ||
1034 chain->bref.key != focus->bref.key ||
1035 chain->bref.keybits != focus->bref.keybits ||
1036 chain->bref.modify_tid != focus->bref.modify_tid ||
1037 chain->bytes != focus->bytes ||
1038 ddflag != cluster->ddflag) {
1039 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1040 if (hammer2_debug & 1)
1041 kprintf("cluster_resolve: matching modify_tid failed "
1042 "bref test: idx=%d type=%02x/%02x "
1043 "key=%016jx/%d-%016jx/%d "
1044 "mod=%016jx/%016jx bytes=%u/%u\n",
1046 chain->bref.type, focus->bref.type,
1047 chain->bref.key, chain->bref.keybits,
1048 focus->bref.key, focus->bref.keybits,
1049 chain->bref.modify_tid, focus->bref.modify_tid,
1050 chain->bytes, focus->bytes);
1051 if (hammer2_debug & 0x4000)
1052 panic("cluster_resolve");
1053 /* flag issue and force resync? */
1059 nflags |= HAMMER2_CLUSTER_NOSOFT;
1060 if (ttlmasters == 0)
1061 nflags |= HAMMER2_CLUSTER_NOHARD;
1064 * Set SSYNCED or MSYNCED for slaves and masters respectively if
1065 * all available nodes (even if 0 are available) are fully
1066 * synchronized. This is used by the synchronization thread to
1067 * determine if there is work it could potentially accomplish.
1069 if (nslaves == ttlslaves)
1070 nflags |= HAMMER2_CLUSTER_SSYNCED;
1071 if (nmasters == ttlmasters)
1072 nflags |= HAMMER2_CLUSTER_MSYNCED;
1075 * Determine if the cluster was successfully locked for the
1076 * requested operation and generate an error code. The cluster
1077 * will not be locked (or ref'd) if an error is returned.
/* publish computed state; clear any stale Z-flags not recomputed above */
1079 atomic_set_int(&cluster->flags, nflags);
1080 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
1082 return cluster->error;
1086 * This is used by the sync thread to force non-NULL elements of a copy
1087 * of the pmp->iroot cluster to be good which is required to prime the
1091 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
1095 for (i = 0; i < cluster->nchains; ++i) {
1096 if (cluster->array[i].chain)
1097 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1102 * Unlock a cluster.  Refcount and focus are maintained.
/*
 * NOTE(review): this listing appears truncated and the function body
 * continues past the end of the visible text; code lines are kept
 * byte-identical.  Recover the missing lines from revision control.
 */
1105 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
1107 hammer2_chain_t *chain;
/* warn (rather than panic) on unbalanced unlock, then assert */
1110 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
1111 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1114 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
1115 KKASSERT(cluster->refs > 0);
1116 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
/* unlock each underlying chain; NULL elements are skipped */
1118 for (i = 0; i < cluster->nchains; ++i) {
1119 chain = cluster->array[i].chain;
1121 hammer2_chain_unlock(chain);