2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * The cluster module collects multiple chains representing the same
36 * information from different nodes into a single entity. It allows direct
37 * access to media data as long as it is not blockref array data (which
38 * will obviously have to be different at each node).
40 * This module also handles I/O dispatch, status rollup, and various
41 * mastership arrangements including quorum operations. It effectively
42 * presents one topology to the vnops layer.
44 * Many of the API calls mimic chain API calls but operate on clusters
45 * instead of chains. Please see hammer2_chain.c for more complete code
46 * documentation of the API functions.
48 * WARNING! This module is *extremely* complex. It must issue asynchronous
49 * locks and I/O, do quorum and/or master-slave processing, and
50 * it must operate properly even if some nodes are broken (which
51 * can also mean indefinite locks).
55 * Cluster operations can be broken down into three pieces:
57 * (1) Chain locking and data retrieval.
58 * hammer2_cluster_lock()
59 * hammer2_cluster_parent()
61 * - Most complex functions, quorum management on transaction ids.
63 * - Locking and data accesses must be internally asynchronous.
65 * - Validate and manage cache coherency primitives (cache state
66 * is stored in chain topologies but must be validated by these
69 * (2) Lookups and Scans
70 * hammer2_cluster_lookup()
71 * hammer2_cluster_next()
73 * - Depend on locking & data retrieval functions, but still complex.
75 * - Must do quorum management on transaction ids.
77 * - Lookup and iteration ops must be internally asynchronous.
79 * (3) Modifying Operations
80 * hammer2_cluster_create()
81 * hammer2_cluster_rename()
82 * hammer2_cluster_delete()
83 * hammer2_cluster_modify()
84 * hammer2_cluster_modsync()
86 * - Can usually punt on failures, operation continues unless quorum
87 * is lost. If quorum is lost, must wait for resynchronization
88 * (depending on the management mode).
90 * - Must disconnect node on failures (also not flush), remount, and
93 * - Network links (via kdmsg) are relatively easy to issue as the
94 * complex underworkings of hammer2_chain.c don't have to be messed
95 * with (the protocol is at a higher level than block-level).
97 * - Multiple local disk nodes (i.e. block devices) are another matter.
98 * Chain operations have to be dispatched to per-node threads (xN)
99 * because we can't asynchronize potentially very complex chain
100 * operations in hammer2_chain.c (it would be a huge mess).
102 * (these threads are also used to terminate incoming kdmsg ops from
105 * - Single-node filesystems do not use threads and will simply call
106 * hammer2_chain.c functions directly. This short-cut is handled
107 * at the base of each cluster function.
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
119 * Returns the bref type of the cluster's focus.
121 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
122 * The cluster must be locked.
/*
 * Report the bref type of the focus chain.  Only the non-errored path is
 * visible in this excerpt: the focus is read only when cluster->error
 * is zero.  The errored-path return is not shown here.
 */
125 hammer2_cluster_type(hammer2_cluster_t *cluster)
127 if (cluster->error == 0) {
/* A cluster without an error is asserted to have a focus chain. */
128 KKASSERT(cluster->focus != NULL);
129 return(cluster->focus->bref.type);
135 * Returns non-zero if the cluster's focus is flagged as being modified.
137 * If the cluster is errored, returns 0.
/*
 * Test the HAMMER2_CHAIN_MODIFIED flag on the focus chain.  Only the
 * non-errored path is visible in this excerpt; the errored-path return
 * is not shown here.
 */
141 hammer2_cluster_modified(hammer2_cluster_t *cluster)
143 if (cluster->error == 0) {
/* A cluster without an error is asserted to have a focus chain. */
144 KKASSERT(cluster->focus != NULL);
/* The != 0 comparison normalizes the masked flag to 0 or 1. */
145 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
151 * Returns the bref of the cluster's focus, sans any data-offset information
152 * (since offset information is per-node and wouldn't be useful).
154 * Callers use this function to access modify_tid, mirror_tid, type,
157 * If the cluster is errored, returns an empty bref.
158 * The cluster must be locked.
/*
 * Fill the caller-supplied bref from the focus chain.
 *
 * cluster - cluster whose focus is consulted
 * bref    - output; overwritten by structure copy or zeroed
 */
161 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
163 if (cluster->error == 0) {
/* A cluster without an error is asserted to have a focus chain. */
164 KKASSERT(cluster->focus != NULL);
/* Whole-structure copy of the focus blockref into the output. */
165 *bref = cluster->focus->bref;
/*
 * NOTE(review): the branch line selecting this bzero is not visible in
 * this excerpt; presumably it is the errored path - confirm.
 */
168 bzero(bref, sizeof(*bref));
173 * Set the check mode for the cluster.
174 * Errored elements of the cluster are ignored.
176 * The cluster must be locked and modified.
/*
 * Store check_algo into the check-method field of bref.methods on the
 * chains of the cluster.
 *
 * cluster    - must carry HAMMER2_CLUSTER_LOCKED (asserted)
 * check_algo - value encoded through HAMMER2_ENC_CHECK()
 */
179 hammer2_cluster_setmethod_check(hammer2_cluster_t *cluster, int check_algo)
181 hammer2_chain_t *chain;
184 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
185 for (i = 0; i < cluster->nchains; ++i) {
/*
 * Elements without HAMMER2_CITEM_FEMOD are flagged INVALID.
 * NOTE(review): the statement that skips the rest of the iteration is
 * not visible in this excerpt - confirm.
 */
186 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
187 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
190 chain = cluster->array[i].chain;
/* The chain must already be flagged modified before its bref is edited. */
195 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
/* Clear every bit of the check field, then install the new value. */
196 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
197 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
202 * Create a degenerate cluster with one ref from a single locked chain.
203 * The returned cluster will be focused on the chain and inherit its
206 * The chain's lock and reference are transferred to the new cluster, so
207 * the caller should not try to unlock the chain separately.
/*
 * Build a single-element cluster around chain.  The allocation uses
 * M_WAITOK | M_ZERO, so fields not assigned below start out zeroed.
 * The return statement is not visible in this excerpt.
 */
212 hammer2_cluster_from_chain(hammer2_chain_t *chain)
214 hammer2_cluster_t *cluster;
216 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
/* Slot 0 holds the chain and is flagged HAMMER2_CITEM_FEMOD. */
217 cluster->array[0].chain = chain;
218 cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
219 cluster->nchains = 1;
/* The only chain is also the focus. */
220 cluster->focus = chain;
221 cluster->focus_index = 0;
222 cluster->pmp = chain->pmp;
/* The cluster starts with the error state of the chain. */
224 cluster->error = chain->error;
/* Flag the new cluster as locked, hard read/write, and fully synced. */
225 cluster->flags = HAMMER2_CLUSTER_LOCKED |
226 HAMMER2_CLUSTER_WRHARD |
227 HAMMER2_CLUSTER_RDHARD |
228 HAMMER2_CLUSTER_MSYNCED |
229 HAMMER2_CLUSTER_SSYNCED;
235 * Add a reference to a cluster and its underlying chains.
237 * We must also ref the underlying chains in order to allow ref/unlock
238 * sequences to later re-lock.
/*
 * Atomically add one to cluster->refs.
 * NOTE(review): the only statement visible here touches the cluster
 * counter; no per-chain reference is taken on the lines shown.
 */
241 hammer2_cluster_ref(hammer2_cluster_t *cluster)
243 atomic_add_int(&cluster->refs, 1);
247 * Drop the caller's reference to the cluster. When the ref count drops to
248 * zero this function frees the cluster and drops all underlying chains.
250 * In-progress read I/Os are typically detached from the cluster once the
251 * first one returns (the remaining stay attached to the DIOs but are then
252 * ignored and drop naturally).
/*
 * Release one cluster reference.  When this was the last reference the
 * focus is cleared, the chains are dropped, and the cluster memory is
 * freed with kfree().
 */
255 hammer2_cluster_drop(hammer2_cluster_t *cluster)
257 hammer2_chain_t *chain;
260 KKASSERT(cluster->refs > 0);
/* A fetched value of 1 means the count just went from 1 to 0. */
261 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
262 cluster->focus = NULL; /* safety XXX chg to assert */
263 cluster->focus_index = 0;
265 for (i = 0; i < cluster->nchains; ++i) {
266 chain = cluster->array[i].chain;
/*
 * NOTE(review): any NULL test guarding this drop is not visible in this
 * excerpt - confirm.
 */
268 hammer2_chain_drop(chain);
269 cluster->array[i].chain = NULL; /* safety */
272 cluster->nchains = 0; /* safety */
274 kfree(cluster, M_HAMMER2);
275 /* cluster is invalid */
280 * Lock a cluster. Cluster must already be referenced. Focus is maintained.
282 * WARNING! This function expects the caller to handle resolution of the
283 * cluster. We never re-resolve the cluster in this function,
284 * because it might be used to temporarily unlock/relock a cparent
285 * in an iteration or recursion, and the cparent's elements do not
/*
 * Flag the cluster locked and lock its chains.
 *
 * cluster - must hold at least one ref and must not carry
 *           HAMMER2_CLUSTER_INODE (both asserted)
 * idx     - not referenced on any line visible in this excerpt;
 *           presumably the element index to leave unlocked - confirm
 * how     - passed unchanged to hammer2_chain_lock()
 *
 * Panics if HAMMER2_CLUSTER_LOCKED is already set.
 */
289 hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how)
291 hammer2_chain_t *chain;
294 /* cannot be on inode-embedded cluster template, must be on copy */
295 KKASSERT(cluster->refs > 0);
296 KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
/* Locking an already locked cluster is treated as fatal. */
297 if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
298 panic("hammer2_cluster_lock: cluster %p already locked!\n",
/* The LOCKED flag is set before any chain lock is taken. */
301 atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
304 * Lock chains and resolve state.
306 for (i = 0; i < cluster->nchains; ++i) {
309 chain = cluster->array[i].chain;
312 hammer2_chain_lock(chain, how);
/*
 * Convenience wrapper: calls hammer2_cluster_lock_except() with an
 * index of -1 and the caller-supplied how flags.
 */
317 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
319 hammer2_cluster_lock_except(cluster, -1, how);
323 * Calculate the clustering state for the cluster and set its focus.
324 * This routine must be called with care. For example, it should not
325 * normally be called after relocking a non-leaf cluster because parent
326 * clusters help iterations and each element might be at a slightly different
327 * indirect node (each node's topology is independently indexed).
329 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
330 * operations. Typically this is only set on a quorum of MASTERs or
331 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER
332 * is present, this bit is *not* set on a quorum of MASTERs. The
333 * synchronization code ignores this bit, but all hammer2_cluster_*() calls
334 * that create/modify/delete elements use it.
336 * The chains making up the cluster may be narrowed down based on quorum
337 * acceptability, and if RESOLVE_RDONLY is specified the chains can be
338 * narrowed down to a single chain as long as the entire subtopology is known
339 * to be intact. So, for example, we can narrow a read-only op to a single
340 * fast SLAVE but if we focus a CACHE chain we must still retain at least
341 * a SLAVE to ensure that the subtopology can be accessed.
343 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
344 * to be maintained once the topology is validated as-of the top level of
347 * If a failure occurs the operation must be aborted by higher-level code and
/*
 * Phases visible in this excerpt, in order:
 *
 *  1. The focus is reset to NULL and nquorum is computed as
 *     pfs_nmasters / 2 + 1 (0 when pmp is NULL).
 *  2. First pass over the elements: a chain error is copied to
 *     cluster->error while no focus exists or the focus is that chain,
 *     then pfs_types[i] is switched on twice.  The second switch sets
 *     WRSOFT/RDSOFT for SOFT_MASTER, RDSOFT for SOFT_SLAVE, and for
 *     SUPROOT sets WRHARD/RDHARD and focuses the chain.
 *  3. Quorum loop: repeats while nmasters < nquorum and
 *     last_best_quorum_tid != 0, scanning MASTER elements and comparing
 *     bref.modify_tid against quorum_tid.
 *  4. Second pass: HAMMER2_CITEM_FEMOD is cleared on every element and
 *     the focus is chosen per PFS type.
 *  5. Each element not flagged INVALID is compared against the focus
 *     bref (type, key, keybits, modify_tid), bytes and ddflag; a mismatch
 *     flags the element INVALID.
 *  6. SSYNCED/MSYNCED are derived from the counters, then nflags is
 *     published: bits in nflags are set and HAMMER2_CLUSTER_ZFLAGS bits
 *     absent from nflags are cleared.
 *
 * NOTE(review): several local declarations, counter increments and
 * continue/break statements are not visible in this excerpt.
 */
351 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
353 hammer2_chain_t *chain;
354 hammer2_chain_t *focus;
356 hammer2_tid_t quorum_tid;
357 hammer2_tid_t last_best_quorum_tid;
/* Start from scratch: no focus until one is selected below. */
369 cluster->focus = NULL;
/* A cluster without a pmp is only acceptable when it has no chains. */
382 KKASSERT(pmp != NULL || cluster->nchains == 0);
/* Majority of the configured masters. */
383 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
389 * NOTE: A NULL chain is not necessarily an error, it could be
390 * e.g. a lookup failure or the end of an iteration.
393 for (i = 0; i < cluster->nchains; ++i) {
394 chain = cluster->array[i].chain;
395 if (chain && chain->error) {
396 if (cluster->focus == NULL || cluster->focus == chain) {
397 /* error will be overridden by valid focus */
398 cluster->error = chain->error;
402 * Must count total masters and slaves whether the
403 * chain is errored or not.
405 switch (cluster->pmp->pfs_types[i]) {
406 case HAMMER2_PFSTYPE_MASTER:
409 case HAMMER2_PFSTYPE_SLAVE:
415 switch (cluster->pmp->pfs_types[i]) {
416 case HAMMER2_PFSTYPE_MASTER:
419 case HAMMER2_PFSTYPE_SLAVE:
422 case HAMMER2_PFSTYPE_SOFT_MASTER:
423 nflags |= HAMMER2_CLUSTER_WRSOFT;
424 nflags |= HAMMER2_CLUSTER_RDSOFT;
427 case HAMMER2_PFSTYPE_SOFT_SLAVE:
428 nflags |= HAMMER2_CLUSTER_RDSOFT;
430 case HAMMER2_PFSTYPE_SUPROOT:
432 * Degenerate cluster representing the super-root
433 * topology on a single device. Fake stuff so
434 * cluster ops work as expected.
436 nflags |= HAMMER2_CLUSTER_WRHARD;
437 nflags |= HAMMER2_CLUSTER_RDHARD;
438 cluster->focus_index = i;
439 cluster->focus = chain;
440 cluster->error = chain ? chain->error : 0;
450 * Resolve masters. Calculate nmasters for the highest matching
451 * TID, if a quorum cannot be attained try the next lower matching
452 * TID until we exhaust TIDs.
454 * NOTE: A NULL chain is not necessarily an error, it could be
455 * e.g. a lookup failure or the end of an iteration.
/* The search starts with HAMMER2_TID_MAX as the upper bound. */
458 last_best_quorum_tid = HAMMER2_TID_MAX;
459 quorum_tid = 0; /* fix gcc warning */
461 while (nmasters < nquorum && last_best_quorum_tid != 0) {
465 for (i = 0; i < cluster->nchains; ++i) {
/* Only MASTER elements take part in the quorum calculation. */
466 if (cluster->pmp->pfs_types[i] !=
467 HAMMER2_PFSTYPE_MASTER) {
470 chain = cluster->array[i].chain;
472 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
474 * Invalid as in unsynchronized, cannot be
475 * used to calculate the quorum.
477 } else if (chain == NULL && quorum_tid == 0) {
479 * NULL chain on master matches NULL chains
483 } else if (quorum_tid < last_best_quorum_tid &&
485 (quorum_tid < chain->bref.modify_tid ||
488 * Better TID located, reset nmasters count.
491 quorum_tid = chain->bref.modify_tid;
493 quorum_tid == chain->bref.modify_tid) {
495 * TID matches current collection.
500 if (nmasters >= nquorum)
/* No quorum at this TID: it becomes the bound for the next pass. */
502 last_best_quorum_tid = quorum_tid;
508 * NOTE: A NULL chain is not necessarily an error, it could be
509 * e.g. a lookup failure or the end of an iteration.
512 for (i = 0; i < cluster->nchains; ++i) {
/* FEMOD is recomputed from scratch for every element. */
513 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
514 chain = cluster->array[i].chain;
515 if (chain && chain->error) {
516 if (cluster->focus == NULL || cluster->focus == chain) {
517 /* error will be overridden by valid focus */
518 cluster->error = chain->error;
523 switch (cluster->pmp->pfs_types[i]) {
524 case HAMMER2_PFSTYPE_MASTER:
526 * We must have enough up-to-date masters to reach
527 * a quorum and the master modify_tid must match
528 * the quorum's modify_tid.
530 * Do not select an errored or out-of-sync master.
532 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
533 nflags |= HAMMER2_CLUSTER_UNHARD;
534 } else if (nmasters >= nquorum &&
535 (chain == NULL || chain->error == 0) &&
536 ((chain == NULL && quorum_tid == 0) ||
537 (chain != NULL && quorum_tid ==
538 chain->bref.modify_tid))) {
539 nflags |= HAMMER2_CLUSTER_WRHARD;
540 nflags |= HAMMER2_CLUSTER_RDHARD;
542 cluster->array[i].flags |=
545 if (cluster->focus == NULL ||
546 focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
547 focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
548 cluster->focus_index = i;
549 cluster->focus = chain; /* NULL ok */
550 cluster->error = chain ? chain->error :
553 } else if (chain == NULL || chain->error == 0) {
554 nflags |= HAMMER2_CLUSTER_UNHARD;
557 case HAMMER2_PFSTYPE_SLAVE:
559 * We must have enough up-to-date masters to reach
560 * a quorum and the slave modify_tid must match the
561 * quorum's modify_tid.
563 * Do not select an errored slave.
565 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
566 nflags |= HAMMER2_CLUSTER_UNHARD;
567 } else if (nmasters >= nquorum &&
568 (chain == NULL || chain->error == 0) &&
569 ((chain == NULL && quorum_tid == 0) ||
570 (chain && quorum_tid ==
571 chain->bref.modify_tid))) {
573 nflags |= HAMMER2_CLUSTER_RDHARD;
575 /* XXX optimize for RESOLVE_RDONLY */
576 if (cluster->focus == NULL) {
577 focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
578 cluster->focus_index = i;
579 cluster->focus = chain; /* NULL ok */
580 cluster->error = chain ? chain->error :
584 } else if (chain == NULL || chain->error == 0) {
585 nflags |= HAMMER2_CLUSTER_UNSOFT;
588 case HAMMER2_PFSTYPE_SOFT_MASTER:
590 * Directly mounted soft master always wins. There
591 * should be only one.
593 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
594 cluster->focus_index = i;
595 cluster->focus = chain;
596 cluster->error = chain ? chain->error : 0;
597 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
598 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
600 case HAMMER2_PFSTYPE_SOFT_SLAVE:
602 * Directly mounted soft slave always wins. There
603 * should be only one.
605 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
606 if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
607 cluster->focus_index = i;
608 cluster->focus = chain;
609 cluster->error = chain ? chain->error : 0;
610 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
613 case HAMMER2_PFSTYPE_SUPROOT:
615 * spmp (degenerate case)
618 cluster->focus_index = i;
619 cluster->focus = chain;
620 cluster->error = chain ? chain->error : 0;
621 focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
622 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
630 * Focus now set, adjust ddflag. Skip this pass if the focus
631 * is bad or if we are at the PFS root (the bref won't match at
632 * the PFS root, obviously).
634 focus = cluster->focus;
637 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
642 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
648 * Validate the elements that were not marked invalid. They should
651 for (i = 0; i < cluster->nchains; ++i) {
654 chain = cluster->array[i].chain;
660 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
663 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
664 if (chain->bref.type != focus->bref.type ||
665 chain->bref.key != focus->bref.key ||
666 chain->bref.keybits != focus->bref.keybits ||
667 chain->bref.modify_tid != focus->bref.modify_tid ||
668 chain->bytes != focus->bytes ||
669 ddflag != cluster->ddflag) {
670 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
671 if (hammer2_debug & 1)
672 kprintf("cluster_resolve: matching modify_tid failed "
673 "bref test: idx=%d type=%02x/%02x "
674 "key=%016jx/%d-%016jx/%d "
675 "mod=%016jx/%016jx bytes=%u/%u\n",
677 chain->bref.type, focus->bref.type,
678 chain->bref.key, chain->bref.keybits,
679 focus->bref.key, focus->bref.keybits,
680 chain->bref.modify_tid, focus->bref.modify_tid,
681 chain->bytes, focus->bytes);
682 if (hammer2_debug & 0x4000)
683 panic("cluster_resolve");
684 /* flag issue and force resync? */
690 nflags |= HAMMER2_CLUSTER_NOSOFT;
692 nflags |= HAMMER2_CLUSTER_NOHARD;
695 * Set SSYNCED or MSYNCED for slaves and masters respectively if
696 * all available nodes (even if 0 are available) are fully
697 * synchronized. This is used by the synchronization thread to
698 * determine if there is work it could potentially accomplish.
700 if (nslaves == ttlslaves)
701 nflags |= HAMMER2_CLUSTER_SSYNCED;
702 if (nmasters == ttlmasters)
703 nflags |= HAMMER2_CLUSTER_MSYNCED;
706 * Determine if the cluster was successfully locked for the
707 * requested operation and generate an error code. The cluster
708 * will not be locked (or ref'd) if an error is returned.
710 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
711 * to determine if reading or writing is possible. If writing, the
712 * cluster still requires a call to hammer2_cluster_modify() first.
714 atomic_set_int(&cluster->flags, nflags);
715 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
719 * This is used by the XOPS subsystem to calculate the state of
720 * the collection and tell hammer2_xop_collect() what to do with it.
721 * The collection can be in various states of desynchronization, the
722 * caller specifically wants to resolve the passed-in key.
725 * 0 - Quorum agreement, key is valid
727 * ENOENT - Quorum agreement, end of scan
729 * ESRCH - Quorum agreement, key is INVALID (caller should
732 * EIO - Quorum agreement but all elements had errors.
734 * EDEADLK - No quorum agreement possible for key, a repair
735 * may be needed. Caller has to decide what to do,
736 * possibly iterating the key or generating an EIO.
738 * EINPROGRESS - No quorum agreement yet, but agreement is still
739 * possible if caller waits for more responses. Caller
740 * should not iterate key.
742 * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
/*
 * Evaluate quorum state of the collection for one key.
 *
 * cluster - collection being evaluated; focus, focus_index, error, ddflag,
 *           per-element flags and cluster->flags are all rewritten
 * key     - compared against chain->bref.key for each element
 * flags   - HAMMER2_CHECK_NULL is the only bit tested on visible lines
 *
 * Phases visible in this excerpt, in order:
 *
 *  1. Focus reset, nquorum = pfs_nmasters / 2 + 1 (0 when pmp is NULL).
 *  2. First pass: every element has FEMOD cleared and INVALID set, a
 *     chain error is copied to cluster->error while no focus exists or
 *     the focus is that chain, then pfs_types[i] is switched on.
 *  3. Quorum loop over MASTER elements, tracking nmasters, umasters and
 *     nmasters_keymatch; a chain without error whose modify_tid equals
 *     quorum_tid becomes the focus.
 *  4. Early exits when nmasters < nquorum, when HAMMER2_CHECK_NULL is
 *     set, or when the focus is still NULL.
 *  5. Elements matching key and quorum_tid get INVALID cleared (and
 *     FEMOD set for MASTER, SOFT_MASTER and SUPROOT).
 *  6. Remaining valid elements are compared against the focus, then
 *     nflags is published into cluster->flags.
 *
 * NOTE(review): the return statements and the counter increments are
 * not visible in this excerpt; the header comment lists the intended
 * return codes.
 */
745 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
747 hammer2_chain_t *chain;
748 hammer2_chain_t *focus;
750 hammer2_tid_t quorum_tid;
751 hammer2_tid_t last_best_quorum_tid;
756 int nmasters_keymatch;
759 int umasters; /* unknown masters (still in progress) */
/* Start from scratch: no focus until the quorum loop selects one. */
764 cluster->focus = NULL;
770 nmasters_keymatch = 0;
/* A cluster without a pmp is only acceptable when it has no chains. */
778 KKASSERT(pmp != NULL || cluster->nchains == 0);
/* Majority of the configured masters. */
779 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
785 * NOTE: A NULL chain is not necessarily an error, it could be
786 * e.g. a lookup failure or the end of an iteration.
789 for (i = 0; i < cluster->nchains; ++i) {
/* Every element starts out not modifiable and invalid. */
790 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
791 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
793 chain = cluster->array[i].chain;
794 if (chain && chain->error) {
795 if (cluster->focus == NULL || cluster->focus == chain) {
796 /* error will be overridden by valid focus */
797 cluster->error = chain->error;
801 * Must count total masters and slaves whether the
802 * chain is errored or not.
804 switch (cluster->pmp->pfs_types[i]) {
805 case HAMMER2_PFSTYPE_MASTER:
808 case HAMMER2_PFSTYPE_SLAVE:
814 switch (cluster->pmp->pfs_types[i]) {
815 case HAMMER2_PFSTYPE_MASTER:
818 case HAMMER2_PFSTYPE_SLAVE:
821 case HAMMER2_PFSTYPE_SOFT_MASTER:
822 nflags |= HAMMER2_CLUSTER_WRSOFT;
823 nflags |= HAMMER2_CLUSTER_RDSOFT;
826 case HAMMER2_PFSTYPE_SOFT_SLAVE:
827 nflags |= HAMMER2_CLUSTER_RDSOFT;
829 case HAMMER2_PFSTYPE_SUPROOT:
831 * Degenerate cluster representing the super-root
832 * topology on a single device. Fake stuff so
833 * cluster ops work as expected.
835 nflags |= HAMMER2_CLUSTER_WRHARD;
836 nflags |= HAMMER2_CLUSTER_RDHARD;
837 cluster->focus_index = i;
838 cluster->focus = chain;
839 cluster->error = chain ? chain->error : 0;
849 * Resolve nmasters - master nodes fully match
851 * Resolve umasters - master nodes operation still
854 * Resolve nmasters_keymatch - master nodes match the passed-in
855 * key and may or may not match
856 * the quorum-agreed tid.
858 * The quorum-agreed TID is the highest matching TID.
/* The search starts with HAMMER2_TID_MAX as the upper bound. */
860 last_best_quorum_tid = HAMMER2_TID_MAX;
861 quorum_tid = 0; /* fix gcc warning */
863 while (nmasters < nquorum && last_best_quorum_tid != 0) {
867 for (i = 0; i < cluster->nchains; ++i) {
868 /* XXX SOFT smpresent handling */
869 if (cluster->pmp->pfs_types[i] !=
870 HAMMER2_PFSTYPE_MASTER) {
874 chain = cluster->array[i].chain;
877 * Skip elements still in progress. umasters keeps
878 * track of masters that might still be in-progress.
880 if (chain == NULL && (cluster->array[i].flags &
881 HAMMER2_CITEM_NULL) == 0) {
889 if (flags & HAMMER2_CHECK_NULL) {
894 } else if (chain && chain->bref.key == key) {
896 if (quorum_tid < last_best_quorum_tid &&
897 (quorum_tid < chain->bref.modify_tid ||
900 * Better TID located, reset
904 quorum_tid = chain->bref.modify_tid;
906 if (quorum_tid == chain->bref.modify_tid) {
908 * TID matches current collection.
/* Only a chain without an error may become the focus. */
911 if (chain->error == 0) {
912 cluster->focus = chain;
913 cluster->focus_index = i;
918 if (nmasters >= nquorum)
/* No quorum at this TID: it becomes the bound for the next pass. */
920 last_best_quorum_tid = quorum_tid;
924 kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
925 nmasters, nquorum, nmasters_keymatch, umasters);
929 * Early return if we do not have enough masters.
/*
 * NOTE(review): the values returned under these three tests are not
 * visible in this excerpt - confirm against the header comment.
 */
931 if (nmasters < nquorum) {
932 if (nmasters + umasters >= nquorum)
934 if (nmasters_keymatch < nquorum)
940 * Validated end of scan.
942 if (flags & HAMMER2_CHECK_NULL)
946 * If we have a NULL focus at this point the agreeing quorum all
949 if (cluster->focus == NULL)
955 * We have quorum agreement, validate elements, not end of scan.
957 for (i = 0; i < cluster->nchains; ++i) {
958 chain = cluster->array[i].chain;
960 chain->bref.key != key ||
961 chain->bref.modify_tid != quorum_tid) {
965 switch (cluster->pmp->pfs_types[i]) {
966 case HAMMER2_PFSTYPE_MASTER:
967 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
968 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
969 nflags |= HAMMER2_CLUSTER_WRHARD;
970 nflags |= HAMMER2_CLUSTER_RDHARD;
972 case HAMMER2_PFSTYPE_SLAVE:
974 * We must have enough up-to-date masters to reach
975 * a quorum and the slave modify_tid must match the
976 * quorum's modify_tid.
978 * Do not select an errored slave.
980 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
981 nflags |= HAMMER2_CLUSTER_RDHARD;
984 case HAMMER2_PFSTYPE_SOFT_MASTER:
986 * Directly mounted soft master always wins. There
987 * should be only one.
989 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
990 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
992 case HAMMER2_PFSTYPE_SOFT_SLAVE:
994 * Directly mounted soft slave always wins. There
995 * should be only one.
999 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1001 case HAMMER2_PFSTYPE_SUPROOT:
1003 * spmp (degenerate case)
1005 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
1006 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1014 * Focus now set, adjust ddflag. Skip this pass if the focus
1015 * is bad or if we are at the PFS root (the bref won't match at
1016 * the PFS root, obviously).
1018 focus = cluster->focus;
1021 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
1023 cluster->ddflag = 0;
1026 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1032 * Validate the elements that were not marked invalid. They should
1035 for (i = 0; i < cluster->nchains; ++i) {
1038 chain = cluster->array[i].chain;
1044 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1047 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1048 if (chain->bref.type != focus->bref.type ||
1049 chain->bref.key != focus->bref.key ||
1050 chain->bref.keybits != focus->bref.keybits ||
1051 chain->bref.modify_tid != focus->bref.modify_tid ||
1052 chain->bytes != focus->bytes ||
1053 ddflag != cluster->ddflag) {
1054 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1055 if (hammer2_debug & 1)
1056 kprintf("cluster_resolve: matching modify_tid failed "
1057 "bref test: idx=%d type=%02x/%02x "
1058 "key=%016jx/%d-%016jx/%d "
1059 "mod=%016jx/%016jx bytes=%u/%u\n",
1061 chain->bref.type, focus->bref.type,
1062 chain->bref.key, chain->bref.keybits,
1063 focus->bref.key, focus->bref.keybits,
1064 chain->bref.modify_tid, focus->bref.modify_tid,
1065 chain->bytes, focus->bytes);
1066 if (hammer2_debug & 0x4000)
1067 panic("cluster_resolve");
1068 /* flag issue and force resync? */
1074 nflags |= HAMMER2_CLUSTER_NOSOFT;
1075 if (ttlmasters == 0)
1076 nflags |= HAMMER2_CLUSTER_NOHARD;
1079 * Set SSYNCED or MSYNCED for slaves and masters respectively if
1080 * all available nodes (even if 0 are available) are fully
1081 * synchronized. This is used by the synchronization thread to
1082 * determine if there is work it could potentially accomplish.
1084 if (nslaves == ttlslaves)
1085 nflags |= HAMMER2_CLUSTER_SSYNCED;
1086 if (nmasters == ttlmasters)
1087 nflags |= HAMMER2_CLUSTER_MSYNCED;
1090 * Determine if the cluster was successfully locked for the
1091 * requested operation and generate an error code. The cluster
1092 * will not be locked (or ref'd) if an error is returned.
1094 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
1095 * to determine if reading or writing is possible. If writing, the
1096 * cluster still requires a call to hammer2_cluster_modify() first.
1098 atomic_set_int(&cluster->flags, nflags);
1099 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
1105 * This is used by the sync thread to force non-NULL elements of a copy
1106 * of the pmp->iroot cluster to be good which is required to prime the
/*
 * Clear HAMMER2_CITEM_INVALID on every element that has a non-NULL
 * chain.  Elements with a NULL chain keep their flags unchanged.
 */
1110 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
1114 for (i = 0; i < cluster->nchains; ++i) {
1115 if (cluster->array[i].chain)
1116 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1121 * Copy a cluster, returning a ref'd cluster. All underlying chains
1122 * are also ref'd, but not locked. Focus state is also copied.
1124 * Original cluster does not have to be locked but usually is.
1125 * New cluster will not be flagged as locked.
1127 * Callers using this function to initialize a new cluster from an inode
1128 * generally lock and resolve the resulting cluster.
1130 * Callers which use this function to save/restore a cluster structure
1131 * generally retain the focus state and do not re-resolve it. Caller should
1132 * not try to re-resolve internal (cparent) node state during an iteration
1133 * as the individual tracking elements of cparent in an iteration may not
1134 * match even though they are correct.
/*
 * Allocate a zeroed cluster and duplicate pmp, nchains, the per-element
 * chain pointers and flags, the focus and the cluster flags from
 * ocluster.  Each copied chain is referenced with hammer2_chain_ref().
 * The return statement is not visible in this excerpt.
 */
1137 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
1139 hammer2_pfs_t *pmp = ocluster->pmp;
1140 hammer2_cluster_t *ncluster;
1141 hammer2_chain_t *chain;
/* M_ZERO: fields not assigned below start out zeroed. */
1144 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
1145 ncluster->pmp = pmp;
1146 ncluster->nchains = ocluster->nchains;
1149 for (i = 0; i < ocluster->nchains; ++i) {
1150 chain = ocluster->array[i].chain;
1151 ncluster->array[i].chain = chain;
1152 ncluster->array[i].flags = ocluster->array[i].flags;
/*
 * NOTE(review): any NULL test guarding this ref is not visible in this
 * excerpt - confirm.
 */
1154 hammer2_chain_ref(chain);
1156 ncluster->focus_index = ocluster->focus_index;
1157 ncluster->focus = ocluster->focus;
/* The copy never inherits the LOCKED or INODE flag bits. */
1158 ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED |
1159 HAMMER2_CLUSTER_INODE);
1165 * Unlock a cluster. Refcount and focus are maintained.
/*
 * Clear the LOCKED flag on the cluster and unlock its chains.
 *
 * cluster - must be flagged LOCKED and hold at least one ref (asserted)
 * idx     - not referenced on any line visible in this excerpt;
 *           presumably the element index to leave locked - confirm
 */
1168 hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx)
1170 hammer2_chain_t *chain;
/* Log the offending cluster before the assertion below fires. */
1173 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
1174 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1177 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
1178 KKASSERT(cluster->refs > 0);
/* The LOCKED flag is cleared before the chains are unlocked. */
1179 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
1181 for (i = 0; i < cluster->nchains; ++i) {
1184 chain = cluster->array[i].chain;
1186 hammer2_chain_unlock(chain);
/*
 * Convenience wrapper: calls hammer2_cluster_unlock_except() with an
 * index of -1.
 */
1191 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
1193 hammer2_cluster_unlock_except(cluster, -1);
1197 * Set an inode's cluster modified, marking the related chains RW and
1198 * duplicating them if necessary.
1200 * The passed-in chain is a localized copy of the chain previously acquired
1201 * when the inode was locked (and possibly replaced in the meantime), and
1202 * must also be updated. In fact, we update it first and then synchronize
1203 * the inode's cluster cache.
/*
 * Mark the inode and the cluster modified, repoint the inode at the
 * cluster, and return a pointer to the writable inode data.
 *
 * ip      - inode passed to hammer2_inode_modify() and
 *           hammer2_inode_repoint()
 * cluster - cluster passed to hammer2_cluster_modify()
 * flags   - forwarded unchanged to hammer2_cluster_modify()
 *
 * Returns the address of the ipdata member of the data obtained from
 * hammer2_cluster_wdata(cluster).
 */
1205 hammer2_inode_data_t *
1206 hammer2_cluster_modify_ip(hammer2_inode_t *ip,
1207 hammer2_cluster_t *cluster, int flags)
1209 hammer2_inode_modify(ip);
1210 hammer2_cluster_modify(cluster, flags);
/* Called after the modify, with NULL as the second argument. */
1211 hammer2_inode_repoint(ip, NULL, cluster);
1212 return (&hammer2_cluster_wdata(cluster)->ipdata);
1216 * Adjust the cluster's chains to allow modification and adjust the
1217 * focus. Data will be accessible on return.
1219 * If our focused master errors on modify, re-resolve the cluster to
1220 * try to select a different master.
/*
 * Call hammer2_chain_modify() on the chains of the cluster.
 *
 * cluster - cluster whose elements are walked
 * flags   - forwarded unchanged to hammer2_chain_modify()
 */
1223 hammer2_cluster_modify(hammer2_cluster_t *cluster, int flags)
1225 hammer2_chain_t *chain;
1230 for (i = 0; i < cluster->nchains; ++i) {
/*
 * Elements without HAMMER2_CITEM_FEMOD are flagged INVALID.
 * NOTE(review): the statement that skips the rest of the iteration is
 * not visible in this excerpt - confirm.
 */
1231 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1232 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1235 chain = cluster->array[i].chain;
1240 hammer2_chain_modify(chain, flags);
/* An error on the focus chain is copied up to the cluster. */
1241 if (cluster->focus == chain && chain->error) {
1242 cluster->error = chain->error;
/*
 * NOTE(review): the condition guarding this re-resolve is not visible
 * in this excerpt; presumably it runs only after a focus error - confirm.
 */
1247 hammer2_cluster_resolve(cluster);
1251 * Synchronize modifications from the focus to other chains in a cluster.
1252 * Convenient because nominal API users can just modify the contents of the
1253 * focus (at least for non-blockref data).
1255 * Nominal front-end operations only edit non-block-table data in a single
1256 * chain. This code copies such modifications to the other chains in the
1257 * cluster. Blocktable modifications are handled on a chain-by-chain basis
1258 * by both the frontend and the backend and will explode in fireworks if
/*
 * Copy data from the focus chain of a cluster into the other chains.
 *
 * cluster - cluster whose focus is the copy source; the focus must have
 *           HAMMER2_CHAIN_MODIFIED set (asserted).
 *
 * For each candidate chain the byte count and bref type must match the
 * focus (asserted).  What is copied depends on the bref type of the focus:
 *   INODE  without HAMMER2_OPFLAG_DIRECTDATA: only the leading
 *          offsetof(hammer2_inode_data_t, u) bytes of the inode data.
 *   DATA   focus->bytes bytes of media data.
 *   FREEMAP_NODE / FREEMAP_LEAF / FREEMAP / VOLUME: panic.
 *
 * NOTE(review): the embedded listing numbers skip inside this function;
 * the bodies of the two if-tests in the loop, the break statements and a
 * label before the final panic are not visible in this extract.
 */
1262 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
1264 hammer2_chain_t *focus;
1265 hammer2_chain_t *scan;
1266 const hammer2_inode_data_t *ripdata;
1267 hammer2_inode_data_t *wipdata;
1270 focus = cluster->focus;
1271 KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
1273 for (i = 0; i < cluster->nchains; ++i) {
/* Test for elements lacking FEMOD; the controlled statement is not visible. */
1274 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1276 scan = cluster->array[i].chain;
/* Test for an empty slot or the focus itself; controlled statement not visible. */
1277 if (scan == NULL || scan == focus)
/* The destination chain must itself be in the modified state. */
1281 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
1282 KKASSERT(focus->bytes == scan->bytes &&
1283 focus->bref.type == scan->bref.type);
1284 switch(focus->bref.type) {
1285 case HAMMER2_BREF_TYPE_INODE:
1286 ripdata = &focus->data->ipdata;
1287 wipdata = &scan->data->ipdata;
/* Without DIRECTDATA only the part of the inode preceding member u is copied. */
1288 if ((ripdata->meta.op_flags &
1289 HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1290 bcopy(ripdata, wipdata,
1291 offsetof(hammer2_inode_data_t, u));
1294 /* fall through to full copy */
1295 case HAMMER2_BREF_TYPE_DATA:
1296 bcopy(focus->data, scan->data, focus->bytes);
/* Freemap and volume types are rejected outright. */
1298 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1299 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1300 case HAMMER2_BREF_TYPE_FREEMAP:
1301 case HAMMER2_BREF_TYPE_VOLUME:
1302 panic("hammer2_cluster_modsync: illegal node type");
/* NOTE(review): presumably the default label precedes this panic -- confirm. */
1306 panic("hammer2_cluster_modsync: unknown node type");
1313 * Lookup initialization/completion API. Returns a locked, fully resolved
1314 * cluster with one ref.
/*
 * Begin a lookup sequence: copy the parent cluster, lock the copy and
 * resolve it.
 *
 * cparent - cluster to copy with hammer2_cluster_copy().
 * flags   - when HAMMER2_LOOKUP_SHARED is set the copy is locked with
 *           HAMMER2_RESOLVE_ALWAYS | HAMMER2_RESOLVE_SHARED; a second lock
 *           call using HAMMER2_RESOLVE_ALWAYS alone is also visible.
 *
 * NOTE(review): the line between the two lock calls and the return
 * statement are missing from this extract; presumably the second lock call
 * is the else branch and the copy is returned -- confirm.
 */
1317 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
1319 hammer2_cluster_t *cluster;
1321 cluster = hammer2_cluster_copy(cparent);
1322 if (flags & HAMMER2_LOOKUP_SHARED) {
1323 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
1324 HAMMER2_RESOLVE_SHARED);
1326 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1328 hammer2_cluster_resolve(cluster);
/*
 * Finish a lookup sequence: unlock the cluster and then drop it.
 *
 * cparent - cluster passed to hammer2_cluster_unlock() and
 *           hammer2_cluster_drop().
 *
 * NOTE(review): a line is missing between the signature and the unlock
 * call in this extract; any guard on cparent is not visible.
 */
1334 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
1337 hammer2_cluster_unlock(cparent);
1338 hammer2_cluster_drop(cparent);
1343 * Locate first match or overlap under parent, return a new, locked, resolved
1344 * cluster with one ref.
1346 * Must never be called with HAMMER2_LOOKUP_MATCHIND.
/*
 * Look up the first element under a parent cluster, building a new
 * cluster from the per-chain lookup results.
 *
 * cparent   - parent cluster; it is unlocked up front and each of its
 *             chains is relocked inside the loop.
 * key_nextp - in: initial value for the accumulated next key;
 *             out: the smallest key_next seen across the per-chain lookups.
 * key_beg, key_end - key range, forwarded to hammer2_chain_lookup().
 * flags     - HAMMER2_LOOKUP_* flags.  MATCHIND must not be set (asserted).
 *             SHARED selects HAMMER2_RESOLVE_SHARED for relocking, NOLOCK
 *             suppresses the LOCKED flag on the new cluster, ALLNODES
 *             permits a result without a focus.
 *
 * NOTE(review): the embedded listing numbers skip inside this function;
 * several declarations, the maintenance of null_count, some arguments of
 * the hammer2_chain_lookup() call and the return statement are not
 * visible in this extract.
 */
1349 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
1350 hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1353 hammer2_cluster_t *cluster;
1354 hammer2_chain_t *chain;
1355 hammer2_key_t key_accum;
1356 hammer2_key_t key_next;
1361 KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1363 pmp = cparent->pmp; /* can be NULL */
1364 key_accum = *key_nextp;
/* Choose the resolve flags used when relocking chains. */
1366 if (flags & HAMMER2_LOOKUP_SHARED)
1367 rflags = HAMMER2_RESOLVE_SHARED;
/* Allocate a zero-filled cluster (M_WAITOK | M_ZERO) to hold the results. */
1371 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
1372 cluster->pmp = pmp; /* can be NULL */
1374 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1375 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1378 * Iterating earlier cluster elements with later elements still
1379 * locked is a problem, so we have to unlock the parent and then
/* The LOCKED flag is set again right after the unlock call. */
1382 hammer2_cluster_unlock(cparent);
1383 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1386 * Pass-1, issue lookups.
1388 for (i = 0; i < cparent->nchains; ++i) {
/* The new element starts with the flags of the parent element. */
1389 cluster->array[i].flags = cparent->array[i].flags;
1390 key_next = *key_nextp;
1393 * Always relock the parent as we go.
1395 if (cparent->array[i].chain) {
1396 hammer2_chain_lock(cparent->array[i].chain, rflags);
1400 * Nothing to base the lookup, or parent was not synchronized.
1402 if (cparent->array[i].chain == NULL ||
1403 (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
1408 chain = hammer2_chain_lookup(&cparent->array[i].chain,
1411 &cparent->array[i].cache_index,
1413 cluster->array[i].chain = chain;
1414 if (chain == NULL) {
/* Track the smallest next key reported by any chain. */
1417 if (key_accum > key_next)
1418 key_accum = key_next;
/* Publish the element count and the accumulated next key. */
1424 cluster->nchains = i;
1425 *key_nextp = key_accum;
1428 * The cluster must be resolved, out of sync elements may be present.
1430 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1432 if (null_count != i)
1433 hammer2_cluster_resolve(cluster);
/* No usable result: unlock (unless NOLOCK) and drop the new cluster. */
1434 if (null_count == i ||
1435 (cluster->focus == NULL &&
1436 (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1437 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1438 hammer2_cluster_unlock(cluster);
1439 hammer2_cluster_drop(cluster);
1447 * Locate next match or overlap under parent, replace the passed-in cluster.
1448 * The returned cluster is a new, locked, resolved cluster with one ref.
1450 * Must never be called with HAMMER2_LOOKUP_MATCHIND.
/*
 * Advance an iteration under a parent cluster by calling
 * hammer2_chain_next() for each element and storing the result back into
 * the passed-in cluster.
 *
 * cparent   - parent cluster; unlocked up front, its chains are relocked
 *             inside the loop.  Its focus is re-established for the index
 *             saved from cparent->focus_index on entry.
 * cluster   - iterator cluster; its focus, focus_index and ddflag are
 *             cleared and its array[] chains are replaced.
 * key_nextp - in: initial value for the accumulated next key;
 *             out: the smallest key_next seen across the elements.
 * key_beg, key_end - key range, forwarded to hammer2_chain_next().
 * flags     - HAMMER2_LOOKUP_* flags.  MATCHIND must not be set (asserted).
 *
 * NOTE(review): the embedded listing numbers skip inside this function;
 * several declarations, the maintenance of null_count, one argument of the
 * hammer2_chain_next() call and the return statement are not visible in
 * this extract.
 */
1453 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1454 hammer2_key_t *key_nextp,
1455 hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1457 hammer2_chain_t *ochain;
1458 hammer2_chain_t *nchain;
1459 hammer2_key_t key_accum;
1460 hammer2_key_t key_next;
1467 KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1469 key_accum = *key_nextp;
1471 parent_index = cparent->focus_index; /* save prior focus */
/* NOTE(review): cluster_index is assigned here; no later use is visible. */
1472 cluster_index = cluster->focus_index;
1473 if (flags & HAMMER2_LOOKUP_SHARED)
1474 rflags = HAMMER2_RESOLVE_SHARED;
1478 cluster->focus = NULL; /* XXX needed any more? */
1479 /*cparent->focus = NULL;*/
1480 cluster->focus_index = 0; /* XXX needed any more? */
1481 /*cparent->focus_index = 0;*/
1483 cluster->ddflag = 0;
1486 * The parent is always locked on entry, the iterator may be locked
1487 * depending on flags.
1489 * We must temporarily unlock the passed-in clusters to avoid a
1490 * deadlock between elements of the cluster with other threads.
1491 * We will fixup the lock in the loop.
1493 * Note that this will clear the focus.
1495 * Reflag the clusters as locked, because we will relock them
1498 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1499 hammer2_cluster_unlock(cluster);
1500 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1502 hammer2_cluster_unlock(cparent);
1503 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1505 for (i = 0; i < cparent->nchains; ++i) {
1506 key_next = *key_nextp;
1507 ochain = cluster->array[i].chain;
1510 * Always relock the parent as we go.
1512 if (cparent->array[i].chain)
1513 hammer2_chain_lock(cparent->array[i].chain, rflags);
1516 * Nothing to iterate from. These cases can occur under
1517 * normal operations. For example, during synchronization
1518 * a slave might reach the end of its scan while records
1519 * are still left on the master(s).
1521 if (ochain == NULL) {
/* Parent slot empty or either element invalid: release ochain, clear the slot. */
1525 if (cparent->array[i].chain == NULL ||
1526 (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1527 (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
1528 /* ochain has not yet been relocked */
1529 hammer2_chain_drop(ochain);
1530 cluster->array[i].chain = NULL;
1536 * Relock the child if necessary. Parent and child will then
1537 * be locked as expected by hammer2_chain_next() and flags.
1539 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1540 hammer2_chain_lock(ochain, rflags);
1541 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1542 &key_next, key_beg, key_end,
1543 &cparent->array[i].cache_index,
1545 /* ochain now invalid but can still be used for focus check */
/* Restore the parent focus for the index that was the focus on entry. */
1546 if (parent_index == i) {
1547 cparent->focus_index = i;
1548 cparent->focus = cparent->array[i].chain;
1551 cluster->array[i].chain = nchain;
1552 if (nchain == NULL) {
/* Track the smallest next key reported by any element. */
1555 if (key_accum > key_next)
1556 key_accum = key_next;
/* Publish the element count and the accumulated next key. */
1562 cluster->nchains = i;
1563 *key_nextp = key_accum;
1566 * The cluster must be resolved, out of sync elements may be present.
1568 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1570 if (null_count != i)
1571 hammer2_cluster_resolve(cluster);
/* No usable result: unlock (unless NOLOCK) and drop the cluster. */
1572 if (null_count == i ||
1573 (cluster->focus == NULL &&
1574 (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1575 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1576 hammer2_cluster_unlock(cluster);
1577 hammer2_cluster_drop(cluster);
1584 * Advance just one chain in the cluster and recalculate the invalid bit.
1585 * The cluster index is allowed to be flagged invalid on input and is
1586 * recalculated on return.
1588 * (used during synchronization to advance past a chain being deleted).
1590 * The chain being advanced must not be the focus and the clusters in
1591 * question must have already passed normal cluster_lookup/cluster_next
1594 * The cluster always remains intact on return, so void function.
/*
 * Advance a single element (index i) of a cluster with
 * hammer2_chain_next() and recompute its HAMMER2_CITEM_INVALID flag by
 * comparing the new chain against the cluster focus.
 *
 * cparent   - parent cluster; array[i].chain and cache_index are passed by
 *             address to hammer2_chain_next(), and the parent focus is
 *             refreshed when i is its focus_index.
 * cluster   - cluster whose array[i] is replaced.  The chain being
 *             advanced must not be the focus (asserted).
 * key_nextp - in: initial key_accum and key_next; out: the smaller of the
 *             incoming value and the key_next produced by the advance.
 * key_beg, key_end - key range, forwarded to hammer2_chain_next().
 *
 * The element is flagged invalid when the bref type, key, keybits,
 * modify_tid, byte count or the inode-derived ddflag of the new chain
 * differs from the focus.
 *
 * NOTE(review): the embedded listing numbers skip inside this function;
 * the end of the parameter list (where the index i is presumably
 * declared), some declarations, the final argument of the
 * hammer2_chain_next() call and the statements controlled by the
 * nchain == focus test are not visible in this extract.
 */
1597 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1598 hammer2_cluster_t *cluster,
1599 hammer2_key_t *key_nextp,
1600 hammer2_key_t key_beg,
1601 hammer2_key_t key_end,
1604 hammer2_chain_t *ochain;
1605 hammer2_chain_t *nchain;
1606 hammer2_chain_t *focus;
1607 hammer2_key_t key_accum;
1608 hammer2_key_t key_next;
1611 key_accum = *key_nextp;
1612 key_next = *key_nextp;
1613 ochain = cluster->array[i].chain;
1616 KKASSERT(ochain != cluster->focus);
1618 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1619 &key_next, key_beg, key_end,
1620 &cparent->array[i].cache_index,
1622 /* ochain now invalid */
/* Keep the parent focus pointer in step with its array slot. */
1623 if (cparent->focus_index == i)
1624 cparent->focus = cparent->array[i].chain;
1627 * Install nchain. Note that nchain can be NULL, and can also
1628 * be in an unlocked state depending on flags.
1630 cluster->array[i].chain = nchain;
1631 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1633 if (key_accum > key_next)
1634 key_accum = key_next;
1636 focus = cluster->focus;
1642 if (nchain == focus) /* ASSERTED NOT TRUE */
/* ddflag is 1 when the new chain is an inode, 0 otherwise. */
1645 ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1646 if (nchain->bref.type != focus->bref.type ||
1647 nchain->bref.key != focus->bref.key ||
1648 nchain->bref.keybits != focus->bref.keybits ||
1649 nchain->bref.modify_tid != focus->bref.modify_tid ||
1650 nchain->bytes != focus->bytes ||
1651 ddflag != cluster->ddflag) {
1652 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1656 *key_nextp = key_accum;
1659 * For now don't re-resolve cluster->flags.
1661 hammer2_cluster_resolve(cluster);
1666 * Create a new cluster using the specified key
/*
 * Create chains under each element of a parent cluster and collect them
 * in a cluster.
 *
 * pmp      - stored in a newly allocated cluster and forwarded to
 *            hammer2_chain_create(); may be NULL per the inline note.
 * cparent  - parent cluster; array[i].chain is passed by address to
 *            hammer2_chain_create() and the parent focus is refreshed
 *            when i is its focus_index.
 * clusterp - in: existing cluster to fill, or NULL to have one allocated
 *            with kmalloc(); out: the resulting cluster.
 * key, keybits, type, bytes, flags - creation parameters; type, bytes and
 *            flags are visibly forwarded to hammer2_chain_create().
 *
 * A failure from hammer2_chain_create() trips KKASSERT(error == 0).
 * The cluster is resolved with hammer2_cluster_resolve() at the end.
 *
 * NOTE(review): the embedded listing numbers skip inside this function;
 * part of the kmalloc() and hammer2_chain_create() argument lists, the
 * skip statements inside the loop, the else structure around the flag
 * assignments and the return statement are not visible in this extract.
 */
1669 hammer2_cluster_create(hammer2_pfs_t *pmp, hammer2_cluster_t *cparent,
1670 hammer2_cluster_t **clusterp,
1671 hammer2_key_t key, int keybits,
1672 int type, size_t bytes, int flags)
1674 hammer2_cluster_t *cluster;
1678 if ((cluster = *clusterp) == NULL) {
1679 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1681 cluster->pmp = pmp; /* can be NULL */
1683 cluster->flags = HAMMER2_CLUSTER_LOCKED;
/* The focus starts out unset; it is chosen inside the loop below. */
1685 cluster->focus_index = 0;
1686 cluster->focus = NULL;
1689 * NOTE: cluster->array[] entries can initially be NULL. If
1690 * *clusterp is supplied, skip NULL entries, otherwise
1691 * create new chains.
1693 for (i = 0; i < cparent->nchains; ++i) {
/* A parent element without FEMOD marks the new element invalid. */
1694 if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1695 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1699 if ((cluster->array[i].flags &
1700 HAMMER2_CITEM_FEMOD) == 0) {
1701 cluster->array[i].flags |=
1702 HAMMER2_CITEM_INVALID;
1705 if (cluster->array[i].chain == NULL)
1708 error = hammer2_chain_create(&cparent->array[i].chain,
1709 &cluster->array[i].chain, pmp,
1711 type, bytes, flags);
1712 if (cparent->focus_index == i)
1713 cparent->focus = cparent->array[i].chain;
1714 KKASSERT(error == 0);
/* First created chain becomes the focus ... */
1715 if (cluster->focus == NULL) {
1716 cluster->focus_index = i;
1717 cluster->focus = cluster->array[i].chain;
/* ... and the element matching the parent focus takes over the focus. */
1719 if (cparent->focus == cparent->array[i].chain) {
1720 cluster->focus_index = i;
1721 cluster->focus = cluster->array[i].chain;
1724 cluster->nchains = i;
1725 *clusterp = cluster;
1726 hammer2_cluster_resolve(cluster);
1732 * Mark a cluster deleted
/*
 * Delete the chains of a cluster from their parent chains.
 *
 * cparent - parent cluster; a NULL value is reported with kprintf().
 * cluster - cluster whose array[] chains are passed to
 *           hammer2_chain_delete(); elements lacking HAMMER2_CITEM_FEMOD
 *           are flagged HAMMER2_CITEM_INVALID.
 * flags   - forwarded unchanged to hammer2_chain_delete().
 *
 * NOTE(review): the embedded listing numbers skip inside this function;
 * what follows the NULL report, the skip statements in the loop, any
 * check on chain, and the line between the mismatch report and the delete
 * call are not visible, so whether the delete is skipped on a parent
 * mismatch cannot be determined from this extract.
 */
1735 hammer2_cluster_delete(hammer2_cluster_t *cparent,
1736 hammer2_cluster_t *cluster, int flags)
1738 hammer2_chain_t *chain;
1739 hammer2_chain_t *parent;
1742 if (cparent == NULL) {
1743 kprintf("cparent is NULL\n");
1747 for (i = 0; i < cluster->nchains; ++i) {
1748 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1749 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1752 parent = cparent->array[i].chain;
1753 chain = cluster->array[i].chain;
/* Report a chain whose recorded parent differs from the parent element. */
1756 if (chain->parent != parent) {
1757 kprintf("hammer2_cluster_delete: parent "
1758 "mismatch chain=%p parent=%p against=%p\n",
1759 chain, chain->parent, parent);
1761 hammer2_chain_delete(parent, chain, flags);
1767 * Create a snapshot of the specified {parent, ochain} with the specified
1768 * label. The originating hammer2_inode must be exclusively locked for
1771 * The ioctl code has already synced the filesystem.
/*
 * Create a snapshot inode named pmp->name under the super-root and copy
 * the blockset of the source inode into it.
 *
 * ocluster - source cluster; its read data supplies the inode contents
 *            and its focus supplies the hmp.  Exactly one chain is
 *            required (asserted).
 * pmp      - ioctl structure; only pmp->name is visibly used.
 *
 * NOTE(review): the embedded listing numbers skip inside this function;
 * several declarations, part of the hammer2_inode_create() argument list,
 * any test of its result, the control or preprocessor lines around the
 * pfs_clid assignments and the return statement are not visible in this
 * extract.
 */
1774 hammer2_cluster_snapshot(hammer2_cluster_t *ocluster,
1775 hammer2_ioc_pfs_t *pmp)
1778 const hammer2_inode_data_t *ripdata;
1779 hammer2_inode_data_t *wipdata;
1780 hammer2_chain_t *nchain;
1781 hammer2_inode_t *nip;
1790 kprintf("snapshot %s\n", pmp->name);
/* Directory hash of the snapshot name. */
1792 name_len = strlen(pmp->name);
1793 lhc = hammer2_dirhash(pmp->name, name_len);
1798 ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
/* Remember the cluster id of the source inode. */
1800 opfs_clid = ripdata->meta.pfs_clid;
1802 hmp = ocluster->focus->hmp; /* XXX find synchronized local disk */
1805 * Create the snapshot directory under the super-root
1807 * Set PFS type, generate a unique filesystem id, and generate
1808 * a cluster id. Use the same clid when snapshotting a PFS root,
1809 * which theoretically allows the snapshot to be used as part of
1810 * the same cluster (perhaps as a cache).
1812 * Copy the (flushed) blockref array. Theoretically we could use
1813 * chain_duplicate() but it becomes difficult to disentangle
1814 * the shared core so for now just brute-force it.
1819 nip = hammer2_inode_create(hmp->spmp->iroot, &vat, proc0.p_ucred,
1820 pmp->name, name_len,
1822 HAMMER2_INSERT_PFSROOT, &error);
/* Put both the new inode and its chain into the modified state. */
1825 hammer2_inode_modify(nip);
1826 nchain = hammer2_inode_chain(nip, 0, HAMMER2_RESOLVE_ALWAYS);
1827 hammer2_chain_modify(nchain, 0);
1828 wipdata = &nchain->data->ipdata;
/* Mark the new inode as a master PFS root of subtype snapshot. */
1830 nip->meta.pfs_type = HAMMER2_PFSTYPE_MASTER;
1831 nip->meta.pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1832 nip->meta.op_flags |= HAMMER2_OPFLAG_PFSROOT;
1833 kern_uuidgen(&nip->meta.pfs_fsid, 1);
1836 * Give the snapshot its own private cluster id. As a
1837 * snapshot no further synchronization with the original
1838 * cluster will be done.
/*
 * NOTE(review): three assignments to pfs_clid are visible below, but the
 * lines between them are missing, so which of them take effect cannot be
 * determined from this extract.
 */
1841 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1842 nip->meta.pfs_clid = opfs_clid;
1844 kern_uuidgen(&nip->meta.pfs_clid, 1);
1846 kern_uuidgen(&nip->meta.pfs_clid, 1);
1847 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1849 /* XXX hack blockset copy */
1850 /* XXX doesn't work with real cluster */
1851 KKASSERT(ocluster->nchains == 1);
/* Write the in-memory meta data and the source blockset to the new chain. */
1852 wipdata->meta = nip->meta;
1853 wipdata->u.blockset = ripdata->u.blockset;
/* Flush the new chain, then release the chain and the inode. */
1854 hammer2_flush(nchain, 1);
1855 hammer2_chain_unlock(nchain);
1856 hammer2_chain_drop(nchain);
1857 hammer2_inode_unlock(nip, NULL);
1863 * Return locked parent cluster given a locked child. The child remains
1864 * locked on return. The new parent's focus follows the child's focus
1865 * and the parent is always resolved.
1867 * We must temporarily unlock the passed-in cluster to avoid a deadlock
1868 * between elements of the cluster.
1870 * We must not try to hammer2_cluster_resolve() cparent. The individual
1871 * parent chains for the nodes are the correct parents for the cluster but
1872 * do not necessarily match, so resolve would likely implode.
/*
 * Build a cluster made of the parent chains of each chain in the given
 * cluster.
 *
 * cluster - child cluster; it is copied, unlocked for the duration of the
 *           walk and locked again with HAMMER2_RESOLVE_ALWAYS at the end.
 *
 * For each element the parent chain is referenced and locked, then the
 * parent pointer is tested again; otherwise the parent is unlocked and
 * dropped and the loop re-reads chain->parent.  The parent chain replaces
 * the child chain in the copy and the child reference is dropped.  The
 * copy is flagged HAMMER2_CLUSTER_LOCKED and is deliberately not
 * resolved (the resolve call is commented out).
 *
 * NOTE(review): the embedded listing numbers skip inside this function;
 * the declaration of i, any check on chain, the statement controlled by
 * the re-test of chain->parent (presumably a break) and the return
 * statement are not visible in this extract.
 */
1875 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1877 hammer2_cluster_t *cparent;
1880 cparent = hammer2_cluster_copy(cluster);
1881 hammer2_cluster_unlock(cluster);
1883 for (i = 0; i < cparent->nchains; ++i) {
1884 hammer2_chain_t *chain;
1885 hammer2_chain_t *rchain;
1888 * Calculate parent for each element. Old chain has an extra
1889 * ref for cparent but the lock remains with cluster.
1891 chain = cparent->array[i].chain;
/* Reference and lock the parent, then re-check that it is still the parent. */
1894 while ((rchain = chain->parent) != NULL) {
1895 hammer2_chain_ref(rchain);
1896 hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1897 if (chain->parent == rchain)
1899 hammer2_chain_unlock(rchain);
1900 hammer2_chain_drop(rchain);
/* Install the parent chain in the copy and release the child reference. */
1902 cparent->array[i].chain = rchain;
1903 hammer2_chain_drop(chain);
1905 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1906 /* hammer2_cluster_resolve(cparent); */
1907 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1912 /************************************************************************
1914 ************************************************************************
1917 * WARNING! blockref[] array data is not universal. These functions should
1918 * only be used to access universal data.
1920 * NOTE! The rdata call will wait for at least one of the chain I/Os to
1921 * complete if necessary. The I/O's should have already been
1922 * initiated by the cluster_lock/chain_lock operation.
1924 * The cluster must already be in a modified state before wdata
1925 * is called. The data will already be available for this case.
/*
 * Return a read-only pointer to the media data of the cluster focus.
 *
 * cluster - cluster whose focus must be non-NULL (asserted).
 *
 * Returns cluster->focus->data as a const pointer.
 */
1927 const hammer2_media_data_t *
1928 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1930 KKASSERT(cluster->focus != NULL);
1931 return(cluster->focus->data);
/*
 * Return a read-only pointer to the media data of the cluster focus and
 * report its size.
 *
 * cluster - cluster whose focus must be non-NULL (asserted).
 * bytesp  - out: receives cluster->focus->bytes; must be a valid pointer
 *           because it is written unconditionally.
 *
 * Returns cluster->focus->data as a const pointer.
 */
1934 const hammer2_media_data_t *
1935 hammer2_cluster_rdata_bytes(hammer2_cluster_t *cluster, size_t *bytesp)
1937 KKASSERT(cluster->focus != NULL);
1938 *bytesp = cluster->focus->bytes;
1939 return(cluster->focus->data);
/*
 * Return a writable pointer to the media data of the cluster focus.
 *
 * cluster - cluster whose focus must be non-NULL and for which
 *           hammer2_cluster_modified() must be true (both asserted).
 *
 * Returns cluster->focus->data.
 */
1942 hammer2_media_data_t *
1943 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1945 KKASSERT(cluster->focus != NULL);
1946 KKASSERT(hammer2_cluster_modified(cluster));
1947 return(cluster->focus->data);