2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * The cluster module collects multiple chains representing the same
36 * information from different nodes into a single entity. It allows direct
37 * access to media data as long as it is not blockref array data (which
38 * will obviously have to be different at each node).
40 * This module also handles I/O dispatch, status rollup, and various
41 * mastership arrangements including quorum operations. It effectively
42 * presents one topology to the vnops layer.
44 * Many of the API calls mimic chain API calls but operate on clusters
45 * instead of chains. Please see hammer2_chain.c for more complete code
46 * documentation of the API functions.
48 * WARNING! This module is *extremely* complex. It must issue asynchronous
49 * locks and I/O, do quorum and/or master-slave processing, and
50 * it must operate properly even if some nodes are broken (which
51 * can also mean indefinite locks).
55 * Cluster operations can be broken down into three pieces:
57 * (1) Chain locking and data retrieval.
58 * hammer2_cluster_lock()
59 * hammer2_cluster_parent()
61 * - Most complex functions, quorum management on transaction ids.
63 * - Locking and data accesses must be internally asynchronous.
65 * - Validate and manage cache coherency primitives (cache state
66 * is stored in chain topologies but must be validated by these
69 * (2) Lookups and Scans
70 * hammer2_cluster_lookup()
71 * hammer2_cluster_next()
73 * - Depend on locking & data retrieval functions, but still complex.
75 * - Must do quorum management on transaction ids.
77 * - Lookup and Iteration ops Must be internally asynchronous.
79 * (3) Modifying Operations
80 * hammer2_cluster_create()
81 * hammer2_cluster_rename()
82 * hammer2_cluster_delete()
83 * hammer2_cluster_modify()
84 * hammer2_cluster_modsync()
86 * - Can usually punt on failures, operation continues unless quorum
87 * is lost. If quorum is lost, must wait for resynchronization
88 * (depending on the management mode).
90 * - Must disconnect node on failures (also not flush), remount, and
93 * - Network links (via kdmsg) are relatively easy to issue as the
94 * complex underworkings of hammer2_chain.c don't have to be messed
95 * with (the protocol is at a higher level than block-level).
97 * - Multiple local disk nodes (i.e. block devices) are another matter.
98 * Chain operations have to be dispatched to per-node threads (xN)
99 * because we can't asynchronize potentially very complex chain
100 * operations in hammer2_chain.c (it would be a huge mess).
102 * (these threads are also used to terminate incoming kdmsg ops from
105 * - Single-node filesystems do not use threads and will simply call
106 * hammer2_chain.c functions directly. This short-cut is handled
107 * at the base of each cluster function.
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
119 * Returns non-zero if any chain in the cluster needs to be resized.
120 * Errored elements are not used in the calculation.
/*
 * hammer2_cluster_need_resize() - visible logic walks the cluster's
 * chains, skips elements not flagged FEMOD (front-end modifiable), and
 * compares each chain's storage size against the requested 'bytes'.
 *
 * NOTE(review): this extract is non-contiguous (original line numbers
 * skip); braces, the index declaration and return statements are not
 * visible here, so only the visible logic is documented.
 */
123 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
125 hammer2_chain_t *chain;
128 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
129 for (i = 0; i < cluster->nchains; ++i) {
130 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
132 chain = cluster->array[i].chain;
137 if (chain->bytes != bytes)
144 * Returns the bref type of the cluster's focus.
146 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
147 * The cluster must be locked.
/*
 * hammer2_cluster_type() - return the bref type of the cluster's focus
 * when the cluster is not errored (focus is asserted non-NULL).
 *
 * NOTE(review): non-contiguous extract; the errored-cluster return path
 * (documented above as HAMMER2_BREF_TYPE_EMPTY) is not visible here.
 */
150 hammer2_cluster_type(hammer2_cluster_t *cluster)
152 if (cluster->error == 0) {
153 KKASSERT(cluster->focus != NULL);
154 return(cluster->focus->bref.type);
160 * Returns non-zero if the cluster's focus is flagged as being modified.
162 * If the cluster is errored, returns 0.
/*
 * hammer2_cluster_modified() - test the focus chain's MODIFIED flag
 * when the cluster is not errored.
 *
 * NOTE(review): non-contiguous extract; the errored-cluster return path
 * (documented above as returning 0) is not visible here.
 */
165 hammer2_cluster_modified(hammer2_cluster_t *cluster)
167 if (cluster->error == 0) {
168 KKASSERT(cluster->focus != NULL);
169 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
175 * Returns the bref of the cluster's focus, sans any data-offset information
176 * (since offset information is per-node and wouldn't be useful).
178 * Callers use this function to access modify_tid, mirror_tid, type,
181 * If the cluster is errored, returns an empty bref.
182 * The cluster must be locked.
/*
 * hammer2_cluster_bref() - copy the focus chain's bref into *bref when
 * the cluster is not errored; otherwise zero the caller's bref.
 *
 * NOTE(review): non-contiguous extract; the else branch structure
 * around the bzero() is not visible here.
 */
185 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
187 if (cluster->error == 0) {
188 KKASSERT(cluster->focus != NULL);
189 *bref = cluster->focus->bref;
192 bzero(bref, sizeof(*bref));
197 * Flag the cluster for flushing recursively up to the root. Despite the
198 * work it does, this is relatively benign. It just makes sure that the
199 * flusher has top-down visibility to this cluster.
201 * Errored chains are not flagged for flushing.
203 * The cluster should probably be locked.
/*
 * hammer2_cluster_setflush() - propagate the setflush request to each
 * FEMOD chain in the cluster via hammer2_chain_setflush().
 *
 * NOTE(review): non-contiguous extract; braces, the index declaration
 * and the skip (continue) for non-FEMOD elements are not visible here.
 */
206 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
208 hammer2_chain_t *chain;
211 for (i = 0; i < cluster->nchains; ++i) {
212 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
214 chain = cluster->array[i].chain;
219 hammer2_chain_setflush(trans, chain);
224 * Set the check mode for the cluster.
225 * Errored elements of the cluster are ignored.
227 * The cluster must be locked and modified.
/*
 * hammer2_cluster_setmethod_check() - for each FEMOD chain, replace the
 * check-algorithm bits in bref.methods with 'check_algo'.  Non-FEMOD
 * elements are flagged INVALID instead.  Each updated chain is asserted
 * to already be MODIFIED (cluster must be locked and modified).
 *
 * NOTE(review): non-contiguous extract; the 'int check_algo' parameter
 * line and loop/brace structure are not visible here.
 */
230 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
231 hammer2_cluster_t *cluster,
234 hammer2_chain_t *chain;
237 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
238 for (i = 0; i < cluster->nchains; ++i) {
239 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
240 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
243 chain = cluster->array[i].chain;
248 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
249 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
250 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
255 * Create a degenerate cluster with one ref from a single locked chain.
256 * The returned cluster will be focused on the chain and inherit its
259 * The chain's lock and reference are transferred to the new cluster, so
260 * the caller should not try to unlock the chain separately.
/*
 * hammer2_cluster_from_chain() - build a degenerate single-chain
 * cluster: element 0 holds the chain (flagged FEMOD), the focus points
 * at it, pmp and error are inherited from the chain, and the cluster
 * starts out LOCKED with WRHARD/RDHARD/MSYNCED/SSYNCED set.
 *
 * NOTE(review): non-contiguous extract; refs initialization and the
 * return statement are not visible here.
 */
265 hammer2_cluster_from_chain(hammer2_chain_t *chain)
267 hammer2_cluster_t *cluster;
269 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
270 cluster->array[0].chain = chain;
271 cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
272 cluster->nchains = 1;
273 cluster->focus = chain;
274 cluster->focus_index = 0;
275 cluster->pmp = chain->pmp;
277 cluster->error = chain->error;
278 cluster->flags = HAMMER2_CLUSTER_LOCKED |
279 HAMMER2_CLUSTER_WRHARD |
280 HAMMER2_CLUSTER_RDHARD |
281 HAMMER2_CLUSTER_MSYNCED |
282 HAMMER2_CLUSTER_SSYNCED;
288 * Add a reference to a cluster and its underlying chains.
290 * We must also ref the underlying chains in order to allow ref/unlock
291 * sequences to later re-lock.
/*
 * hammer2_cluster_ref() - atomically bump the cluster's reference
 * count.
 *
 * NOTE(review): non-contiguous extract.  The comment above says the
 * underlying chains must also be ref'd, but no chain-ref loop is
 * visible in this extract — confirm against the full source.
 */
294 hammer2_cluster_ref(hammer2_cluster_t *cluster)
296 atomic_add_int(&cluster->refs, 1);
300 * Drop the caller's reference to the cluster. When the ref count drops to
301 * zero this function frees the cluster and drops all underlying chains.
303 * In-progress read I/Os are typically detached from the cluster once the
304 * first one returns (the remaining stay attached to the DIOs but are then
305 * ignored and drop naturally).
/*
 * hammer2_cluster_drop() - drop one reference; on the 1->0 transition
 * clear the focus, drop every underlying chain, NULL out the array
 * slots and nchains for safety, then free the cluster structure.
 *
 * NOTE(review): non-contiguous extract; braces, the index declaration
 * and the NULL-chain guard inside the loop are not visible here.
 */
308 hammer2_cluster_drop(hammer2_cluster_t *cluster)
310 hammer2_chain_t *chain;
313 KKASSERT(cluster->refs > 0);
314 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
315 cluster->focus = NULL; /* safety XXX chg to assert */
316 cluster->focus_index = 0;
318 for (i = 0; i < cluster->nchains; ++i) {
319 chain = cluster->array[i].chain;
321 hammer2_chain_drop(chain);
322 cluster->array[i].chain = NULL; /* safety */
325 cluster->nchains = 0; /* safety */
327 kfree(cluster, M_HAMMER2);
328 /* cluster is invalid */
/*
 * hammer2_cluster_wait() - sleep briefly (1 tick, "h2clcw" wmesg) on
 * the cluster's focus chain as a simple polling delay.
 */
333 hammer2_cluster_wait(hammer2_cluster_t *cluster)
335 tsleep(cluster->focus, 0, "h2clcw", 1);
339 * Lock a cluster. Cluster must already be referenced. Focus is maintained.
341 * WARNING! This function expects the caller to handle resolution of the
342 * cluster. We never re-resolve the cluster in this function,
343 * because it might be used to temporarily unlock/relock a cparent
344 * in an iteration or recursion, and the cparent's elements do not
/*
 * hammer2_cluster_lock_except() - mark the cluster LOCKED (panics on a
 * double lock) and lock each underlying chain with 'how'.  Must be
 * called on a cluster copy, never on the inode-embedded template.
 *
 * NOTE(review): non-contiguous extract; the handling of 'idx' (the
 * element to skip) and the loop's NULL-chain guard are not visible
 * here — do not assume how they behave.
 */
348 hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how)
350 hammer2_chain_t *chain;
353 /* cannot be on inode-embedded cluster template, must be on copy */
354 KKASSERT(cluster->refs > 0);
355 KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
356 if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
357 panic("hammer2_cluster_lock: cluster %p already locked!\n",
360 atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
363 * Lock chains and resolve state.
365 for (i = 0; i < cluster->nchains; ++i) {
368 chain = cluster->array[i].chain;
371 hammer2_chain_lock(chain, how);
/*
 * hammer2_cluster_lock() - convenience wrapper: lock every element
 * (idx == -1 means "except none").
 */
376 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
378 hammer2_cluster_lock_except(cluster, -1, how);
382 * Calculate the clustering state for the cluster and set its focus.
383 * This routine must be called with care. For example, it should not
384 * normally be called after relocking a non-leaf cluster because parent
385 * clusters help iterations and each element might be at a slightly different
386 * indirect node (each node's topology is independently indexed).
388 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
389 * operations. Typically this is only set on a quorum of MASTERs or
390 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER
391 * is present, this bit is *not* set on a quorum of MASTERs. The
392 * synchronization code ignores this bit, but all hammer2_cluster_*() calls
393 * that create/modify/delete elements use it.
395 * The chains making up the cluster may be narrowed down based on quorum
396 * acceptability, and if RESOLVE_RDONLY is specified the chains can be
397 * narrowed down to a single chain as long as the entire subtopology is known
398 * to be intact. So, for example, we can narrow a read-only op to a single
399 * fast SLAVE but if we focus a CACHE chain we must still retain at least
400 * a SLAVE to ensure that the subtopology can be accessed.
402 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
403 * to be maintained once the topology is validated as-of the top level of
406 * If a failure occurs the operation must be aborted by higher-level code and
/*
 * hammer2_cluster_resolve() - recompute the cluster's aggregate state:
 * count master/slave elements, find the quorum-agreed modify_tid,
 * select a focus, set per-element FEMOD/INVALID flags, and fold the
 * computed nflags into cluster->flags.
 *
 * NOTE(review): this extract is heavily non-contiguous (original line
 * numbers skip throughout); counter declarations/initialization, many
 * braces, 'break'/'continue' statements and several conditions are not
 * visible.  The comments below describe only the visible passes.
 */
410 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
412 hammer2_chain_t *chain;
413 hammer2_chain_t *focus;
415 hammer2_tid_t quorum_tid;
416 hammer2_tid_t last_best_quorum_tid;
428 cluster->focus = NULL;
441 KKASSERT(pmp != NULL || cluster->nchains == 0);
/* quorum requires a strict majority of configured masters */
442 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
448 * NOTE: A NULL chain is not necessarily an error, it could be
449 * e.g. a lookup failure or the end of an iteration.
/* Pass 1 (visible): tally element types and accumulate soft flags */
452 for (i = 0; i < cluster->nchains; ++i) {
453 chain = cluster->array[i].chain;
454 if (chain && chain->error) {
455 if (cluster->focus == NULL || cluster->focus == chain) {
456 /* error will be overridden by valid focus */
457 cluster->error = chain->error;
461 * Must count total masters and slaves whether the
462 * chain is errored or not.
464 switch (cluster->pmp->pfs_types[i]) {
465 case HAMMER2_PFSTYPE_MASTER:
468 case HAMMER2_PFSTYPE_SLAVE:
474 switch (cluster->pmp->pfs_types[i]) {
475 case HAMMER2_PFSTYPE_MASTER:
478 case HAMMER2_PFSTYPE_SLAVE:
481 case HAMMER2_PFSTYPE_SOFT_MASTER:
482 nflags |= HAMMER2_CLUSTER_WRSOFT;
483 nflags |= HAMMER2_CLUSTER_RDSOFT;
486 case HAMMER2_PFSTYPE_SOFT_SLAVE:
487 nflags |= HAMMER2_CLUSTER_RDSOFT;
489 case HAMMER2_PFSTYPE_SUPROOT:
491 * Degenerate cluster representing the super-root
492 * topology on a single device. Fake stuff so
493 * cluster ops work as expected.
495 nflags |= HAMMER2_CLUSTER_WRHARD;
496 nflags |= HAMMER2_CLUSTER_RDHARD;
497 cluster->focus_index = i;
498 cluster->focus = chain;
499 cluster->error = chain ? chain->error : 0;
509 * Resolve masters. Calculate nmasters for the highest matching
510 * TID, if a quorum cannot be attained try the next lower matching
511 * TID until we exhaust TIDs.
513 * NOTE: A NULL chain is not necessarily an error, it could be
514 * e.g. a lookup failure or the end of an iteration.
/* Pass 2 (visible): search downward through candidate TIDs for quorum */
517 last_best_quorum_tid = HAMMER2_TID_MAX;
518 quorum_tid = 0; /* fix gcc warning */
520 while (nmasters < nquorum && last_best_quorum_tid != 0) {
524 for (i = 0; i < cluster->nchains; ++i) {
525 if (cluster->pmp->pfs_types[i] !=
526 HAMMER2_PFSTYPE_MASTER) {
529 chain = cluster->array[i].chain;
531 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
533 * Invalid as in unsynchronized, cannot be
534 * used to calculate the quorum.
536 } else if (chain == NULL && quorum_tid == 0) {
538 * NULL chain on master matches NULL chains
542 } else if (quorum_tid < last_best_quorum_tid &&
544 (quorum_tid < chain->bref.modify_tid ||
547 * Better TID located, reset nmasters count.
550 quorum_tid = chain->bref.modify_tid;
552 quorum_tid == chain->bref.modify_tid) {
554 * TID matches current collection.
559 if (nmasters >= nquorum)
561 last_best_quorum_tid = quorum_tid;
567 * NOTE: A NULL chain is not necessarily an error, it could be
568 * e.g. a lookup failure or the end of an iteration.
/* Pass 3 (visible): set FEMOD/INVALID per element and choose focus */
571 for (i = 0; i < cluster->nchains; ++i) {
572 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
573 chain = cluster->array[i].chain;
574 if (chain && chain->error) {
575 if (cluster->focus == NULL || cluster->focus == chain) {
576 /* error will be overridden by valid focus */
577 cluster->error = chain->error;
582 switch (cluster->pmp->pfs_types[i]) {
583 case HAMMER2_PFSTYPE_MASTER:
585 * We must have enough up-to-date masters to reach
586 * a quorum and the master modify_tid must match
587 * the quorum's modify_tid.
589 * Do not select an errored or out-of-sync master.
591 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
592 nflags |= HAMMER2_CLUSTER_UNHARD;
593 } else if (nmasters >= nquorum &&
594 (chain == NULL || chain->error == 0) &&
595 ((chain == NULL && quorum_tid == 0) ||
596 (chain != NULL && quorum_tid ==
597 chain->bref.modify_tid))) {
598 nflags |= HAMMER2_CLUSTER_WRHARD;
599 nflags |= HAMMER2_CLUSTER_RDHARD;
601 cluster->array[i].flags |=
604 if (cluster->focus == NULL ||
605 focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
606 focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
607 cluster->focus_index = i;
608 cluster->focus = chain; /* NULL ok */
609 cluster->error = chain ? chain->error :
612 } else if (chain == NULL || chain->error == 0) {
613 nflags |= HAMMER2_CLUSTER_UNHARD;
616 case HAMMER2_PFSTYPE_SLAVE:
618 * We must have enough up-to-date masters to reach
619 * a quorum and the slave modify_tid must match the
620 * quorum's modify_tid.
622 * Do not select an errored slave.
624 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
625 nflags |= HAMMER2_CLUSTER_UNHARD;
626 } else if (nmasters >= nquorum &&
627 (chain == NULL || chain->error == 0) &&
628 ((chain == NULL && quorum_tid == 0) ||
629 (chain && quorum_tid ==
630 chain->bref.modify_tid))) {
632 nflags |= HAMMER2_CLUSTER_RDHARD;
634 /* XXX optimize for RESOLVE_RDONLY */
635 if (cluster->focus == NULL) {
636 focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
637 cluster->focus_index = i;
638 cluster->focus = chain; /* NULL ok */
639 cluster->error = chain ? chain->error :
643 } else if (chain == NULL || chain->error == 0) {
644 nflags |= HAMMER2_CLUSTER_UNSOFT;
647 case HAMMER2_PFSTYPE_SOFT_MASTER:
649 * Directly mounted soft master always wins. There
650 * should be only one.
652 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
653 cluster->focus_index = i;
654 cluster->focus = chain;
655 cluster->error = chain ? chain->error : 0;
656 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
657 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
659 case HAMMER2_PFSTYPE_SOFT_SLAVE:
661 * Directly mounted soft slave always wins. There
662 * should be only one.
664 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
665 if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
666 cluster->focus_index = i;
667 cluster->focus = chain;
668 cluster->error = chain ? chain->error : 0;
669 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
672 case HAMMER2_PFSTYPE_SUPROOT:
674 * spmp (degenerate case)
677 cluster->focus_index = i;
678 cluster->focus = chain;
679 cluster->error = chain ? chain->error : 0;
680 focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
681 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
689 * Focus now set, adjust ddflag. Skip this pass if the focus
690 * is bad or if we are at the PFS root (the bref won't match at
691 * the PFS root, obviously).
693 focus = cluster->focus;
696 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
701 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
707 * Validate the elements that were not marked invalid. They should
/* Pass 4 (visible): invalidate elements whose bref disagrees w/focus */
710 for (i = 0; i < cluster->nchains; ++i) {
713 chain = cluster->array[i].chain;
719 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
722 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
723 if (chain->bref.type != focus->bref.type ||
724 chain->bref.key != focus->bref.key ||
725 chain->bref.keybits != focus->bref.keybits ||
726 chain->bref.modify_tid != focus->bref.modify_tid ||
727 chain->bytes != focus->bytes ||
728 ddflag != cluster->ddflag) {
729 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
730 if (hammer2_debug & 1)
731 kprintf("cluster_resolve: matching modify_tid failed "
732 "bref test: idx=%d type=%02x/%02x "
733 "key=%016jx/%d-%016jx/%d "
734 "mod=%016jx/%016jx bytes=%u/%u\n",
736 chain->bref.type, focus->bref.type,
737 chain->bref.key, chain->bref.keybits,
738 focus->bref.key, focus->bref.keybits,
739 chain->bref.modify_tid, focus->bref.modify_tid,
740 chain->bytes, focus->bytes);
741 if (hammer2_debug & 0x4000)
742 panic("cluster_resolve");
743 /* flag issue and force resync? */
749 nflags |= HAMMER2_CLUSTER_NOSOFT;
751 nflags |= HAMMER2_CLUSTER_NOHARD;
754 * Set SSYNCED or MSYNCED for slaves and masters respectively if
755 * all available nodes (even if 0 are available) are fully
756 * synchronized. This is used by the synchronization thread to
757 * determine if there is work it could potentially accomplish.
759 if (nslaves == ttlslaves)
760 nflags |= HAMMER2_CLUSTER_SSYNCED;
761 if (nmasters == ttlmasters)
762 nflags |= HAMMER2_CLUSTER_MSYNCED;
765 * Determine if the cluster was successfully locked for the
766 * requested operation and generate an error code. The cluster
767 * will not be locked (or ref'd) if an error is returned.
769 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
770 * to determine if reading or writing is possible. If writing, the
771 * cluster still requires a call to hammer2_cluster_modify() first.
/* publish the freshly computed flags, clearing stale Z-flags */
773 atomic_set_int(&cluster->flags, nflags);
774 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
778 * This is used by the XOPS subsystem to calculate the state of
779 * the collection and tell hammer2_xop_collect() what to do with it.
780 * The collection can be in various states of desynchronization, the
781 * caller specifically wants to resolve the passed-in key.
784 * 0 - Quorum agreement, key is valid
786 * ENOENT - Quorum agreement, end of scan
788 * ESRCH - Quorum agreement, key is INVALID (caller should
791 * EIO - Quorum agreement but all elements had errors.
793 * EDEADLK - No quorum agreement possible for key, a repair
794 * may be needed. Caller has to decide what to do,
795 * possibly iterating the key or generating an EIO.
797 * EINPROGRESS - No quorum agreement yet, but agreement is still
798 * possible if caller waits for more responses. Caller
799 * should not iterate key.
801 * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
/*
 * hammer2_cluster_check() - XOPS collection resolution for 'key'.
 * Tallies masters/slaves, finds the quorum-agreed modify_tid among
 * masters whose bref.key matches the passed-in key, sets the focus,
 * validates the remaining elements against the focus, and publishes
 * the computed flags.  Return codes (0/ENOENT/ESRCH/EIO/EDEADLK/
 * EINPROGRESS) are documented in the comment preceding this function.
 *
 * NOTE(review): this extract is heavily non-contiguous; counter
 * declarations/initialization, many braces, 'break'/'continue' and
 * the actual return statements are not visible here.  Only the
 * visible passes are annotated.
 */
804 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
806 hammer2_chain_t *chain;
807 hammer2_chain_t *focus;
809 hammer2_tid_t quorum_tid;
810 hammer2_tid_t last_best_quorum_tid;
815 int nmasters_keymatch;
818 int umasters; /* unknown masters (still in progress) */
823 cluster->focus = NULL;
829 nmasters_keymatch = 0;
837 KKASSERT(pmp != NULL || cluster->nchains == 0);
/* quorum requires a strict majority of configured masters */
838 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
844 * NOTE: A NULL chain is not necessarily an error, it could be
845 * e.g. a lookup failure or the end of an iteration.
/* Pass 1 (visible): reset per-element flags, tally types, soft flags */
848 for (i = 0; i < cluster->nchains; ++i) {
849 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
850 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
852 chain = cluster->array[i].chain;
853 if (chain && chain->error) {
854 if (cluster->focus == NULL || cluster->focus == chain) {
855 /* error will be overridden by valid focus */
856 cluster->error = chain->error;
860 * Must count total masters and slaves whether the
861 * chain is errored or not.
863 switch (cluster->pmp->pfs_types[i]) {
864 case HAMMER2_PFSTYPE_MASTER:
867 case HAMMER2_PFSTYPE_SLAVE:
873 switch (cluster->pmp->pfs_types[i]) {
874 case HAMMER2_PFSTYPE_MASTER:
877 case HAMMER2_PFSTYPE_SLAVE:
880 case HAMMER2_PFSTYPE_SOFT_MASTER:
881 nflags |= HAMMER2_CLUSTER_WRSOFT;
882 nflags |= HAMMER2_CLUSTER_RDSOFT;
885 case HAMMER2_PFSTYPE_SOFT_SLAVE:
886 nflags |= HAMMER2_CLUSTER_RDSOFT;
888 case HAMMER2_PFSTYPE_SUPROOT:
890 * Degenerate cluster representing the super-root
891 * topology on a single device. Fake stuff so
892 * cluster ops work as expected.
894 nflags |= HAMMER2_CLUSTER_WRHARD;
895 nflags |= HAMMER2_CLUSTER_RDHARD;
896 cluster->focus_index = i;
897 cluster->focus = chain;
898 cluster->error = chain ? chain->error : 0;
908 * Resolve nmasters - master nodes fully match
910 * Resolve umasters - master nodes operation still
913 * Resolve nmasters_keymatch - master nodes match the passed-in
914 * key and may or may not match
915 * the quorum-agreed tid.
917 * The quorum-agreed TID is the highest matching TID.
/* Pass 2 (visible): search downward through candidate TIDs for quorum */
919 last_best_quorum_tid = HAMMER2_TID_MAX;
920 quorum_tid = 0; /* fix gcc warning */
922 while (nmasters < nquorum && last_best_quorum_tid != 0) {
926 for (i = 0; i < cluster->nchains; ++i) {
927 /* XXX SOFT smpresent handling */
928 if (cluster->pmp->pfs_types[i] !=
929 HAMMER2_PFSTYPE_MASTER) {
933 chain = cluster->array[i].chain;
936 * Skip elements still in progress. umasters keeps
937 * track of masters that might still be in-progress.
939 if (chain == NULL && (cluster->array[i].flags &
940 HAMMER2_CITEM_NULL) == 0) {
948 if (flags & HAMMER2_CHECK_NULL) {
953 } else if (chain && chain->bref.key == key) {
955 if (quorum_tid < last_best_quorum_tid &&
956 (quorum_tid < chain->bref.modify_tid ||
959 * Better TID located, reset
963 quorum_tid = chain->bref.modify_tid;
965 if (quorum_tid == chain->bref.modify_tid) {
967 * TID matches current collection.
970 if (chain->error == 0) {
971 cluster->focus = chain;
972 cluster->focus_index = i;
977 if (nmasters >= nquorum)
979 last_best_quorum_tid = quorum_tid;
983 kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
984 nmasters, nquorum, nmasters_keymatch, umasters);
988 * Early return if we do not have enough masters.
990 if (nmasters < nquorum) {
991 if (nmasters + umasters >= nquorum)
993 if (nmasters_keymatch < nquorum)
999 * Validated end of scan.
1001 if (flags & HAMMER2_CHECK_NULL)
1005 * If we have a NULL focus at this point the agreeing quorum all
1008 if (cluster->focus == NULL)
1014 * We have quorum agreement, validate elements, not end of scan.
/* Pass 3 (visible): mark agreeing elements valid, set FEMOD/flags */
1016 for (i = 0; i < cluster->nchains; ++i) {
1017 chain = cluster->array[i].chain;
1018 if (chain == NULL ||
1019 chain->bref.key != key ||
1020 chain->bref.modify_tid != quorum_tid) {
1024 switch (cluster->pmp->pfs_types[i]) {
1025 case HAMMER2_PFSTYPE_MASTER:
1026 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
1027 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1028 nflags |= HAMMER2_CLUSTER_WRHARD;
1029 nflags |= HAMMER2_CLUSTER_RDHARD;
1031 case HAMMER2_PFSTYPE_SLAVE:
1033 * We must have enough up-to-date masters to reach
1034 * a quorum and the slave modify_tid must match the
1035 * quorum's modify_tid.
1037 * Do not select an errored slave.
1039 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1040 nflags |= HAMMER2_CLUSTER_RDHARD;
1043 case HAMMER2_PFSTYPE_SOFT_MASTER:
1045 * Directly mounted soft master always wins. There
1046 * should be only one.
1048 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
1049 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1051 case HAMMER2_PFSTYPE_SOFT_SLAVE:
1053 * Directly mounted soft slave always wins. There
1054 * should be only one.
1058 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1060 case HAMMER2_PFSTYPE_SUPROOT:
1062 * spmp (degenerate case)
1064 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
1065 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1073 * Focus now set, adjust ddflag. Skip this pass if the focus
1074 * is bad or if we are at the PFS root (the bref won't match at
1075 * the PFS root, obviously).
1077 focus = cluster->focus;
1080 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
1082 cluster->ddflag = 0;
1085 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1091 * Validate the elements that were not marked invalid. They should
/* Pass 4 (visible): invalidate elements whose bref disagrees w/focus */
1094 for (i = 0; i < cluster->nchains; ++i) {
1097 chain = cluster->array[i].chain;
1103 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1106 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1107 if (chain->bref.type != focus->bref.type ||
1108 chain->bref.key != focus->bref.key ||
1109 chain->bref.keybits != focus->bref.keybits ||
1110 chain->bref.modify_tid != focus->bref.modify_tid ||
1111 chain->bytes != focus->bytes ||
1112 ddflag != cluster->ddflag) {
1113 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1114 if (hammer2_debug & 1)
1115 kprintf("cluster_resolve: matching modify_tid failed "
1116 "bref test: idx=%d type=%02x/%02x "
1117 "key=%016jx/%d-%016jx/%d "
1118 "mod=%016jx/%016jx bytes=%u/%u\n",
1120 chain->bref.type, focus->bref.type,
1121 chain->bref.key, chain->bref.keybits,
1122 focus->bref.key, focus->bref.keybits,
1123 chain->bref.modify_tid, focus->bref.modify_tid,
1124 chain->bytes, focus->bytes);
1125 if (hammer2_debug & 0x4000)
1126 panic("cluster_resolve");
1127 /* flag issue and force resync? */
1133 nflags |= HAMMER2_CLUSTER_NOSOFT;
1134 if (ttlmasters == 0)
1135 nflags |= HAMMER2_CLUSTER_NOHARD;
1138 * Set SSYNCED or MSYNCED for slaves and masters respectively if
1139 * all available nodes (even if 0 are available) are fully
1140 * synchronized. This is used by the synchronization thread to
1141 * determine if there is work it could potentially accomplish.
1143 if (nslaves == ttlslaves)
1144 nflags |= HAMMER2_CLUSTER_SSYNCED;
1145 if (nmasters == ttlmasters)
1146 nflags |= HAMMER2_CLUSTER_MSYNCED;
1149 * Determine if the cluster was successfully locked for the
1150 * requested operation and generate an error code. The cluster
1151 * will not be locked (or ref'd) if an error is returned.
1153 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
1154 * to determine if reading or writing is possible. If writing, the
1155 * cluster still requires a call to hammer2_cluster_modify() first.
/* publish the freshly computed flags, clearing stale Z-flags */
1157 atomic_set_int(&cluster->flags, nflags);
1158 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
1164 * This is used by the sync thread to force non-NULL elements of a copy
1165 * of the pmp->iroot cluster to be good which is required to prime the
/*
 * hammer2_cluster_forcegood() - clear CITEM_INVALID on every non-NULL
 * element, forcing those elements to be treated as good (used by the
 * sync thread to prime a copy of the pmp->iroot cluster).
 */
1169 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
1173 for (i = 0; i < cluster->nchains; ++i) {
1174 if (cluster->array[i].chain)
1175 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1180 * Copy a cluster, returned a ref'd cluster. All underlying chains
1181 * are also ref'd, but not locked. Focus state is also copied.
1183 * Original cluster does not have to be locked but usually is.
1184 * New cluster will not be flagged as locked.
1186 * Callers using this function to initialize a new cluster from an inode
1187 * generally lock and resolve the resulting cluster.
1189 * Callers which use this function to save/restore a cluster structure
1190 * generally retain the focus state and do not re-resolve it. Caller should
1191 * not try to re-resolve internal (cparent) node state during an iteration
1192 * as the individual tracking elements of cparent in an iteration may not
1193 * match even though they are correct.
/*
 * hammer2_cluster_copy() - duplicate 'ocluster' into a freshly
 * allocated cluster: copies pmp, nchains, per-element chain/flags
 * (ref'ing each chain), and the focus state.  LOCKED and INODE flags
 * are stripped from the copy.
 *
 * NOTE(review): non-contiguous extract; refs initialization, the loop's
 * NULL-chain guard and the return statement are not visible here.
 */
1196 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
1198 hammer2_pfs_t *pmp = ocluster->pmp;
1199 hammer2_cluster_t *ncluster;
1200 hammer2_chain_t *chain;
1203 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
1204 ncluster->pmp = pmp;
1205 ncluster->nchains = ocluster->nchains;
1208 for (i = 0; i < ocluster->nchains; ++i) {
1209 chain = ocluster->array[i].chain;
1210 ncluster->array[i].chain = chain;
1211 ncluster->array[i].flags = ocluster->array[i].flags;
1213 hammer2_chain_ref(chain);
1215 ncluster->focus_index = ocluster->focus_index;
1216 ncluster->focus = ocluster->focus;
1217 ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED |
1218 HAMMER2_CLUSTER_INODE);
1224 * Unlock a cluster. Refcount and focus is maintained.
/*
 * hammer2_cluster_unlock_except() - clear the cluster's LOCKED flag
 * (warning via kprintf if it wasn't set) and unlock each underlying
 * chain.  Refcount and focus are preserved.
 *
 * NOTE(review): non-contiguous extract; the handling of 'idx' (element
 * to skip) and the loop's NULL-chain guard are not visible here.
 */
1227 hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx)
1229 hammer2_chain_t *chain;
1232 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
1233 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1236 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
1237 KKASSERT(cluster->refs > 0);
1238 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
1240 for (i = 0; i < cluster->nchains; ++i) {
1243 chain = cluster->array[i].chain;
1245 hammer2_chain_unlock(chain);
/*
 * hammer2_cluster_unlock() - convenience wrapper: unlock every element
 * (idx == -1 means "except none").
 */
1250 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
1252 hammer2_cluster_unlock_except(cluster, -1);
1256 * Resize the cluster's physical storage allocation in-place. This may
1257 * replace the cluster's chains.
/*
 * hammer2_cluster_resize() - resize storage for each FEMOD element by
 * calling hammer2_chain_resize() with the matching parent-cluster
 * chain.  Non-FEMOD elements are flagged INVALID.  Parent and child
 * clusters must share the same pmp and have the same element count.
 *
 * NOTE(review): non-contiguous extract; the tail of the
 * hammer2_chain_resize() argument list (nradix/flags) and the loop's
 * brace structure are not visible here.
 */
1260 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
1261 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1262 int nradix, int flags)
1264 hammer2_chain_t *chain;
1267 KKASSERT(cparent->pmp == cluster->pmp); /* can be NULL */
1268 KKASSERT(cparent->nchains == cluster->nchains);
1270 for (i = 0; i < cluster->nchains; ++i) {
1271 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1272 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1275 chain = cluster->array[i].chain;
1277 KKASSERT(cparent->array[i].chain);
1278 hammer2_chain_resize(trans, ip,
1279 cparent->array[i].chain, chain,
1286 * Set an inode's cluster modified, marking the related chains RW and
1287 * duplicating them if necessary.
1289 * The passed-in chain is a localized copy of the chain previously acquired
1290 * when the inode was locked (and possibly replaced in the mean time), and
1291 * must also be updated. In fact, we update it first and then synchronize
1292 * the inode's cluster cache.
1294 hammer2_inode_data_t *
1295 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
1296 hammer2_cluster_t *cluster, int flags)
/* Mark inode modified, make the cluster's chains writable, then re-point
 * the inode's cached cluster at the (possibly replaced) chains. */
1298 hammer2_inode_modify(trans, ip);
1299 hammer2_cluster_modify(trans, cluster, flags);
1300 hammer2_inode_repoint(ip, NULL, cluster);
/* Return writable inode data from the cluster's focus */
1301 return (&hammer2_cluster_wdata(cluster)->ipdata);
1305 * Adjust the cluster's chains to allow modification and adjust the
1306 * focus. Data will be accessible on return.
1308 * If our focused master errors on modify, re-resolve the cluster to
1309 * try to select a different master.
1312 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
1315 hammer2_chain_t *chain;
1320 for (i = 0; i < cluster->nchains; ++i) {
/* Non-FEMOD elements are invalidated and skipped (skip in elided lines) */
1321 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1322 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1325 chain = cluster->array[i].chain;
/* Make each chain writable (may COW/replace the chain) */
1330 hammer2_chain_modify(trans, chain, flags);
/* Propagate a modify error on the focus chain to the cluster */
1331 if (cluster->focus == chain && chain->error) {
1332 cluster->error = chain->error;
/* Re-resolve so a different (non-erroring) master may become the focus */
1337 hammer2_cluster_resolve(cluster);
1341 * Synchronize modifications from the focus to other chains in a cluster.
1342 * Convenient because nominal API users can just modify the contents of the
1343 * focus (at least for non-blockref data).
1345 * Nominal front-end operations only edit non-block-table data in a single
1346 * chain. This code copies such modifications to the other chains in the
1347 * cluster. Blocktable modifications are handled on a chain-by-chain basis
1348 * by both the frontend and the backend and will explode in fireworks if
1352 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
1354 hammer2_chain_t *focus;
1355 hammer2_chain_t *scan;
1356 const hammer2_inode_data_t *ripdata;
1357 hammer2_inode_data_t *wipdata;
/* The focus is the copy source and must itself be modified */
1360 focus = cluster->focus;
1361 KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
1363 for (i = 0; i < cluster->nchains; ++i) {
/* Only front-end-modifiable elements participate in the sync */
1364 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1366 scan = cluster->array[i].chain;
/* Skip missing elements and the focus itself (it is the source) */
1367 if (scan == NULL || scan == focus)
/* Targets must already be modified and structurally match the focus */
1371 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
1372 KKASSERT(focus->bytes == scan->bytes &&
1373 focus->bref.type == scan->bref.type);
1374 switch(focus->bref.type) {
1375 case HAMMER2_BREF_TYPE_INODE:
1376 ripdata = &focus->data->ipdata;
1377 wipdata = &scan->data->ipdata;
/*
 * Without DIRECTDATA the inode's u area holds blockref data
 * which is per-node; copy only the meta portion up to u.
 */
1378 if ((ripdata->meta.op_flags &
1379 HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1380 bcopy(ripdata, wipdata,
1381 offsetof(hammer2_inode_data_t, u));
1384 /* fall through to full copy */
1385 case HAMMER2_BREF_TYPE_DATA:
/* Full media-data copy for DATA (and DIRECTDATA inodes) */
1386 bcopy(focus->data, scan->data, focus->bytes);
/* Blocktable-bearing types cannot be blindly copied between nodes */
1388 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1389 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1390 case HAMMER2_BREF_TYPE_FREEMAP:
1391 case HAMMER2_BREF_TYPE_VOLUME:
1392 panic("hammer2_cluster_modsync: illegal node type");
1396 panic("hammer2_cluster_modsync: unknown node type");
1403 * Lookup initialization/completion API. Returns a locked, fully resolved
1404 * cluster with one ref.
1407 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
1409 hammer2_cluster_t *cluster;
/* Take a ref'd copy of the parent cluster to use as the lookup base */
1411 cluster = hammer2_cluster_copy(cparent);
/* Shared vs exclusive lock depends on the caller's lookup flags */
1412 if (flags & HAMMER2_LOOKUP_SHARED) {
1413 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
1414 HAMMER2_RESOLVE_SHARED);
1416 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
/* Resolve focus/validity before returning the copy to the caller */
1418 hammer2_cluster_resolve(cluster);
/*
 * Terminate a lookup sequence: release the lock and the ref acquired by
 * hammer2_cluster_lookup_init().
 */
1424 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
1427 hammer2_cluster_unlock(cparent);
1428 hammer2_cluster_drop(cparent);
1433 * Locate first match or overlap under parent, return a new, locked, resolved
1434 * cluster with one ref.
1436 * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1439 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
1440 hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1443 hammer2_cluster_t *cluster;
1444 hammer2_chain_t *chain;
1445 hammer2_key_t key_accum;
1446 hammer2_key_t key_next;
1451 KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1453 pmp = cparent->pmp; /* can be NULL */
/* key_accum accumulates the minimum next-key across all elements */
1454 key_accum = *key_nextp;
1456 if (flags & HAMMER2_LOOKUP_SHARED)
1457 rflags = HAMMER2_RESOLVE_SHARED;
/* Allocate the result cluster (zeroed) */
1461 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
1462 cluster->pmp = pmp; /* can be NULL */
/* Result is returned locked unless NOLOCK was requested */
1464 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1465 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1468 * Iterating earlier cluster elements with later elements still
1469 * locked is a problem, so we have to unlock the parent and then
/* Parent is relocked element-by-element below; reflag it LOCKED now */
1472 hammer2_cluster_unlock(cparent);
1473 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1476 * Pass-1, issue lookups.
1478 for (i = 0; i < cparent->nchains; ++i) {
1479 cluster->array[i].flags = cparent->array[i].flags;
1480 key_next = *key_nextp;
1483 * Always relock the parent as we go.
1485 if (cparent->array[i].chain) {
1486 hammer2_chain_lock(cparent->array[i].chain, rflags);
1490 * Nothing to base the lookup, or parent was not synchronized.
/* Such elements are skipped (skip/continue in elided lines) */
1492 if (cparent->array[i].chain == NULL ||
1493 (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
/* Per-element chain lookup in [key_beg, key_end] */
1498 chain = hammer2_chain_lookup(&cparent->array[i].chain,
1501 &cparent->array[i].cache_index,
1503 cluster->array[i].chain = chain;
/* NULL result counted (null_count increment in elided lines) */
1504 if (chain == NULL) {
/* Track the smallest next-key over all elements */
1507 if (key_accum > key_next)
1508 key_accum = key_next;
1514 cluster->nchains = i;
1515 *key_nextp = key_accum;
1518 * The cluster must be resolved, out of sync elements may be present.
1520 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1522 if (null_count != i)
1523 hammer2_cluster_resolve(cluster);
/*
 * Total miss (every element NULL), or no usable focus without
 * ALLNODES: dispose of the result cluster and return failure.
 */
1524 if (null_count == i ||
1525 (cluster->focus == NULL &&
1526 (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1527 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1528 hammer2_cluster_unlock(cluster);
1529 hammer2_cluster_drop(cluster);
1537 * Locate next match or overlap under parent, replace the passed-in cluster.
1538 * The returned cluster is a new, locked, resolved cluster with one ref.
1540 * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1543 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1544 hammer2_key_t *key_nextp,
1545 hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1547 hammer2_chain_t *ochain;
1548 hammer2_chain_t *nchain;
1549 hammer2_key_t key_accum;
1550 hammer2_key_t key_next;
1557 KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
/* key_accum accumulates the minimum next-key across all elements */
1559 key_accum = *key_nextp;
1561 parent_index = cparent->focus_index; /* save prior focus */
1562 cluster_index = cluster->focus_index;
1563 if (flags & HAMMER2_LOOKUP_SHARED)
1564 rflags = HAMMER2_RESOLVE_SHARED;
/* Focus is recalculated by the resolve at the end of the iteration */
1568 cluster->focus = NULL; /* XXX needed any more? */
1569 /*cparent->focus = NULL;*/
1570 cluster->focus_index = 0; /* XXX needed any more? */
1571 /*cparent->focus_index = 0;*/
1573 cluster->ddflag = 0;
1576 * The parent is always locked on entry, the iterator may be locked
1577 * depending on flags.
1579 * We must temporarily unlock the passed-in clusters to avoid a
1580 * deadlock between elements of the cluster with other threads.
1581 * We will fixup the lock in the loop.
1583 * Note that this will clear the focus.
1585 * Reflag the clusters as locked, because we will relock them
1588 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1589 hammer2_cluster_unlock(cluster);
1590 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1592 hammer2_cluster_unlock(cparent);
1593 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1595 for (i = 0; i < cparent->nchains; ++i) {
1596 key_next = *key_nextp;
1597 ochain = cluster->array[i].chain;
1600 * Always relock the parent as we go.
1602 if (cparent->array[i].chain)
1603 hammer2_chain_lock(cparent->array[i].chain, rflags);
1606 * Nothing to iterate from. These cases can occur under
1607 * normal operations. For example, during synchronization
1608 * a slave might reach the end of its scan while records
1609 * are still left on the master(s).
1611 if (ochain == NULL) {
/*
 * Parent element missing or either side invalid: drop the
 * old chain (still only ref'd, not relocked) and clear it.
 */
1615 if (cparent->array[i].chain == NULL ||
1616 (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1617 (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
1618 /* ochain has not yet been relocked */
1619 hammer2_chain_drop(ochain);
1620 cluster->array[i].chain = NULL;
1626 * Relock the child if necessary. Parent and child will then
1627 * be locked as expected by hammer2_chain_next() and flags.
1629 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1630 hammer2_chain_lock(ochain, rflags);
/* Advance this element's iteration */
1631 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1632 &key_next, key_beg, key_end,
1633 &cparent->array[i].cache_index,
1635 /* ochain now invalid but can still be used for focus check */
/* Parent's focus chain may have been replaced; track it */
1636 if (parent_index == i) {
1637 cparent->focus_index = i;
1638 cparent->focus = cparent->array[i].chain;
1641 cluster->array[i].chain = nchain;
/* NULL result counted (null_count increment in elided lines) */
1642 if (nchain == NULL) {
1645 if (key_accum > key_next)
1646 key_accum = key_next;
1652 cluster->nchains = i;
1653 *key_nextp = key_accum;
1656 * The cluster must be resolved, out of sync elements may be present.
1658 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1660 if (null_count != i)
1661 hammer2_cluster_resolve(cluster);
/* Total miss, or no focus without ALLNODES: dispose and fail */
1662 if (null_count == i ||
1663 (cluster->focus == NULL &&
1664 (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1665 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1666 hammer2_cluster_unlock(cluster);
1667 hammer2_cluster_drop(cluster);
1674 * Advance just one chain in the cluster and recalculate the invalid bit.
1675 * The cluster index is allowed to be flagged invalid on input and is
1676 * recalculated on return.
1678 * (used during synchronization to advance past a chain being deleted).
1680 * The chain being advanced must not be the focus and the clusters in
1681 * question must have already passed normal cluster_lookup/cluster_next
1684 * The cluster always remains intact on return, so void function.
1687 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1688 hammer2_cluster_t *cluster,
1689 hammer2_key_t *key_nextp,
1690 hammer2_key_t key_beg,
1691 hammer2_key_t key_end,
1694 hammer2_chain_t *ochain;
1695 hammer2_chain_t *nchain;
1696 hammer2_chain_t *focus;
1697 hammer2_key_t key_accum;
1698 hammer2_key_t key_next;
1701 key_accum = *key_nextp;
1702 key_next = *key_nextp;
/* i (the element index) comes from an elided parameter/local — it selects which chain to advance */
1703 ochain = cluster->array[i].chain;
/* The advanced chain must not currently be the cluster's focus */
1706 KKASSERT(ochain != cluster->focus);
1708 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1709 &key_next, key_beg, key_end,
1710 &cparent->array[i].cache_index,
1712 /* ochain now invalid */
/* Keep the parent's focus pointer tracking its (possibly replaced) chain */
1713 if (cparent->focus_index == i)
1714 cparent->focus = cparent->array[i].chain;
1717 * Install nchain. Note that nchain can be NULL, and can also
1718 * be in an unlocked state depending on flags.
1720 cluster->array[i].chain = nchain;
/* Optimistically clear INVALID; re-set below if nchain mismatches focus */
1721 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1723 if (key_accum > key_next)
1724 key_accum = key_next;
1726 focus = cluster->focus;
1732 if (nchain == focus) /* ASSERTED NOT TRUE */
/*
 * Recalculate the invalid bit: the element is valid only when it
 * matches the focus in bref identity, size, and inode-ness (ddflag).
 */
1735 ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1736 if (nchain->bref.type != focus->bref.type ||
1737 nchain->bref.key != focus->bref.key ||
1738 nchain->bref.keybits != focus->bref.keybits ||
1739 nchain->bref.modify_tid != focus->bref.modify_tid ||
1740 nchain->bytes != focus->bytes ||
1741 ddflag != cluster->ddflag) {
1742 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1746 *key_nextp = key_accum;
1749 * For now don't re-resolve cluster->flags.
1751 hammer2_cluster_resolve(cluster);
1756 * Create a new cluster using the specified key
1759 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1760 hammer2_cluster_t **clusterp,
1761 hammer2_key_t key, int keybits,
1762 int type, size_t bytes, int flags)
1764 hammer2_cluster_t *cluster;
1769 pmp = trans->pmp; /* can be NULL */
/* Allocate a fresh cluster if the caller did not supply one via *clusterp */
1771 if ((cluster = *clusterp) == NULL) {
1772 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1774 cluster->pmp = pmp; /* can be NULL */
1776 cluster->flags = HAMMER2_CLUSTER_LOCKED;
1778 cluster->focus_index = 0;
1779 cluster->focus = NULL;
1782 * NOTE: cluster->array[] entries can initially be NULL. If
1783 * *clusterp is supplied, skip NULL entries, otherwise
1784 * create new chains.
1786 for (i = 0; i < cparent->nchains; ++i) {
/* Parent element not front-end-modifiable: invalidate and skip */
1787 if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1788 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
/* Same check against the (possibly pre-existing) target element */
1792 if ((cluster->array[i].flags &
1793 HAMMER2_CITEM_FEMOD) == 0) {
1794 cluster->array[i].flags |=
1795 HAMMER2_CITEM_INVALID;
1798 if (cluster->array[i].chain == NULL)
/* Create (or complete) the chain at (key, keybits) on each node */
1801 error = hammer2_chain_create(trans, &cparent->array[i].chain,
1802 &cluster->array[i].chain, pmp,
1804 type, bytes, flags);
/* Parent's focus chain may have been replaced by the create */
1805 if (cparent->focus_index == i)
1806 cparent->focus = cparent->array[i].chain;
1807 KKASSERT(error == 0);
/* First created element becomes the provisional focus ... */
1808 if (cluster->focus == NULL) {
1809 cluster->focus_index = i;
1810 cluster->focus = cluster->array[i].chain;
/* ... but an element matching the parent's focus takes precedence */
1812 if (cparent->focus == cparent->array[i].chain) {
1813 cluster->focus_index = i;
1814 cluster->focus = cluster->array[i].chain;
1817 cluster->nchains = i;
1818 *clusterp = cluster;
1819 hammer2_cluster_resolve(cluster);
1825 * Rename a cluster to a new parent.
1827 * WARNING! Any passed-in bref is probably from hammer2_cluster_bref(),
1828 * So the data_off field is not relevant. Only the key and
1832 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
1833 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1836 hammer2_chain_t *chain;
1837 hammer2_blockref_t xbref;
/* Focus is invalidated across the rename and re-established per element */
1841 cluster->focus = NULL;
1842 cparent->focus = NULL;
1843 cluster->focus_index = 0;
1844 cparent->focus_index = 0;
1847 for (i = 0; i < cluster->nchains; ++i) {
/* Non-FEMOD elements are invalidated and skipped (skip in elided lines) */
1848 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1849 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1852 chain = cluster->array[i].chain;
/*
 * When a bref is supplied, rename under a copy of the chain's own
 * bref with only key/keybits taken from the caller (data_off in the
 * caller's bref is not meaningful — see WARNING above).
 */
1855 xbref = chain->bref;
1856 xbref.key = bref->key;
1857 xbref.keybits = bref->keybits;
1858 hammer2_chain_rename(trans, &xbref,
1859 &cparent->array[i].chain,
/* No bref supplied: rename keeping the chain's existing key */
1862 hammer2_chain_rename(trans, NULL,
1863 &cparent->array[i].chain,
1866 if (cparent->focus_index == i)
1867 cparent->focus = cparent->array[i].chain;
1868 KKASSERT(cluster->array[i].chain == chain); /*remove*/
1874 * Mark a cluster deleted
1877 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1878 hammer2_cluster_t *cluster, int flags)
1880 hammer2_chain_t *chain;
1881 hammer2_chain_t *parent;
/* Defensive: a NULL parent cluster is reported (early return in elided lines presumed) */
1884 if (cparent == NULL) {
1885 kprintf("cparent is NULL\n");
1889 for (i = 0; i < cluster->nchains; ++i) {
/* Non-FEMOD elements are invalidated and skipped (skip in elided lines) */
1890 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1891 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1894 parent = cparent->array[i].chain;
1895 chain = cluster->array[i].chain;
/* Sanity-report a topology mismatch before deleting */
1898 if (chain->parent != parent) {
1899 kprintf("hammer2_cluster_delete: parent "
1900 "mismatch chain=%p parent=%p against=%p\n",
1901 chain, chain->parent, parent);
/* Per-node chain deletion */
1903 hammer2_chain_delete(trans, parent, chain, flags);
1909 * Create a snapshot of the specified {parent, ochain} with the specified
1910 * label. The originating hammer2_inode must be exclusively locked for
1913 * The ioctl code has already synced the filesystem.
1916 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1917 hammer2_ioc_pfs_t *pfs)
1920 hammer2_cluster_t *ncluster;
1921 const hammer2_inode_data_t *ripdata;
1922 hammer2_inode_data_t *wipdata;
1923 hammer2_chain_t *nchain;
1924 hammer2_inode_t *nip;
1934 kprintf("snapshot %s\n", pfs->name);
/* Directory-hash key for the snapshot name under the super-root */
1936 name_len = strlen(pfs->name);
1937 lhc = hammer2_dirhash(pfs->name, name_len);
/* Read the source PFS's inode data; remember its cluster id */
1942 ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1944 opfs_clid = ripdata->meta.pfs_clid;
1946 hmp = ocluster->focus->hmp; /* XXX find synchronized local disk */
1949 * Create the snapshot directory under the super-root
1951 * Set PFS type, generate a unique filesystem id, and generate
1952 * a cluster id. Use the same clid when snapshotting a PFS root,
1953 * which theoretically allows the snapshot to be used as part of
1954 * the same cluster (perhaps as a cache).
1956 * Copy the (flushed) blockref array. Theoretically we could use
1957 * chain_duplicate() but it becomes difficult to disentangle
1958 * the shared core so for now just brute-force it.
1964 nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1965 proc0.p_ucred, pfs->name, name_len,
1967 HAMMER2_INSERT_PFSROOT, &error);
/* Mark the new inode as a MASTER PFS root of subtype SNAPSHOT */
1970 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1971 wipdata->meta.pfs_type = HAMMER2_PFSTYPE_MASTER;
1972 wipdata->meta.pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1973 wipdata->meta.op_flags |= HAMMER2_OPFLAG_PFSROOT;
1974 kern_uuidgen(&wipdata->meta.pfs_fsid, 1);
1977 * Give the snapshot its own private cluster. As a snapshot
1978 * no further synchronization with the original cluster will
/* Snapshotting a PFS boundary reuses the original clid; otherwise generate */
1982 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1983 wipdata->meta.pfs_clid = opfs_clid;
1985 kern_uuidgen(&wipdata->meta.pfs_clid, 1);
1987 kern_uuidgen(&wipdata->meta.pfs_clid, 1);
1989 for (i = 0; i < ncluster->nchains; ++i) {
/* Non-FEMOD elements invalidated and skipped (skip in elided lines) */
1990 if ((ncluster->array[i].flags &
1991 HAMMER2_CITEM_FEMOD) == 0) {
1992 ncluster->array[i].flags |=
1993 HAMMER2_CITEM_INVALID;
1996 nchain = ncluster->array[i].chain;
1998 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
2001 /* XXX hack blockset copy */
2002 /* XXX doesn't work with real cluster */
2003 KKASSERT(ocluster->nchains == 1);
/* Brute-force copy of the flushed blockref array (see comment above) */
2004 wipdata->u.blockset = ripdata->u.blockset;
2005 hammer2_cluster_modsync(ncluster);
/* Flush every chain of the new snapshot cluster to media */
2006 for (i = 0; i < ncluster->nchains; ++i) {
2007 nchain = ncluster->array[i].chain;
2009 hammer2_flush(trans, nchain, 1);
2011 hammer2_inode_unlock(nip, ncluster);
2017 * Return locked parent cluster given a locked child. The child remains
2018 * locked on return. The new parent's focus follows the child's focus
2019 * and the parent is always resolved.
2021 * We must temporarily unlock the passed-in cluster to avoid a deadlock
2022 * between elements of the cluster.
2024 * We must not try to hammer2_cluster_resolve() cparent. The individual
2025 * parent chains for the nodes are the correct parents for the cluster but
2026 * do not necessarily match, so resolve would likely implode.
2029 hammer2_cluster_parent(hammer2_cluster_t *cluster)
2031 hammer2_cluster_t *cparent;
/* Start from a ref'd copy of the child; unlock the child during the walk */
2034 cparent = hammer2_cluster_copy(cluster);
2035 hammer2_cluster_unlock(cluster);
2037 for (i = 0; i < cparent->nchains; ++i) {
2038 hammer2_chain_t *chain;
2039 hammer2_chain_t *rchain;
2042 * Calculate parent for each element. Old chain has an extra
2043 * ref for cparent but the lock remains with cluster.
2045 chain = cparent->array[i].chain;
/*
 * ref/lock-then-recheck loop: chain->parent can shift while we
 * acquire the lock; retry until the parent pointer is stable.
 */
2048 while ((rchain = chain->parent) != NULL) {
2049 hammer2_chain_ref(rchain);
2050 hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
2051 if (chain->parent == rchain)
2053 hammer2_chain_unlock(rchain);
2054 hammer2_chain_drop(rchain);
/* Replace the element with its parent; drop the copied child's extra ref */
2056 cparent->array[i].chain = rchain;
2057 hammer2_chain_drop(chain);
/* Parent chains were locked individually above; restore child lock */
2059 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
2060 /* hammer2_cluster_resolve(cparent); */
2061 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
2066 /************************************************************************
2068 ************************************************************************
2071 * WARNING! blockref[] array data is not universal. These functions should
2072 * only be used to access universal data.
2074 * NOTE! The rdata call will wait for at least one of the chain I/Os to
2075 * complete if necessary. The I/O's should have already been
2076 * initiated by the cluster_lock/chain_lock operation.
2078 * The cluster must already be in a modified state before wdata
2079 * is called. The data will already be available for this case.
/* Read-only access to the focus chain's media data */
2081 const hammer2_media_data_t *
2082 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
2084 KKASSERT(cluster->focus != NULL);
2085 return(cluster->focus->data);
/*
 * As hammer2_cluster_rdata(), but also return the focus chain's data
 * size in bytes via *bytesp.
 */
2088 const hammer2_media_data_t *
2089 hammer2_cluster_rdata_bytes(hammer2_cluster_t *cluster, size_t *bytesp)
2091 KKASSERT(cluster->focus != NULL);
2092 *bytesp = cluster->focus->bytes;
2093 return(cluster->focus->data);
/*
 * Writable access to the focus chain's media data.  The cluster must
 * already be in a modified state (asserted below).
 */
2096 hammer2_media_data_t *
2097 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
2099 KKASSERT(cluster->focus != NULL);
2100 KKASSERT(hammer2_cluster_modified(cluster));
2101 return(cluster->focus->data);
2105 * Load cluster data asynchronously with callback.
2107 * The callback is made for the first validated data found, or NULL
2108 * if no valid data is available.
2110 * NOTE! The cluster structure is either unique or serialized (e.g. embedded
2111 * in the inode with an exclusive lock held), the chain structure may be
2115 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
2116 void (*callback)(hammer2_iocb_t *iocb), void *ptr)
2118 hammer2_chain_t *chain;
2119 hammer2_iocb_t *iocb;
2121 hammer2_blockref_t *bref;
2124 i = cluster->focus_index;
2125 chain = cluster->focus;
2127 iocb = &cluster->iocb;
2128 iocb->callback = callback;
2129 iocb->dio = NULL; /* for already-validated case */
2130 iocb->cluster = cluster;
2131 iocb->chain = chain;
2133 iocb->lbase = (off_t)i;
2138 * Data already validated
2146 * We must resolve to a device buffer, either by issuing I/O or
2147 * by creating a zero-fill element. We do not mark the buffer
2148 * dirty when creating a zero-fill element (the hammer2_chain_modify()
2149 * API must still be used to do that).
2151 * The device buffer is variable-sized in powers of 2 down
2152 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage
2153 * chunk always contains buffers of the same size. (XXX)
2155 * The minimum physical IO size may be larger than the variable
2158 * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
2159 * matches hammer2_devblksize()? Or does the freemap's
2160 * pre-zeroing handle the case for us?
2162 bref = &chain->bref;
2166 /* handled by callback? <- TODO XXX even needed for loads? */
2168 * The getblk() optimization for a 100% overwrite can only be used
2169 * if the physical block size matches the request.
2171 if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
2172 chain->bytes == hammer2_devblksize(chain->bytes)) {
2173 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
2174 KKASSERT(error == 0);
2182 * Otherwise issue a read
2184 hammer2_adjreadcounter(&chain->bref, chain->bytes);
2185 hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);