2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * The cluster module collects multiple chains representing the same
36 * information from different nodes into a single entity. It allows direct
37 * access to media data as long as it is not blockref array data (which
38 * will obviously have to be different at each node).
40 * This module also handles I/O dispatch, status rollup, and various
41 * mastership arrangements including quorum operations. It effectively
42 * presents one topology to the vnops layer.
44 * Many of the API calls mimic chain API calls but operate on clusters
45 * instead of chains. Please see hammer2_chain.c for more complete code
46 * documentation of the API functions.
48 * WARNING! This module is *extremely* complex. It must issue asynchronous
49 * locks and I/O, do quorum and/or master-slave processing, and
50 * it must operate properly even if some nodes are broken (which
51 * can also mean indefinite locks).
55 * Cluster operations can be broken down into three pieces:
57 * (1) Chain locking and data retrieval.
58 * hammer2_cluster_lock()
59 * hammer2_cluster_parent()
61 * - Most complex functions, quorum management on transaction ids.
63 * - Locking and data accesses must be internally asynchronous.
65 * - Validate and manage cache coherency primitives (cache state
66 * is stored in chain topologies but must be validated by these
69 * (2) Lookups and Scans
70 * hammer2_cluster_lookup()
71 * hammer2_cluster_next()
73 * - Depend on locking & data retrieval functions, but still complex.
75 * - Must do quorum management on transaction ids.
77 * - Lookup and Iteration ops Must be internally asynchronous.
79 * (3) Modifying Operations
80 * hammer2_cluster_create()
81 * hammer2_cluster_rename()
82 * hammer2_cluster_delete()
83 * hammer2_cluster_modify()
84 * hammer2_cluster_modsync()
86 * - Can usually punt on failures, operation continues unless quorum
87 * is lost. If quorum is lost, must wait for resynchronization
88 * (depending on the management mode).
90 * - Must disconnect node on failures (also not flush), remount, and
93 * - Network links (via kdmsg) are relatively easy to issue as the
94 * complex underworkings of hammer2_chain.c don't have to be messed
95 * with (the protocol is at a higher level than block-level).
97 * - Multiple local disk nodes (i.e. block devices) are another matter.
98 * Chain operations have to be dispatched to per-node threads (xN)
99 * because we can't asynchronize potentially very complex chain
100 * operations in hammer2_chain.c (it would be a huge mess).
102 * (these threads are also used to terminate incoming kdmsg ops from
105 * - Single-node filesystems do not use threads and will simply call
106 * hammer2_chain.c functions directly. This short-cut is handled
107 * at the base of each cluster function.
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
119 * Returns non-zero if any chain in the cluster needs to be resized.
120 * Errored elements are not used in the calculation.
123 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
125 hammer2_chain_t *chain;
128 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
129 for (i = 0; i < cluster->nchains; ++i) {
130 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
132 chain = cluster->array[i].chain;
137 if (chain->bytes != bytes)
144 * Returns the bref type of the cluster's foucs.
146 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
147 * The cluster must be locked.
150 hammer2_cluster_type(hammer2_cluster_t *cluster)
152 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
153 if (cluster->error == 0)
154 return(cluster->focus->bref.type);
159 * Returns non-zero if the cluster's focus is flagged as being modified.
161 * If the cluster is errored, returns 0.
164 hammer2_cluster_modified(hammer2_cluster_t *cluster)
166 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
167 if (cluster->error == 0)
168 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
173 * Returns the bref of the cluster's focus, sans any data-offset information
174 * (since offset information is per-node and wouldn't be useful).
176 * Callers use this function to access modify_tid, mirror_tid, type,
179 * If the cluster is errored, returns an empty bref.
180 * The cluster must be locked.
183 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
185 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
186 if (cluster->error == 0) {
187 *bref = cluster->focus->bref;
190 bzero(bref, sizeof(*bref));
195 * Return non-zero if the chain representing an inode has been flagged
196 * as having been unlinked. Allows the vnode reclaim to avoid loading
197 * the inode data from disk e.g. when unmount or recycling old, clean
200 * The cluster does not need to be locked.
201 * The focus cannot be used since the cluster might not be locked.
204 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster)
206 hammer2_chain_t *chain;
211 for (i = 0; i < cluster->nchains; ++i) {
212 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
214 chain = cluster->array[i].chain;
216 flags |= chain->flags;
218 return (flags & HAMMER2_CHAIN_UNLINKED);
222 * Set a bitmask of flags in all chains related to a cluster.
223 * The cluster should probably be locked.
225 * XXX Only operate on FEMOD elements?
228 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
230 hammer2_chain_t *chain;
233 for (i = 0; i < cluster->nchains; ++i) {
234 chain = cluster->array[i].chain;
236 atomic_set_int(&chain->flags, flags);
241 * Set a bitmask of flags in all chains related to a cluster.
242 * The cluster should probably be locked.
244 * XXX Only operate on FEMOD elements?
247 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
249 hammer2_chain_t *chain;
252 for (i = 0; i < cluster->nchains; ++i) {
253 chain = cluster->array[i].chain;
255 atomic_clear_int(&chain->flags, flags);
260 * Flag the cluster for flushing recursively up to the root. Despite the
261 * work it does, this is relatively benign. It just makes sure that the
262 * flusher has top-down visibility to this cluster.
264 * Errored chains are not flagged for flushing.
266 * The cluster should probably be locked.
269 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
271 hammer2_chain_t *chain;
274 for (i = 0; i < cluster->nchains; ++i) {
275 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
277 chain = cluster->array[i].chain;
282 hammer2_chain_setflush(trans, chain);
287 * Set the check mode for the cluster.
288 * Errored elements of the cluster are ignored.
290 * The cluster must be locked and modified.
293 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
294 hammer2_cluster_t *cluster,
297 hammer2_chain_t *chain;
300 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
301 for (i = 0; i < cluster->nchains; ++i) {
302 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
303 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
306 chain = cluster->array[i].chain;
311 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
312 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
313 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
318 * Create a degenerate cluster with one ref from a single locked chain.
319 * The returned cluster will be focused on the chain and inherit its
322 * The chain's lock and reference are transfered to the new cluster, so
323 * the caller should not try to unlock the chain separately.
328 hammer2_cluster_from_chain(hammer2_chain_t *chain)
330 hammer2_cluster_t *cluster;
332 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
333 cluster->array[0].chain = chain;
334 cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
335 cluster->nchains = 1;
336 cluster->focus = chain;
337 cluster->focus_index = 0;
338 cluster->pmp = chain->pmp;
340 cluster->error = chain->error;
341 cluster->flags = HAMMER2_CLUSTER_LOCKED |
342 HAMMER2_CLUSTER_WRHARD |
343 HAMMER2_CLUSTER_RDHARD |
344 HAMMER2_CLUSTER_MSYNCED |
345 HAMMER2_CLUSTER_SSYNCED;
351 * Add a reference to a cluster and its underlying chains.
353 * We must also ref the underlying chains in order to allow ref/unlock
354 * sequences to later re-lock.
/*
 * hammer2_cluster_ref(): atomically add one reference to cluster->refs.
 *
 * NOTE(review): the lines after the atomic add declare locals and walk
 * cluster->array[] calling hammer2_chain_ref().  Declarations following a
 * statement suggest that loop may sit inside a disabled (#if 0) region
 * whose delimiters are not visible in this text.  hammer2_cluster_drop()
 * only drops the chains when the last cluster reference goes away, which
 * would be consistent with the loop being compiled out -- confirm against
 * the repository copy before relying on either reading.
 */
357 hammer2_cluster_ref(hammer2_cluster_t *cluster)
359 atomic_add_int(&cluster->refs, 1);
361 hammer2_chain_t *chain;
364 for (i = 0; i < cluster->nchains; ++i) {
365 chain = cluster->array[i].chain;
367 hammer2_chain_ref(chain);
373 * Drop the caller's reference to the cluster. When the ref count drops to
374 * zero this function frees the cluster and drops all underlying chains.
376 * In-progress read I/Os are typically detached from the cluster once the
377 * first one returns (the remaining stay attached to the DIOs but are then
378 * ignored and drop naturally).
381 hammer2_cluster_drop(hammer2_cluster_t *cluster)
383 hammer2_chain_t *chain;
386 KKASSERT(cluster->refs > 0);
387 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
388 cluster->focus = NULL; /* safety XXX chg to assert */
389 cluster->focus_index = 0;
391 for (i = 0; i < cluster->nchains; ++i) {
392 chain = cluster->array[i].chain;
394 hammer2_chain_drop(chain);
395 cluster->array[i].chain = NULL; /* safety */
398 cluster->nchains = 0; /* safety */
400 kfree(cluster, M_HAMMER2);
401 /* cluster is invalid */
406 hammer2_cluster_wait(hammer2_cluster_t *cluster)
408 tsleep(cluster->focus, 0, "h2clcw", 1);
412 * Lock and ref a cluster. This adds a ref to the cluster and its chains
413 * and then locks them, modified by various RESOLVE flags.
415 * The act of locking a cluster sets its focus. Note that cluster elements
416 * flagged with HAMMER2_CITEM_INVALID cannot be set as a focus. Locking a
417 * cluster does not adjust this flag since exact matches only matter for leafs
418 * (parents can depend on minor differences in topology).
420 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
421 * operations. Typically this is only set on a quorum of MASTERs or
422 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER
423 * is present, this bit is *not* set on a quorum of MASTERs. The
424 * synchronization code ignores this bit, but all hammer2_cluster_*() calls
425 * that create/modify/delete elements use it.
427 * The chains making up the cluster may be narrowed down based on quorum
428 * acceptability, and if RESOLVE_RDONLY is specified the chains can be
429 * narrowed down to a single chain as long as the entire subtopology is known
430 * to be intact. So, for example, we can narrow a read-only op to a single
431 * fast SLAVE but if we focus a CACHE chain we must still retain at least
432 * a SLAVE to ensure that the subtopology can be accessed.
434 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
435 * to be maintained once the topology is validated as-of the top level of
438 * If a failure occurs the operation must be aborted by higher-level code and
442 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
444 hammer2_chain_t *chain;
447 /* cannot be on inode-embedded cluster template, must be on copy */
448 KKASSERT(cluster->refs > 0);
449 KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
450 if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
451 panic("hammer2_cluster_lock: cluster %p already locked!\n",
454 KKASSERT(cluster->focus == NULL);
456 atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
459 * Lock chains and resolve state.
461 for (i = 0; i < cluster->nchains; ++i) {
462 chain = cluster->array[i].chain;
465 hammer2_chain_lock(chain, how);
468 hammer2_cluster_resolve(cluster);
/*
 * hammer2_cluster_resolve(): recompute the focus, error state, per-element
 * FEMOD flags and cluster status flags from the current chains.
 *
 * As far as the visible statements show, the work proceeds in stages:
 *
 *  - The focus is cleared and the quorum size is computed as
 *    pfs_nmasters / 2 + 1 (0 when the cluster has no pmp, which is only
 *    permitted when nchains is 0).
 *
 *  - A first pass over the elements records a chain error into
 *    cluster->error while no valid focus exists, and switches on
 *    pmp->pfs_types[i].  SOFT_MASTER adds WRSOFT|RDSOFT, SOFT_SLAVE adds
 *    RDSOFT, and SUPROOT adds WRHARD|RDHARD and installs the focus
 *    directly.
 *
 *  - A quorum search loops while nmasters < nquorum and
 *    last_best_quorum_tid != 0, scanning MASTER elements that are not
 *    flagged HAMMER2_CITEM_INVALID and tracking a candidate
 *    bref.modify_tid below the previous candidate.
 *
 *  - A final pass clears HAMMER2_CITEM_FEMOD on every element and then,
 *    per PFS type, accumulates WRHARD/RDHARD/UNHARD/UNSOFT in nflags and
 *    selects the focus.  A MASTER focus replaces a SLAVE focus, a
 *    SOFT_MASTER always takes the focus, a SOFT_SLAVE takes it unless a
 *    SOFT_MASTER already has it.  FEMOD is set for SOFT_MASTER and SUPROOT
 *    elements (and apparently for qualifying masters).
 *
 *  - SSYNCED is added when nslaves == ttlslaves and MSYNCED when
 *    nmasters == ttlmasters.  nflags is then set in cluster->flags and
 *    the HAMMER2_CLUSTER_ZFLAGS bits not present in nflags are cleared,
 *    both atomically.
 *
 * NOTE(review): many short statements (local declarations, counter
 * updates, break/continue, braces, some conditions guarding NOSOFT and
 * NOHARD) are not visible in this text, so the summary above is limited
 * to what the remaining lines establish.
 */
472 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
474 hammer2_chain_t *chain;
476 hammer2_tid_t quorum_tid;
477 hammer2_tid_t last_best_quorum_tid;
489 cluster->focus = NULL;
502 KKASSERT(pmp != NULL || cluster->nchains == 0);
503 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
509 * NOTE: A NULL chain is not necessarily an error, it could be
510 * e.g. a lookup failure or the end of an iteration.
513 for (i = 0; i < cluster->nchains; ++i) {
514 chain = cluster->array[i].chain;
515 if (chain && chain->error) {
516 if (cluster->focus == NULL || cluster->focus == chain) {
517 /* error will be overridden by valid focus */
518 cluster->error = chain->error;
522 * Must count total masters and slaves whether the
523 * chain is errored or not.
525 switch (cluster->pmp->pfs_types[i]) {
526 case HAMMER2_PFSTYPE_MASTER:
529 case HAMMER2_PFSTYPE_SLAVE:
535 switch (cluster->pmp->pfs_types[i]) {
536 case HAMMER2_PFSTYPE_MASTER:
539 case HAMMER2_PFSTYPE_SLAVE:
542 case HAMMER2_PFSTYPE_SOFT_MASTER:
543 nflags |= HAMMER2_CLUSTER_WRSOFT;
544 nflags |= HAMMER2_CLUSTER_RDSOFT;
547 case HAMMER2_PFSTYPE_SOFT_SLAVE:
548 nflags |= HAMMER2_CLUSTER_RDSOFT;
550 case HAMMER2_PFSTYPE_SUPROOT:
552 * Degenerate cluster representing the super-root
553 * topology on a single device. Fake stuff so
554 * cluster ops work as expected.
556 nflags |= HAMMER2_CLUSTER_WRHARD;
557 nflags |= HAMMER2_CLUSTER_RDHARD;
558 cluster->focus_index = i;
559 cluster->focus = chain;
560 cluster->error = chain ? chain->error : 0;
570 * Resolve masters. Calculate nmasters for the highest matching
571 * TID, if a quorum cannot be attained try the next lower matching
572 * TID until we exhaust TIDs.
574 * NOTE: A NULL chain is not necessarily an error, it could be
575 * e.g. a lookup failure or the end of an iteration.
578 last_best_quorum_tid = HAMMER2_TID_MAX;
579 quorum_tid = 0; /* fix gcc warning */
581 while (nmasters < nquorum && last_best_quorum_tid != 0) {
585 for (i = 0; i < cluster->nchains; ++i) {
586 if (cluster->pmp->pfs_types[i] !=
587 HAMMER2_PFSTYPE_MASTER) {
590 chain = cluster->array[i].chain;
592 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
594 * Invalid as in unsynchronized, cannot be
595 * used to calculate the quorum.
597 } else if (chain == NULL && quorum_tid == 0) {
599 * NULL chain on master matches NULL chains
603 } else if (quorum_tid < last_best_quorum_tid &&
605 (quorum_tid < chain->bref.modify_tid ||
608 * Better TID located, reset nmasters count.
611 quorum_tid = chain->bref.modify_tid;
613 quorum_tid == chain->bref.modify_tid) {
615 * TID matches current collection.
620 if (nmasters >= nquorum)
622 last_best_quorum_tid = quorum_tid;
628 * NOTE: A NULL chain is not necessarily an error, it could be
629 * e.g. a lookup failure or the end of an iteration.
632 for (i = 0; i < cluster->nchains; ++i) {
633 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
634 chain = cluster->array[i].chain;
635 if (chain && chain->error) {
636 if (cluster->focus == NULL || cluster->focus == chain) {
637 /* error will be overridden by valid focus */
638 cluster->error = chain->error;
643 switch (cluster->pmp->pfs_types[i]) {
644 case HAMMER2_PFSTYPE_MASTER:
646 * We must have enough up-to-date masters to reach
647 * a quorum and the master modify_tid must match
648 * the quorum's modify_tid.
650 * Do not select an errored or out-of-sync master.
652 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
653 nflags |= HAMMER2_CLUSTER_UNHARD;
654 } else if (nmasters >= nquorum &&
655 (chain == NULL || chain->error == 0) &&
656 ((chain == NULL && quorum_tid == 0) ||
657 (chain != NULL && quorum_tid ==
658 chain->bref.modify_tid))) {
659 nflags |= HAMMER2_CLUSTER_WRHARD;
660 nflags |= HAMMER2_CLUSTER_RDHARD;
662 cluster->array[i].flags |=
665 if (cluster->focus == NULL ||
666 focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
667 focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
668 cluster->focus_index = i;
669 cluster->focus = chain; /* NULL ok */
670 cluster->error = chain ? chain->error :
673 } else if (chain == NULL || chain->error == 0) {
674 nflags |= HAMMER2_CLUSTER_UNHARD;
677 case HAMMER2_PFSTYPE_SLAVE:
679 * We must have enough up-to-date masters to reach
680 * a quorum and the slave modify_tid must match the
681 * quorum's modify_tid.
683 * Do not select an errored slave.
685 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
686 nflags |= HAMMER2_CLUSTER_UNHARD;
687 } else if (nmasters >= nquorum &&
688 (chain == NULL || chain->error == 0) &&
689 ((chain == NULL && quorum_tid == 0) ||
690 (chain && quorum_tid ==
691 chain->bref.modify_tid))) {
693 nflags |= HAMMER2_CLUSTER_RDHARD;
695 /* XXX optimize for RESOLVE_RDONLY */
696 if (cluster->focus == NULL) {
697 focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
698 cluster->focus_index = i;
699 cluster->focus = chain; /* NULL ok */
700 cluster->error = chain ? chain->error :
704 } else if (chain == NULL || chain->error == 0) {
705 nflags |= HAMMER2_CLUSTER_UNSOFT;
708 case HAMMER2_PFSTYPE_SOFT_MASTER:
710 * Directly mounted soft master always wins. There
711 * should be only one.
713 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
714 cluster->focus_index = i;
715 cluster->focus = chain;
716 cluster->error = chain ? chain->error : 0;
717 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
718 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
720 case HAMMER2_PFSTYPE_SOFT_SLAVE:
722 * Directly mounted soft slave always wins. There
723 * should be only one.
725 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
726 if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
727 cluster->focus_index = i;
728 cluster->focus = chain;
729 cluster->error = chain ? chain->error : 0;
730 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
733 case HAMMER2_PFSTYPE_SUPROOT:
735 * spmp (degenerate case)
738 cluster->focus_index = i;
739 cluster->focus = chain;
740 cluster->error = chain ? chain->error : 0;
741 focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
742 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
750 nflags |= HAMMER2_CLUSTER_NOSOFT;
752 nflags |= HAMMER2_CLUSTER_NOHARD;
755 * Set SSYNCED or MSYNCED for slaves and masters respectively if
756 * all available nodes (even if 0 are available) are fully
757 * synchronized. This is used by the synchronization thread to
758 * determine if there is work it could potentially accomplish.
760 if (nslaves == ttlslaves)
761 nflags |= HAMMER2_CLUSTER_SSYNCED;
762 if (nmasters == ttlmasters)
763 nflags |= HAMMER2_CLUSTER_MSYNCED;
766 * Determine if the cluster was successfully locked for the
767 * requested operation and generate an error code. The cluster
768 * will not be locked (or ref'd) if an error is returned.
770 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
771 * to determine if reading or writing is possible. If writing, the
772 * cluster still requires a call to hammer2_cluster_modify() first.
774 atomic_set_int(&cluster->flags, nflags);
775 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
779 * Copy a cluster, returned a ref'd cluster. All underlying chains
780 * are also ref'd, but not locked.
782 * The cluster focus is not set because the cluster is not yet locked
783 * (and the originating cluster does not have to be locked either).
786 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
788 hammer2_pfs_t *pmp = ocluster->pmp;
789 hammer2_cluster_t *ncluster;
790 hammer2_chain_t *chain;
793 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
795 ncluster->nchains = ocluster->nchains;
797 ncluster->flags = 0; /* cluster not locked */
799 for (i = 0; i < ocluster->nchains; ++i) {
800 chain = ocluster->array[i].chain;
801 ncluster->array[i].chain = chain;
803 hammer2_chain_ref(chain);
809 * Unlock and deref a cluster. The cluster is destroyed if this is the
813 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
815 hammer2_chain_t *chain;
818 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
819 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
822 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
823 KKASSERT(cluster->refs > 0);
824 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
826 for (i = 0; i < cluster->nchains; ++i) {
827 chain = cluster->array[i].chain;
829 hammer2_chain_unlock(chain);
831 cluster->focus_index = 0;
832 cluster->focus = NULL;
836 * Resize the cluster's physical storage allocation in-place. This may
837 * replace the cluster's chains.
/*
 * hammer2_cluster_resize(): resize each modifiable chain of `cluster`
 * under the matching chain of `cparent`.
 *
 * cparent and cluster must share the same pmp and chain count (asserted).
 * Elements without HAMMER2_CITEM_FEMOD are flagged HAMMER2_CITEM_INVALID.
 * For the others hammer2_chain_resize() is called with the parent chain
 * at the same index, which is asserted to exist.
 *
 * NOTE(review): the return type, the trailing arguments of the
 * hammer2_chain_resize() call (presumably nradix and flags), the
 * continue/NULL guards and the closing braces are not visible in this
 * text.
 */
840 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
841 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
842 int nradix, int flags)
844 hammer2_chain_t *chain;
847 KKASSERT(cparent->pmp == cluster->pmp); /* can be NULL */
848 KKASSERT(cparent->nchains == cluster->nchains);
850 for (i = 0; i < cluster->nchains; ++i) {
851 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
852 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
855 chain = cluster->array[i].chain;
857 KKASSERT(cparent->array[i].chain);
858 hammer2_chain_resize(trans, ip,
859 cparent->array[i].chain, chain,
866 * Set an inode's cluster modified, marking the related chains RW and
867 * duplicating them if necessary.
869 * The passed-in chain is a localized copy of the chain previously acquired
870 * when the inode was locked (and possilby replaced in the mean time), and
871 * must also be updated. In fact, we update it first and then synchronize
872 * the inode's cluster cache.
/*
 * hammer2_cluster_modify_ip(): mark an inode and its cluster modified and
 * return a writable pointer to the inode data.
 *
 * Sets HAMMER2_INODE_MODIFIED on ip->flags atomically, runs
 * hammer2_cluster_modify() on the cluster, repoints the inode at the
 * cluster via hammer2_inode_repoint(ip, NULL, cluster), and returns the
 * address of the ipdata member of hammer2_cluster_wdata(cluster).
 *
 * NOTE(review): two short statements between the repoint and the return
 * are not visible in this text; they are not described here.
 */
874 hammer2_inode_data_t *
875 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
876 hammer2_cluster_t *cluster, int flags)
878 atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
879 hammer2_cluster_modify(trans, cluster, flags);
881 hammer2_inode_repoint(ip, NULL, cluster);
884 return (&hammer2_cluster_wdata(cluster)->ipdata);
888 * Adjust the cluster's chains to allow modification and adjust the
889 * focus. Data will be accessible on return.
891 * If our focused master errors on modify, re-resolve the cluster to
892 * try to select a different master.
/*
 * hammer2_cluster_modify(): call hammer2_chain_modify() on the chains of
 * the cluster's modifiable elements.
 *
 * Elements without HAMMER2_CITEM_FEMOD are flagged HAMMER2_CITEM_INVALID
 * instead of being modified.  If the chain that is the current focus
 * reports an error after hammer2_chain_modify(), that error is copied to
 * cluster->error.  hammer2_cluster_resolve() is called after the loop.
 *
 * NOTE(review): the return type, the trailing parameter line, the
 * NULL/error guards and whatever condition gates the final re-resolve
 * are not visible in this text; the header comment says the re-resolve
 * happens when the focused master errors on modify.
 */
895 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
898 hammer2_chain_t *chain;
903 for (i = 0; i < cluster->nchains; ++i) {
904 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
905 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
908 chain = cluster->array[i].chain;
913 hammer2_chain_modify(trans, chain, flags);
914 if (cluster->focus == chain && chain->error) {
915 cluster->error = chain->error;
920 hammer2_cluster_resolve(cluster);
924 * Synchronize modifications from the focus to other chains in a cluster.
925 * Convenient because nominal API users can just modify the contents of the
926 * focus (at least for non-blockref data).
928 * Nominal front-end operations only edit non-block-table data in a single
929 * chain. This code copies such modifications to the other chains in the
930 * cluster. Blocktable modifications are handled on a chain-by-chain basis
931 * by both the frontend and the backend and will explode in fireworks if
/*
 * hammer2_cluster_modsync(): copy modified non-blockref data from the
 * focus chain to the other modifiable chains of the cluster.
 *
 * The focus must carry HAMMER2_CHAIN_MODIFIED (asserted).  Each other
 * non-NULL chain on a HAMMER2_CITEM_FEMOD element must also be MODIFIED
 * and match the focus in bytes and bref.type (asserted).  Per bref type:
 *
 *  - INODE: when HAMMER2_OPFLAG_DIRECTDATA is not set in op_flags, only
 *    the leading offsetof(hammer2_inode_data_t, u) bytes are copied;
 *    otherwise control falls through to the full copy.
 *  - DATA: focus->bytes bytes of data are copied.
 *  - FREEMAP_NODE, FREEMAP_LEAF, FREEMAP, VOLUME: panic (illegal type).
 *  - a second panic reports an unknown node type.
 *
 * NOTE(review): break statements, the default label, the return type and
 * closing braces are not visible in this text, so the exact exit of each
 * case is presumed rather than shown.
 */
935 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
937 hammer2_chain_t *focus;
938 hammer2_chain_t *scan;
939 const hammer2_inode_data_t *ripdata;
940 hammer2_inode_data_t *wipdata;
943 focus = cluster->focus;
944 KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
946 for (i = 0; i < cluster->nchains; ++i) {
947 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
949 scan = cluster->array[i].chain;
950 if (scan == NULL || scan == focus)
954 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
955 KKASSERT(focus->bytes == scan->bytes &&
956 focus->bref.type == scan->bref.type);
957 switch(focus->bref.type) {
958 case HAMMER2_BREF_TYPE_INODE:
959 ripdata = &focus->data->ipdata;
960 wipdata = &scan->data->ipdata;
961 if ((ripdata->op_flags &
962 HAMMER2_OPFLAG_DIRECTDATA) == 0) {
963 bcopy(ripdata, wipdata,
964 offsetof(hammer2_inode_data_t, u));
967 /* fall through to full copy */
968 case HAMMER2_BREF_TYPE_DATA:
969 bcopy(focus->data, scan->data, focus->bytes);
971 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
972 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
973 case HAMMER2_BREF_TYPE_FREEMAP:
974 case HAMMER2_BREF_TYPE_VOLUME:
975 panic("hammer2_cluster_modsync: illegal node type");
979 panic("hammer2_cluster_modsync: unknown node type");
986 * Lookup initialization/completion API. Returns a locked cluster with 1 ref.
989 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
991 hammer2_cluster_t *cluster;
993 cluster = hammer2_cluster_copy(cparent);
994 if (flags & HAMMER2_LOOKUP_SHARED) {
995 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
996 HAMMER2_RESOLVE_SHARED);
998 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1004 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
1007 hammer2_cluster_unlock(cparent);
1008 hammer2_cluster_drop(cparent);
1013 * Locate first match or overlap under parent, return a new cluster
/*
 * hammer2_cluster_lookup(): look up [key_beg, key_end] under each chain
 * of cparent and collect the results into a newly allocated cluster.
 *
 * The new cluster takes cparent's pmp and is flagged
 * HAMMER2_CLUSTER_LOCKED unless HAMMER2_LOOKUP_NOLOCK is requested.
 *
 * Pass 1 copies each element's flags from the parent, skips elements
 * whose parent chain is NULL or flagged HAMMER2_CITEM_INVALID, and
 * otherwise calls hammer2_chain_lookup().  The lookup may replace the
 * parent chain, so cparent->focus is refreshed for the focus index.  The
 * first result that is neither errored nor invalid becomes the focus,
 * and a result found under the parent's focus chain takes it over.
 * key_accum tracks the smallest key_next seen.
 *
 * Pass 2 flags HAMMER2_CITEM_INVALID on any element whose bref type, key,
 * keybits, modify_tid, byte count or ddflag differs from the focus.
 *
 * Finally *key_nextp is updated, nchains is set and the cluster is
 * resolved.  When every element came back NULL (null_count == i) the new
 * cluster is dropped.
 *
 * NOTE(review): the return type and statements, null_count updates,
 * continue statements and several call arguments are not visible in this
 * text; what is returned after the drop is presumably NULL.
 */
1016 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
1017 hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1020 hammer2_cluster_t *cluster;
1021 hammer2_chain_t *chain;
1022 hammer2_chain_t *focus;
1023 hammer2_key_t key_accum;
1024 hammer2_key_t key_next;
1028 pmp = cparent->pmp; /* can be NULL */
1029 key_accum = *key_nextp;
1032 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
1033 cluster->pmp = pmp; /* can be NULL */
1035 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1036 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1039 * Pass-1, issue lookup and find focus.
1041 for (i = 0; i < cparent->nchains; ++i) {
1042 cluster->array[i].flags = cparent->array[i].flags;
1043 key_next = *key_nextp;
1046 * Nothing to base the lookup, or parent was not synchronized.
1048 if (cparent->array[i].chain == NULL ||
1049 (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
1054 chain = hammer2_chain_lookup(&cparent->array[i].chain,
1057 &cparent->array[i].cache_index,
1059 if (cparent->focus_index == i)
1060 cparent->focus = cparent->array[i].chain;
1061 cluster->array[i].chain = chain;
1062 if (chain == NULL) {
1064 } else if (chain->error) {
1066 * Leave errored chain in cluster, but it cannot be
1067 * the cluster's focus. It is still possible for an
1068 * error'd chain to be synchronized (since we have
1069 * the bref), synchronization state will be handled
1073 } else if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
1075 * Leave unsynchronized chain in cluster, but it cannot
1076 * be the cluster's focus.
1080 int ddflag = (chain->bref.type ==
1081 HAMMER2_BREF_TYPE_INODE);
1083 if (cluster->focus == NULL) {
1084 cluster->focus_index = i;
1085 cluster->focus = chain;
1086 cluster->ddflag = ddflag;
1088 if (cparent->focus == cparent->array[i].chain) {
1089 cluster->focus_index = i;
1090 cluster->focus = chain;
1091 cluster->ddflag = ddflag;
1094 if (key_accum > key_next)
1095 key_accum = key_next;
1099 * Pass-2 invalidate mismatches
1101 focus = cluster->focus;
1105 for (i = 0; i < cparent->nchains; ++i) {
1108 chain = cluster->array[i].chain;
1114 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1117 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1118 if (chain->bref.type != focus->bref.type ||
1119 chain->bref.key != focus->bref.key ||
1120 chain->bref.keybits != focus->bref.keybits ||
1121 chain->bref.modify_tid != focus->bref.modify_tid ||
1122 chain->bytes != focus->bytes ||
1123 ddflag != cluster->ddflag) {
1124 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1129 * Resolve cluster flags. A lookup or locking failure could wind
1130 * up changing the cluster.
1133 *key_nextp = key_accum;
1134 cluster->nchains = i;
1135 hammer2_cluster_resolve(cluster);
1137 if (null_count == i) {
1138 hammer2_cluster_drop(cluster);
1146 * Locate next match or overlap under parent, replace cluster
/*
 * hammer2_cluster_next(): advance every chain of `cluster` to the next
 * match under the corresponding chain of `cparent`, in place.
 *
 * The prior focus indices of both clusters are saved and the iterator's
 * focus, focus_index and ddflag are reset.  Both clusters are then
 * unlocked (the iterator only when HAMMER2_LOOKUP_NOLOCK is not set) and
 * immediately re-flagged HAMMER2_CLUSTER_LOCKED, because the individual
 * chains are relocked one at a time inside the loop.
 *
 * Pass 1, per element: the parent chain is relocked when present.  When
 * the parent chain is NULL or either side is flagged
 * HAMMER2_CITEM_INVALID, the old chain is dropped and the slot cleared.
 * Otherwise the old chain is relocked unless NOLOCK is set and
 * hammer2_chain_next() produces the replacement.  The parent focus is
 * re-established for the saved parent index.  The first non-errored
 * result becomes the focus, and the element at the saved iterator focus
 * index takes it over.  key_accum tracks the smallest key_next seen.
 *
 * Pass 2 flags HAMMER2_CITEM_INVALID on any element other than the focus
 * whose bref type, key, keybits, modify_tid, byte count or ddflag
 * differs from the focus.
 *
 * Finally *key_nextp is updated, nchains is set and the cluster is
 * resolved.  When every element is NULL (null_count == i) the cluster is
 * dropped.
 *
 * NOTE(review): the return type and statements, null_count updates,
 * continue statements and some call arguments are not visible in this
 * text; what is returned after the drop is presumably NULL.
 */
1149 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1150 hammer2_key_t *key_nextp,
1151 hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1153 hammer2_chain_t *ochain;
1154 hammer2_chain_t *nchain;
1155 hammer2_chain_t *focus;
1156 hammer2_key_t key_accum;
1157 hammer2_key_t key_next;
1163 key_accum = *key_nextp;
1165 parent_index = cparent->focus_index; /* save prior focus */
1166 cluster_index = cluster->focus_index;
1168 cluster->focus = NULL; /* XXX needed any more? */
1169 /*cparent->focus = NULL;*/
1170 cluster->focus_index = 0; /* XXX needed any more? */
1171 /*cparent->focus_index = 0;*/
1173 cluster->ddflag = 0;
1176 * The parent is always locked on entry, the iterator may be locked
1177 * depending on flags.
1179 * We must temporarily unlock the passed-in clusters to avoid a
1180 * deadlock between elements of the cluster with other threads.
1181 * We will fixup the lock in the loop.
1183 * Note that this will clear the focus.
1185 * Reflag the clusters as locked, because we will relock them
1188 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1189 hammer2_cluster_unlock(cluster);
1190 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1192 hammer2_cluster_unlock(cparent);
1193 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1195 for (i = 0; i < cparent->nchains; ++i) {
1196 key_next = *key_nextp;
1197 ochain = cluster->array[i].chain;
1200 * Always relock the parent as we go.
1202 if (cparent->array[i].chain) {
1203 hammer2_chain_lock(cparent->array[i].chain,
1204 flags & ~HAMMER2_LOOKUP_NOLOCK);
1208 * Nothing to iterate from. These cases can occur under
1209 * normal operations. For example, during synchronization
1210 * a slave might reach the end of its scan while records
1211 * are still left on the master(s).
1213 if (ochain == NULL) {
1217 if (cparent->array[i].chain == NULL ||
1218 (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1219 (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
1220 /* ochain has not yet been relocked */
1221 hammer2_chain_drop(ochain);
1222 cluster->array[i].chain = NULL;
1228 * Relock the child if necessary. Parent and child will then
1229 * be locked as expected by hammer2_chain_next() and flags.
1231 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1232 hammer2_chain_lock(ochain, flags);
1233 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1234 &key_next, key_beg, key_end,
1235 &cparent->array[i].cache_index,
1237 if (parent_index == i) {
1238 cparent->focus_index = i;
1239 cparent->focus = cparent->array[i].chain;
1241 /* ochain now invalid but can still be used for focus check */
1243 cluster->array[i].chain = nchain;
1244 if (nchain == NULL) {
1246 } else if (nchain->error) {
1248 * Leave errored chain in cluster, but it cannot be
1249 * the cluster's focus.
1253 int ddflag = (nchain->bref.type ==
1254 HAMMER2_BREF_TYPE_INODE);
1257 * Possible new focus.
1259 if (cluster->focus == NULL) {
1260 cluster->ddflag = ddflag;
1261 cluster->focus_index = i;
1262 cluster->focus = nchain;
1266 * Fixup pre-existing focus.
1268 if (cluster_index == i) {
1269 cluster->focus_index = i;
1270 cluster->focus = nchain;
1273 if (key_accum > key_next)
1274 key_accum = key_next;
1278 * Pass-2 invalidate mismatches
1280 focus = cluster->focus;
1284 for (i = 0; i < cparent->nchains; ++i) {
1287 nchain = cluster->array[i].chain;
1291 if (nchain == focus)
1293 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1296 ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1297 if (nchain->bref.type != focus->bref.type ||
1298 nchain->bref.key != focus->bref.key ||
1299 nchain->bref.keybits != focus->bref.keybits ||
1300 nchain->bref.modify_tid != focus->bref.modify_tid ||
1301 nchain->bytes != focus->bytes ||
1302 ddflag != cluster->ddflag) {
1303 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1308 *key_nextp = key_accum;
1309 cluster->nchains = i;
1310 hammer2_cluster_resolve(cluster);
1312 if (null_count == i) {
1313 hammer2_cluster_drop(cluster);
1320 * Advance just one chain in the cluster and recalculate the invalid bit.
1321 * The cluster index is allowed to be flagged invalid on input and is
1322 * recalculated on return.
1324 * (used during synchronization to advance past a chain being deleted).
1326 * The chain being advanced must not be the focus and the clusters in
1327 * question must have already passed normal cluster_lookup/cluster_next
1330 * The cluster always remains intact on return, so void function.
/*
 * hammer2_cluster_next_single_chain() - advance one element of a cluster.
 *
 * Runs hammer2_chain_next() on element i only, installs the result in
 * cluster->array[i], and recomputes the HAMMER2_CITEM_INVALID flag of
 * that element by comparing it against the cluster focus.
 *
 * cparent   - parent cluster.  cparent->array[i].chain is handed to
 *	       hammer2_chain_next() by address, and cparent->focus is
 *	       refreshed from it when i is the focus index of the parent.
 * cluster   - cluster being iterated.  Element i must not be its focus
 *	       (asserted below).
 * key_nextp - in/out.  Seeds the scan and receives the smaller of its
 *	       original value and the key_next reported for this element.
 * key_beg,
 * key_end   - forwarded to hammer2_chain_next().
 *
 * NOTE(review): the element index i and the trailing argument of the
 * hammer2_chain_next() call come from lines that are not part of this
 * listing - confirm against the complete file.
 */
1333 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1334 hammer2_cluster_t *cluster,
1335 hammer2_key_t *key_nextp,
1336 hammer2_key_t key_beg,
1337 hammer2_key_t key_end,
1340 hammer2_chain_t *ochain;
1341 hammer2_chain_t *nchain;
1342 hammer2_chain_t *focus;
1343 hammer2_key_t key_accum;
1344 hammer2_key_t key_next;
/* Seed the accumulator and the per-chain key from the caller value. */
1347 key_accum = *key_nextp;
1348 key_next = *key_nextp;
1349 ochain = cluster->array[i].chain;
/* The element being advanced is required to differ from the focus. */
1352 KKASSERT(ochain != cluster->focus);
1354 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1355 &key_next, key_beg, key_end,
1356 &cparent->array[i].cache_index,
/*
 * The parent chain was passed by address; re-read it into cparent->focus
 * when element i is the focus of the parent.
 */
1358 if (cparent->focus_index == i)
1359 cparent->focus = cparent->array[i].chain;
1360 /* ochain now invalid */
1363 * Install nchain. Note that nchain can be NULL, and can also
1364 * be in an unlocked state depending on flags.
1366 cluster->array[i].chain = nchain;
/* Clear INVALID here; it is set again below on a mismatch with the focus. */
1367 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
/* Keep the smaller of the accumulated key and the key just reported. */
1369 if (key_accum > key_next)
1370 key_accum = key_next;
/*
 * Compare the new chain against the focus: a difference in bref type, key,
 * keybits, modify_tid, byte count, or inode-ness (ddflag) flags element i
 * as HAMMER2_CITEM_INVALID.
 */
1372 focus = cluster->focus;
1378 if (nchain == focus) /* ASSERTED NOT TRUE */
1381 ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1382 if (nchain->bref.type != focus->bref.type ||
1383 nchain->bref.key != focus->bref.key ||
1384 nchain->bref.keybits != focus->bref.keybits ||
1385 nchain->bref.modify_tid != focus->bref.modify_tid ||
1386 nchain->bytes != focus->bytes ||
1387 ddflag != cluster->ddflag) {
1388 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
/* Hand the accumulated key back to the caller. */
1392 *key_nextp = key_accum;
1395 * For now don't re-resolve cluster->flags.
1397 hammer2_cluster_resolve(cluster);
1402 * Create a new cluster using the specified key
/*
 * hammer2_cluster_create() - create one chain per parent element.
 *
 * trans    - transaction; trans->pmp is used for the cluster and is passed
 *	      to hammer2_chain_create().
 * cparent  - parent cluster.  Its element count drives the loop, and each
 *	      cparent->array[i].chain is passed by address to
 *	      hammer2_chain_create().
 * clusterp - in/out.  When *clusterp is NULL a cluster is allocated with
 *	      kmalloc(); the cluster is stored back through clusterp.
 * key, keybits, type, bytes, flags
 *	    - type, bytes and flags are forwarded to hammer2_chain_create().
 *	      NOTE(review): key and keybits are presumably forwarded on a
 *	      call-argument line that is not part of this listing - confirm.
 *
 * A failure from hammer2_chain_create() is not handled; it is asserted
 * not to occur.
 */
1405 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1406 hammer2_cluster_t **clusterp,
1407 hammer2_key_t key, int keybits,
1408 int type, size_t bytes, int flags)
1410 hammer2_cluster_t *cluster;
1415 pmp = trans->pmp; /* can be NULL */
/* Use the cluster supplied by the caller, or allocate one if none was. */
1417 if ((cluster = *clusterp) == NULL) {
1418 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1420 cluster->pmp = pmp; /* can be NULL */
1422 cluster->flags = HAMMER2_CLUSTER_LOCKED;
/* Reset the focus; it is selected again inside the loop below. */
1424 cluster->focus_index = 0;
1425 cluster->focus = NULL;
1428 * NOTE: cluster->array[] entries can initially be NULL. If
1429 * *clusterp is supplied, skip NULL entries, otherwise
1430 * create new chains.
1432 for (i = 0; i < cparent->nchains; ++i) {
/*
 * An element lacking HAMMER2_CITEM_FEMOD, either in the parent or in the
 * cluster itself, gets the cluster element flagged HAMMER2_CITEM_INVALID.
 */
1433 if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1434 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1438 if ((cluster->array[i].flags &
1439 HAMMER2_CITEM_FEMOD) == 0) {
1440 cluster->array[i].flags |=
1441 HAMMER2_CITEM_INVALID;
1444 if (cluster->array[i].chain == NULL)
/* Create the chain for element i under the matching parent chain. */
1447 error = hammer2_chain_create(trans, &cparent->array[i].chain,
1448 &cluster->array[i].chain, pmp,
1450 type, bytes, flags);
/* The parent chain was passed by address; resync the parent focus. */
1451 if (cparent->focus_index == i)
1452 cparent->focus = cparent->array[i].chain;
1453 KKASSERT(error == 0);
/*
 * Focus selection: the first created chain is taken when no focus is set,
 * and the element whose parent chain is the parent focus is taken as well.
 */
1454 if (cluster->focus == NULL) {
1455 cluster->focus_index = i;
1456 cluster->focus = cluster->array[i].chain;
1458 if (cparent->focus == cparent->array[i].chain) {
1459 cluster->focus_index = i;
1460 cluster->focus = cluster->array[i].chain;
/* Record the loop index as the element count and return the cluster. */
1463 cluster->nchains = i;
1464 *clusterp = cluster;
1465 hammer2_cluster_resolve(cluster);
1471 * Rename a cluster to a new parent.
1473 * WARNING! Any passed-in bref is probably from hammer2_cluster_bref(),
1474 * So the data_off field is not relevant. Only the key and
/*
 * hammer2_cluster_rename() - rename each chain of a cluster under cparent.
 *
 * trans   - transaction passed to hammer2_chain_rename().
 * bref    - supplies the key and keybits for the renamed chains; only those
 *	     two fields are read here.
 * cparent - new parent cluster.  cparent->array[i].chain is passed by
 *	     address to hammer2_chain_rename().
 * cluster - cluster whose chains are renamed.
 *
 * NOTE(review): the remaining parameter(s) and the trailing arguments of
 * the hammer2_chain_rename() calls are on lines that are not part of this
 * listing - confirm against the complete file.
 */
1478 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
1479 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1482 hammer2_chain_t *chain;
1483 hammer2_blockref_t xbref;
/*
 * NOTE(review): the lines adjacent to this focus-reset group are missing
 * from this listing (possibly a conditional-compilation guard) - verify
 * that these four statements are actually compiled in.
 */
1487 cluster->focus = NULL;
1488 cparent->focus = NULL;
1489 cluster->focus_index = 0;
1490 cparent->focus_index = 0;
1493 for (i = 0; i < cluster->nchains; ++i) {
/* Elements lacking HAMMER2_CITEM_FEMOD are flagged INVALID. */
1494 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1495 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1498 chain = cluster->array[i].chain;
/*
 * Start from the blockref of the chain itself and override only key and
 * keybits with the values from the caller.
 */
1501 xbref = chain->bref;
1502 xbref.key = bref->key;
1503 xbref.keybits = bref->keybits;
1504 hammer2_chain_rename(trans, &xbref,
1505 &cparent->array[i].chain,
/*
 * Variant without a replacement blockref.  NOTE(review): the condition
 * selecting between the two calls is not visible here - presumably
 * whether bref is NULL; confirm.
 */
1508 hammer2_chain_rename(trans, NULL,
1509 &cparent->array[i].chain,
/* The parent chain was passed by address; resync the parent focus. */
1512 if (cparent->focus_index == i)
1513 cparent->focus = cparent->array[i].chain;
/* The rename is expected to leave the chain pointer of the cluster alone. */
1514 KKASSERT(cluster->array[i].chain == chain); /*remove*/
1520 * Mark a cluster deleted
/*
 * hammer2_cluster_delete() - delete the chains of a cluster.
 *
 * trans   - transaction passed to hammer2_chain_delete().
 * cparent - parent cluster; cparent->array[i].chain is the parent handed
 *	     to hammer2_chain_delete() for element i.  A NULL cparent is
 *	     reported with kprintf().
 * cluster - cluster whose chains are deleted.
 * flags   - forwarded to hammer2_chain_delete().
 */
1523 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1524 hammer2_cluster_t *cluster, int flags)
1526 hammer2_chain_t *chain;
1527 hammer2_chain_t *parent;
1530 if (cparent == NULL) {
1531 kprintf("cparent is NULL\n");
1535 for (i = 0; i < cluster->nchains; ++i) {
/* Elements lacking HAMMER2_CITEM_FEMOD are flagged INVALID. */
1536 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1537 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1540 parent = cparent->array[i].chain;
1541 chain = cluster->array[i].chain;
/*
 * Sanity check: the parent recorded in the chain should be the matching
 * cparent element; a mismatch is reported with kprintf().
 * NOTE(review): whether the delete below is skipped on a mismatch depends
 * on a line that is not part of this listing - confirm.
 */
1544 if (chain->parent != parent) {
1545 kprintf("hammer2_cluster_delete: parent "
1546 "mismatch chain=%p parent=%p against=%p\n",
1547 chain, chain->parent, parent);
1549 hammer2_chain_delete(trans, parent, chain, flags);
1555 * Create a snapshot of the specified {parent, ochain} with the specified
1556 * label. The originating hammer2_inode must be exclusively locked for
1559 * The ioctl code has already synced the filesystem.
/*
 * hammer2_cluster_snapshot() - create a snapshot PFS of ocluster.
 *
 * trans    - transaction used for inode creation, modification and flush.
 * ocluster - cluster being snapshotted.  Its focus supplies the source
 *	      inode data and the hammer2 mount (hmp).
 * pfs      - ioctl structure; pfs->name is the snapshot label.
 *
 * Outline of the visible steps: hash the label, create a new inode under
 * hmp->spmp->iroot with HAMMER2_INSERT_PFSROOT, stamp it as a master PFS
 * of subtype snapshot with a fresh fsid, tag the blockref of each new chain
 * with HAMMER2_BREF_FLAG_PFSROOT, copy the embedded blockset of the source
 * inode into the new inode, then modsync and flush the new cluster.
 *
 * hammer2_inode_create() reports its status through the local error.
 */
1562 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1563 hammer2_ioc_pfs_t *pfs)
1566 hammer2_cluster_t *ncluster;
1567 const hammer2_inode_data_t *ripdata;
1568 hammer2_inode_data_t *wipdata;
1569 hammer2_chain_t *nchain;
1570 hammer2_inode_t *nip;
1580 kprintf("snapshot %s\n", pfs->name);
/* The label length and its directory hash are computed from pfs->name. */
1582 name_len = strlen(pfs->name);
1583 lhc = hammer2_dirhash(pfs->name, name_len);
/* Read-only view of the source inode, taken from the focus of ocluster. */
1588 ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1590 opfs_clid = ripdata->pfs_clid;
1592 hmp = ocluster->focus->hmp; /* XXX find synchronized local disk */
1595 * Create the snapshot directory under the super-root
1597 * Set PFS type, generate a unique filesystem id, and generate
1598 * a cluster id. Use the same clid when snapshotting a PFS root,
1599 * which theoretically allows the snapshot to be used as part of
1600 * the same cluster (perhaps as a cache).
1602 * Copy the (flushed) blockref array. Theoretically we could use
1603 * chain_duplicate() but it becomes difficult to disentangle
1604 * the shared core so for now just brute-force it.
1610 nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1611 proc0.p_ucred, pfs->name, name_len,
1613 HAMMER2_INSERT_PFSROOT, &error);
/* Obtain writable inode data and stamp the PFS identity fields. */
1616 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1617 wipdata->pfs_type = HAMMER2_PFSTYPE_MASTER;
1618 wipdata->pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1619 wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT;
1620 kern_uuidgen(&wipdata->pfs_fsid, 1);
1623 * Give the snapshot its own private cluster. As a snapshot
1624 * no further synchronization with the original cluster will
/*
 * NOTE(review): pfs_clid is assigned by three statements below.  The
 * else/preprocessor lines that select among them are not part of this
 * listing - confirm which of them are live.
 */
1628 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1629 wipdata->pfs_clid = opfs_clid;
1631 kern_uuidgen(&wipdata->pfs_clid, 1);
1633 kern_uuidgen(&wipdata->pfs_clid, 1);
/*
 * Per-element pass over the new cluster: flag elements lacking
 * HAMMER2_CITEM_FEMOD as INVALID and tag the chain blockref as a PFS root.
 */
1635 for (i = 0; i < ncluster->nchains; ++i) {
1636 if ((ncluster->array[i].flags &
1637 HAMMER2_CITEM_FEMOD) == 0) {
1638 ncluster->array[i].flags |=
1639 HAMMER2_CITEM_INVALID;
1642 nchain = ncluster->array[i].chain;
1644 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1647 /* XXX can't set this unless we do an explicit flush, which
1648 we also need a pmp assigned to do, else the flush code
1649 won't flush ncluster because it thinks it is crossing a
1651 hammer2_cluster_set_chainflags(ncluster,
1652 HAMMER2_CHAIN_PFSBOUNDARY);
1655 /* XXX hack blockset copy */
1656 /* XXX doesn't work with real cluster */
/*
 * Copy the embedded blockset of the source inode into the snapshot inode.
 * Only single-chain source clusters are accepted (asserted).
 */
1657 KKASSERT(ocluster->nchains == 1);
1658 wipdata->u.blockset = ripdata->u.blockset;
1659 hammer2_cluster_modsync(ncluster);
/* Flush the chains of the new cluster. */
1660 for (i = 0; i < ncluster->nchains; ++i) {
1661 nchain = ncluster->array[i].chain;
1663 hammer2_flush(trans, nchain, 1);
/* Unlock the new inode together with its cluster. */
1665 hammer2_inode_unlock(nip, ncluster);
1671 * Return locked parent cluster given a locked child. The child remains
1672 * locked on return. The new parent's focus follows the child's focus
1673 * and the parent is always resolved.
1675 * We must temporarily unlock the passed-in cluster to avoid a deadlock
1676 * between elements of the cluster.
/*
 * hammer2_cluster_parent() - build the parent cluster of a locked cluster.
 *
 * cluster - child cluster.  It is unlocked while the parents are being
 *	     locked and is locked again with HAMMER2_RESOLVE_ALWAYS before
 *	     the end of the function.
 *
 * The parent cluster starts out as a copy of the child made with
 * hammer2_cluster_copy(); each element is then replaced by the parent
 * chain of the corresponding child chain.
 */
1679 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1681 hammer2_cluster_t *cparent;
/* Copy first, then release the child locks before taking parent locks. */
1684 cparent = hammer2_cluster_copy(cluster);
1685 hammer2_cluster_unlock(cluster);
1687 for (i = 0; i < cparent->nchains; ++i) {
1688 hammer2_chain_t *chain;
1689 hammer2_chain_t *rchain;
1692 * Calculate parent for each element. Old chain has an extra
1693 * ref for cparent but the lock remains with cluster.
1695 chain = cparent->array[i].chain;
/*
 * Ref and lock the current parent, then re-check chain->parent once the
 * lock is held.  The unlock/drop pair below undoes the attempt before the
 * loop condition reads chain->parent again.
 */
1698 while ((rchain = chain->parent) != NULL) {
1699 hammer2_chain_ref(rchain);
1700 hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1701 if (chain->parent == rchain)
1703 hammer2_chain_unlock(rchain);
1704 hammer2_chain_drop(rchain);
/* The parent focus follows the child focus. */
1706 if (cluster->focus == chain) {
1707 cparent->focus_index = i;
1708 cparent->focus = rchain;
/* Install the parent chain and drop the ref the copy held on the child. */
1710 cparent->array[i].chain = rchain;
1711 hammer2_chain_drop(chain);
/* Mark the parent cluster locked, resolve it, then relock the child. */
1713 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1714 hammer2_cluster_resolve(cparent);
1715 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1720 /************************************************************************
1722 ************************************************************************
1725 * WARNING! blockref[] array data is not universal. These functions should
1726 * only be used to access universal data.
1728 * NOTE! The rdata call will wait for at least one of the chain I/Os to
1729 * complete if necessary. The I/O's should have already been
1730 * initiated by the cluster_lock/chain_lock operation.
1732 * The cluster must already be in a modified state before wdata
1733 * is called. The data will already be available for this case.
/*
 * hammer2_cluster_rdata() - read-only access to the cluster media data.
 *
 * Returns the data pointer of the focus chain as a const pointer.
 * cluster->focus is dereferenced without a NULL check, so the cluster
 * must have a focus when this is called.
 */
1735 const hammer2_media_data_t *
1736 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1738 return(cluster->focus->data);
/*
 * hammer2_cluster_wdata() - writable access to the cluster media data.
 *
 * Asserts that hammer2_cluster_modified() holds for the cluster, then
 * returns the data pointer of the focus chain.  cluster->focus is
 * dereferenced without a NULL check.
 */
1741 hammer2_media_data_t *
1742 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1744 KKASSERT(hammer2_cluster_modified(cluster));
1745 return(cluster->focus->data);
1749 * Load cluster data asynchronously with callback.
1751 * The callback is made for the first validated data found, or NULL
1752 * if no valid data is available.
1754 * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1755 * in the inode with an exclusive lock held), the chain structure may be
1759 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1760 void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1762 hammer2_chain_t *chain;
1763 hammer2_iocb_t *iocb;
1765 hammer2_blockref_t *bref;
1769 * Try to find a chain whose data is already resolved. If none can
1770 * be found, start with the first chain.
1773 for (i = 0; i < cluster->nchains; ++i) {
1774 chain = cluster->array[i].chain;
1775 if (chain && chain->data)
1778 if (i == cluster->nchains) {
1779 chain = cluster->array[0].chain;
1783 iocb = &cluster->iocb;
1784 iocb->callback = callback;
1785 iocb->dio = NULL; /* for already-validated case */
1786 iocb->cluster = cluster;
1787 iocb->chain = chain;
1789 iocb->lbase = (off_t)i;
1794 * Data already validated
1802 * We must resolve to a device buffer, either by issuing I/O or
1803 * by creating a zero-fill element. We do not mark the buffer
1804 * dirty when creating a zero-fill element (the hammer2_chain_modify()
1805 * API must still be used to do that).
1807 * The device buffer is variable-sized in powers of 2 down
1808 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage
1809 * chunk always contains buffers of the same size. (XXX)
1811 * The minimum physical IO size may be larger than the variable
1814 * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
1815 * matches hammer2_devblksize()? Or does the freemap's
1816 * pre-zeroing handle the case for us?
1818 bref = &chain->bref;
1822 /* handled by callback? <- TODO XXX even needed for loads? */
1824 * The getblk() optimization for a 100% overwrite can only be used
1825 * if the physical block size matches the request.
1827 if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1828 chain->bytes == hammer2_devblksize(chain->bytes)) {
1829 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1830 KKASSERT(error == 0);
1838 * Otherwise issue a read
1840 hammer2_adjreadcounter(&chain->bref, chain->bytes);
1841 hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);