2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * The cluster module collects multiple chains representing the same
36 * information from different nodes into a single entity. It allows direct
37 * access to media data as long as it is not blockref array data (which
38 * will obviously have to be different at each node).
40 * This module also handles I/O dispatch, status rollup, and various
41 * mastership arrangements including quorum operations. It effectively
42 * presents one topology to the vnops layer.
44 * Many of the API calls mimic chain API calls but operate on clusters
45 * instead of chains. Please see hammer2_chain.c for more complete code
46 * documentation of the API functions.
48 * WARNING! This module is *extremely* complex. It must issue asynchronous
49 * locks and I/O, do quorum and/or master-slave processing, and
50 * it must operate properly even if some nodes are broken (which
51 * can also mean indefinite locks).
55 * Cluster operations can be broken down into three pieces:
57 * (1) Chain locking and data retrieval.
58 * hammer2_cluster_lock()
59 * hammer2_cluster_parent()
61 * - Most complex functions, quorum management on transaction ids.
63 * - Locking and data accesses must be internally asynchronous.
65 * - Validate and manage cache coherency primitives (cache state
66 * is stored in chain topologies but must be validated by these
69 * (2) Lookups and Scans
70 * hammer2_cluster_lookup()
71 * hammer2_cluster_next()
73 * - Depend on locking & data retrieval functions, but still complex.
75 * - Must do quorum management on transaction ids.
77 * - Lookup and Iteration ops Must be internally asynchronous.
79 * (3) Modifying Operations
80 * hammer2_cluster_create()
81 * hammer2_cluster_rename()
82 * hammer2_cluster_delete()
83 * hammer2_cluster_modify()
84 * hammer2_cluster_modsync()
86 * - Can usually punt on failures, operation continues unless quorum
87 * is lost. If quorum is lost, must wait for resynchronization
88 * (depending on the management mode).
90 * - Must disconnect node on failures (also not flush), remount, and
93 * - Network links (via kdmsg) are relatively easy to issue as the
94 * complex underworkings of hammer2_chain.c don't have to be messed
94 * with (the protocol is at a higher level than block-level).
95 * with (the protocol is at a higher level than block-level).
97 * - Multiple local disk nodes (i.e. block devices) are another matter.
98 * Chain operations have to be dispatched to per-node threads (xN)
99 * because we can't asynchronize potentially very complex chain
100 * operations in hammer2_chain.c (it would be a huge mess).
102 * (these threads are also used to terminate incoming kdmsg ops from
105 * - Single-node filesystems do not use threads and will simply call
106 * hammer2_chain.c functions directly. This short-cut is handled
107 * at the base of each cluster function.
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
119 * Returns non-zero if any chain in the cluster needs to be resized.
120 * Errored elements are not used in the calculation.
/*
 * NOTE(review): this file is a sparse line-numbered extraction of the
 * original source; intermediate lines (braces, declarations, returns)
 * are missing from the fragments below.
 */
123 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
125 hammer2_chain_t *chain;
/* caller must already hold the cluster lock */
128 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
/* compare each chain's current physical size against the requested size */
129 for (i = 0; i < cluster->nchains; ++i) {
130 chain = cluster->array[i].chain;
135 if (chain->bytes != bytes)
142 * Returns the bref type of the cluster's focus.
144 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
145 * The cluster must be locked.
148 hammer2_cluster_type(hammer2_cluster_t *cluster)
150 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
/* only a non-errored cluster has a usable focus chain */
151 if (cluster->error == 0)
152 return(cluster->focus->bref.type);
157 * Returns non-zero if the cluster's focus is flagged as being modified.
159 * If the cluster is errored, returns 0.
162 hammer2_cluster_modified(hammer2_cluster_t *cluster)
164 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
165 if (cluster->error == 0)
/* test the MODIFIED flag on the focus chain only */
166 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
171 * Returns the bref of the cluster's focus, sans any data-offset information
172 * (since offset information is per-node and wouldn't be useful).
174 * Callers use this function to access mirror_tid, key, and keybits.
176 * If the cluster is errored, returns an empty (zeroed) bref.
177 * The cluster must be locked.
180 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
182 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
183 if (cluster->error == 0) {
/* structure copy of the focus chain's blockref */
184 *bref = cluster->focus->bref;
/* errored cluster: hand back a zeroed bref instead */
187 bzero(bref, sizeof(*bref));
192 * Return non-zero if the chain representing an inode has been flagged
193 * as having been unlinked. Allows the vnode reclaim to avoid loading
194 * the inode data from disk e.g. when unmount or recycling old, clean
197 * The cluster does not need to be locked.
198 * The focus cannot be used since the cluster might not be locked.
201 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster)
203 hammer2_chain_t *chain;
/* OR together the flags of every chain, then test the UNLINKED bit */
208 for (i = 0; i < cluster->nchains; ++i) {
209 chain = cluster->array[i].chain;
211 flags |= chain->flags;
213 return (flags & HAMMER2_CHAIN_UNLINKED);
217 * Set a bitmask of flags in all chains related to a cluster.
218 * The cluster should probably be locked.
221 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
223 hammer2_chain_t *chain;
226 for (i = 0; i < cluster->nchains; ++i) {
227 chain = cluster->array[i].chain;
/* atomically OR the requested flag bits into each chain */
229 atomic_set_int(&chain->flags, flags);
234 * Clear a bitmask of flags in all chains related to a cluster.
235 * The cluster should probably be locked.
238 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
240 hammer2_chain_t *chain;
243 for (i = 0; i < cluster->nchains; ++i) {
244 chain = cluster->array[i].chain;
/* atomically clear the requested flag bits in each chain */
246 atomic_clear_int(&chain->flags, flags);
251 * Flag the cluster for flushing recursively up to the root. Despite the
252 * work it does, this is relatively benign. It just makes sure that the
253 * flusher has top-down visibility to this cluster.
255 * Errored chains are not flagged for flushing.
257 * The cluster should probably be locked.
260 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
262 hammer2_chain_t *chain;
265 for (i = 0; i < cluster->nchains; ++i) {
266 chain = cluster->array[i].chain;
/* propagate the flush flag up the chain topology */
271 hammer2_chain_setflush(trans, chain);
276 * Set the check mode for the cluster.
277 * Errored elements of the cluster are ignored.
279 * The cluster must be locked.
282 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
283 hammer2_cluster_t *cluster,
286 hammer2_chain_t *chain;
289 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
290 for (i = 0; i < cluster->nchains; ++i) {
291 chain = cluster->array[i].chain;
/* a chain's methods may only be changed while it is marked modified */
296 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
/* replace only the check-algorithm bits of bref.methods */
297 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
298 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
303 * Create a degenerate cluster with one ref from a single locked chain.
304 * The returned cluster will be focused on the chain and inherit its
307 * The chain's lock and reference are transferred to the new cluster, so
308 * the caller should not try to unlock the chain separately.
313 hammer2_cluster_from_chain(hammer2_chain_t *chain)
315 hammer2_cluster_t *cluster;
317 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
318 cluster->array[0].chain = chain;
319 cluster->nchains = 1;
320 cluster->focus = chain;
321 cluster->pmp = chain->pmp;
/* a single-chain cluster inherits the chain's error state directly */
323 cluster->error = chain->error;
/* one chain trivially satisfies all lock/sync conditions */
324 cluster->flags = HAMMER2_CLUSTER_LOCKED |
325 HAMMER2_CLUSTER_WRHARD |
326 HAMMER2_CLUSTER_RDHARD |
327 HAMMER2_CLUSTER_MSYNCED |
328 HAMMER2_CLUSTER_SSYNCED;
334 * Add a reference to a cluster and its underlying chains.
336 * We must also ref the underlying chains in order to allow ref/unlock
337 * sequences to later re-lock.
340 hammer2_cluster_ref(hammer2_cluster_t *cluster)
342 hammer2_chain_t *chain;
/* bump the cluster refcount first, then each live chain */
345 atomic_add_int(&cluster->refs, 1);
346 for (i = 0; i < cluster->nchains; ++i) {
347 chain = cluster->array[i].chain;
349 hammer2_chain_ref(chain);
354 * Drop the caller's reference to the cluster. When the ref count drops to
355 * zero this function frees the cluster and drops all underlying chains.
357 * In-progress read I/Os are typically detached from the cluster once the
358 * first one returns (the remaining stay attached to the DIOs but are then
359 * ignored and drop naturally).
362 hammer2_cluster_drop(hammer2_cluster_t *cluster)
364 hammer2_chain_t *chain;
367 KKASSERT(cluster->refs > 0);
368 for (i = 0; i < cluster->nchains; ++i) {
369 chain = cluster->array[i].chain;
371 hammer2_chain_drop(chain);
/* last ref: NULL out the slot so a stale pointer cannot be reused */
372 if (cluster->refs == 1)
373 cluster->array[i].chain = NULL;
/* atomically drop our ref; free the cluster if we held the last one */
376 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
377 cluster->focus = NULL; /* safety XXX chg to assert */
378 kfree(cluster, M_HAMMER2);
379 /* cluster is invalid */
/*
 * Sleep briefly (one tick) on the cluster's focus chain, giving other
 * threads a chance to make progress before the caller retries.
 */
384 hammer2_cluster_wait(hammer2_cluster_t *cluster)
386 tsleep(cluster->focus, 0, "h2clcw", 1);
390 * Lock and ref a cluster. This adds a ref to the cluster and its chains
391 * and then locks them, modified by various RESOLVE flags.
393 * The act of locking a cluster sets its focus.
395 * The chains making up the cluster may be narrowed down based on quorum
396 * acceptability, and if RESOLVE_RDONLY is specified the chains can be
397 * narrowed down to a single chain as long as the entire subtopology is known
398 * to be intact. So, for example, we can narrow a read-only op to a single
399 * fast SLAVE but if we focus a CACHE chain we must still retain at least
400 * a SLAVE to ensure that the subtopology can be accessed.
402 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
403 * to be maintained once the topology is validated as-of the top level of
406 * If a failure occurs the operation must be aborted by higher-level code and
410 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
412 hammer2_chain_t *chain;
415 /* cannot be on inode-embedded cluster template, must be on copy */
416 KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
/* double-lock is a bug in the caller; report it loudly */
417 if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
418 kprintf("hammer2_cluster_lock: cluster %p already locked!\n",
/* an unlocked cluster must not have a stale focus */
421 KKASSERT(cluster->focus == NULL);
423 atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
/* NOREF lets a caller lock without adding another cluster ref */
425 if ((how & HAMMER2_RESOLVE_NOREF) == 0)
426 atomic_add_int(&cluster->refs, 1);
429 * Lock chains and resolve state.
431 for (i = 0; i < cluster->nchains; ++i) {
432 chain = cluster->array[i].chain;
435 hammer2_chain_lock(chain, how);
/* compute quorum state and pick the focus */
438 hammer2_cluster_resolve(cluster);
/*
 * Resolve a locked cluster: count masters/slaves, compute the quorum
 * mirror_tid, select the focus chain, and set/clear the cluster's
 * synchronization flags accordingly.
 */
442 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
444 hammer2_chain_t *chain;
446 hammer2_tid_t quorum_tid;
470 KKASSERT(pmp != NULL || cluster->nchains == 0);
/* simple majority of configured masters is required for quorum */
471 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
/*
 * Pass 1: tally node types and determine the quorum mirror_tid.
 */
476 for (i = 0; i < cluster->nchains; ++i) {
477 chain = cluster->array[i].chain;
481 if (cluster->focus == NULL || cluster->focus == chain) {
482 /* error will be overridden by valid focus */
483 cluster->error = chain->error;
487 * Must count total masters and slaves whether the
488 * chain is errored or not.
490 switch (cluster->pmp->pfs_types[i]) {
491 case HAMMER2_PFSTYPE_MASTER:
494 case HAMMER2_PFSTYPE_SLAVE:
501 switch (cluster->pmp->pfs_types[i]) {
502 case HAMMER2_PFSTYPE_MASTER:
/* track the highest master mirror_tid seen so far */
504 if (quorum_tid < chain->bref.mirror_tid ||
507 quorum_tid = chain->bref.mirror_tid;
508 } else if (quorum_tid == chain->bref.mirror_tid) {
512 case HAMMER2_PFSTYPE_SLAVE:
515 case HAMMER2_PFSTYPE_SOFT_MASTER:
516 nflags |= HAMMER2_CLUSTER_WRSOFT;
517 nflags |= HAMMER2_CLUSTER_RDSOFT;
519 case HAMMER2_PFSTYPE_SOFT_SLAVE:
520 nflags |= HAMMER2_CLUSTER_RDSOFT;
522 case HAMMER2_PFSTYPE_SUPROOT:
524 * Degenerate cluster representing the super-root
525 * topology on a single device.
527 nflags |= HAMMER2_CLUSTER_WRHARD;
528 nflags |= HAMMER2_CLUSTER_RDHARD;
529 cluster->focus = chain;
530 cluster->error = chain->error;
/*
 * Pass 2: with the quorum tid known, pick the focus chain and set
 * the hard/soft read/write capability flags.
 */
540 for (i = 0; i < cluster->nchains; ++i) {
541 chain = cluster->array[i].chain;
545 if (cluster->focus == NULL || cluster->focus == chain) {
546 /* error will be overridden by valid focus */
547 cluster->error = chain->error;
552 switch (cluster->pmp->pfs_types[i]) {
553 case HAMMER2_PFSTYPE_MASTER:
555 * We must have enough up-to-date masters to reach
556 * a quorum and the master mirror_tid must match
557 * the quorum's mirror_tid.
559 * Do not select an errored master.
561 if (nmasters >= nquorum &&
563 quorum_tid == chain->bref.mirror_tid) {
564 nflags |= HAMMER2_CLUSTER_WRHARD;
565 nflags |= HAMMER2_CLUSTER_RDHARD;
/* a master focus overrides a previously-chosen slave focus */
566 if (cluster->focus == NULL ||
567 focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
568 focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
569 cluster->focus = chain;
570 cluster->error = chain->error;
572 } else if (chain->error == 0) {
573 nflags |= HAMMER2_CLUSTER_UNHARD;
576 case HAMMER2_PFSTYPE_SLAVE:
578 * We must have enough up-to-date masters to reach
579 * a quorum and the slave mirror_tid must match the
580 * quorum's mirror_tid.
582 * Do not select an errored slave.
584 if (nmasters >= nquorum &&
586 quorum_tid == chain->bref.mirror_tid) {
588 nflags |= HAMMER2_CLUSTER_RDHARD;
589 if (cluster->focus == NULL) {
590 focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
591 cluster->focus = chain;
592 cluster->error = chain->error;
594 } else if (chain->error == 0) {
595 nflags |= HAMMER2_CLUSTER_UNSOFT;
598 case HAMMER2_PFSTYPE_SOFT_MASTER:
600 * Directly mounted soft master always wins. There
601 * should be only one.
603 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
604 cluster->focus = chain;
605 cluster->error = chain->error;
606 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
608 case HAMMER2_PFSTYPE_SOFT_SLAVE:
610 * Directly mounted soft slave always wins. There
611 * should be only one.
613 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
/* ...unless a soft master has already been selected */
614 if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
615 cluster->focus = chain;
616 cluster->error = chain->error;
617 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
626 nflags |= HAMMER2_CLUSTER_NOHARD;
628 nflags |= HAMMER2_CLUSTER_NOSOFT;
631 * Set SSYNCED or MSYNCED for slaves and masters respectively if
632 * all available nodes (even if 0 are available) are fully
633 * synchronized. This is used by the synchronization thread to
634 * determine if there is work it could potentially accomplish.
636 if (nslaves == ttlslaves)
637 nflags |= HAMMER2_CLUSTER_SSYNCED;
638 if (nmasters == ttlmasters)
639 nflags |= HAMMER2_CLUSTER_MSYNCED;
642 * Determine if the cluster was successfully locked for the
643 * requested operation and generate an error code. The cluster
644 * will not be locked (or ref'd) if an error is returned.
646 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
647 * to determine if reading or writing is possible. If writing, the
648 * cluster still requires a call to hammer2_cluster_modify() first.
/* publish the newly computed flags; clear stale ZFLAGS bits not set now */
650 atomic_set_int(&cluster->flags, nflags);
651 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
655 * Copy a cluster, returning a ref'd cluster. All underlying chains
656 * are also ref'd, but not locked.
658 * The cluster focus is not set because the cluster is not yet locked
659 * (and the originating cluster does not have to be locked either).
662 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
664 hammer2_pfs_t *pmp = ocluster->pmp;
665 hammer2_cluster_t *ncluster;
666 hammer2_chain_t *chain;
669 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
671 ncluster->nchains = ocluster->nchains;
673 ncluster->flags = 0; /* cluster not locked */
/* share the chain pointers and take a ref on each live chain */
675 for (i = 0; i < ocluster->nchains; ++i) {
676 chain = ocluster->array[i].chain;
677 ncluster->array[i].chain = chain;
679 hammer2_chain_ref(chain);
685 * Unlock and deref a cluster. The cluster is destroyed if this is the
689 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
691 hammer2_chain_t *chain;
/* unlocking an unlocked cluster is a caller bug; report it loudly */
694 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
695 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
698 /* KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); */
699 KKASSERT(cluster->refs > 0);
700 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
702 for (i = 0; i < cluster->nchains; ++i) {
703 chain = cluster->array[i].chain;
705 hammer2_chain_unlock(chain);
/* last ref: clear the slot so a stale pointer cannot be reused */
706 if (cluster->refs == 1)
707 cluster->array[i].chain = NULL; /* safety */
/* focus is only meaningful while locked */
710 cluster->focus = NULL;
/* drop our ref; free the cluster if we held the last one */
712 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
713 kfree(cluster, M_HAMMER2);
714 /* cluster = NULL; safety */
719 * Resize the cluster's physical storage allocation in-place. This may
720 * replace the cluster's chains.
723 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
724 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
725 int nradix, int flags)
727 hammer2_chain_t *chain;
/* parent and child clusters must be congruent, element by element */
730 KKASSERT(cparent->pmp == cluster->pmp); /* can be NULL */
731 KKASSERT(cparent->nchains == cluster->nchains);
733 for (i = 0; i < cluster->nchains; ++i) {
734 chain = cluster->array[i].chain;
736 KKASSERT(cparent->array[i].chain);
/* delegate the per-node resize to the chain layer */
737 hammer2_chain_resize(trans, ip,
738 cparent->array[i].chain, chain,
745 * Set an inode's cluster modified, marking the related chains RW and
746 * duplicating them if necessary.
748 * The passed-in chain is a localized copy of the chain previously acquired
749 * when the inode was locked (and possibly replaced in the meantime), and
750 * must also be updated. In fact, we update it first and then synchronize
751 * the inode's cluster cache.
753 hammer2_inode_data_t *
754 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
755 hammer2_cluster_t *cluster, int flags)
/* flag the inode itself as modified before touching the chains */
757 atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
758 hammer2_cluster_modify(trans, cluster, flags);
/* re-point the inode's embedded cluster cache at the updated chains */
760 hammer2_inode_repoint(ip, NULL, cluster);
/* return writable inode meta-data from the (now modified) focus */
763 return (&hammer2_cluster_wdata(cluster)->ipdata);
767 * Adjust the cluster's chains to allow modification and adjust the
768 * focus. Data will be accessible on return.
770 * If our focused master errors on modify, re-resolve the cluster to
771 * try to select a different master.
774 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
777 hammer2_chain_t *chain;
782 for (i = 0; i < cluster->nchains; ++i) {
783 chain = cluster->array[i].chain;
788 hammer2_chain_modify(trans, chain, flags);
/* a modify error on the focus forces a re-resolve below */
789 if (cluster->focus == chain && chain->error) {
790 cluster->error = chain->error;
795 hammer2_cluster_resolve(cluster);
799 * Synchronize modifications from the focus to other chains in a cluster.
800 * Convenient because nominal API users can just modify the contents of the
801 * focus (at least for non-blockref data).
803 * Nominal front-end operations only edit non-block-table data in a single
804 * chain. This code copies such modifications to the other chains in the
805 * cluster. Blocktable modifications are handled on a chain-by-chain basis
806 * by both the frontend and the backend and will explode in fireworks if
810 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
812 hammer2_chain_t *focus;
813 hammer2_chain_t *scan;
814 const hammer2_inode_data_t *ripdata;
815 hammer2_inode_data_t *wipdata;
818 focus = cluster->focus;
/* the focus must already be in a modified (writable) state */
819 KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
821 for (i = 0; i < cluster->nchains; ++i) {
822 scan = cluster->array[i].chain;
/* nothing to copy onto a missing chain or the focus itself */
823 if (scan == NULL || scan == focus)
827 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
/* focus and target must be structurally identical */
828 KKASSERT(focus->bytes == scan->bytes &&
829 focus->bref.type == scan->bref.type);
830 switch(focus->bref.type) {
831 case HAMMER2_BREF_TYPE_INODE:
832 ripdata = &focus->data->ipdata;
833 wipdata = &scan->data->ipdata;
/* non-DIRECTDATA inode: copy meta-data only, not the u union */
834 if ((ripdata->op_flags &
835 HAMMER2_OPFLAG_DIRECTDATA) == 0) {
836 bcopy(ripdata, wipdata,
837 offsetof(hammer2_inode_data_t, u));
840 /* fall through to full copy */
841 case HAMMER2_BREF_TYPE_DATA:
842 bcopy(focus->data, scan->data, focus->bytes);
/* block-table bearing types must never be modsync'd */
844 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
845 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
846 case HAMMER2_BREF_TYPE_FREEMAP:
847 case HAMMER2_BREF_TYPE_VOLUME:
848 panic("hammer2_cluster_modsync: illegal node type");
852 panic("hammer2_cluster_modsync: unknown node type");
859 * Lookup initialization/completion API
/*
 * Allocate and return a locked copy of cparent to be used as the parent
 * for a lookup/next iteration; release with hammer2_cluster_lookup_done().
 */
862 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
864 hammer2_cluster_t *cluster;
867 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
868 cluster->pmp = cparent->pmp; /* can be NULL */
869 cluster->flags = 0; /* cluster not locked (yet) */
870 /* cluster->focus = NULL; already null */
872 for (i = 0; i < cparent->nchains; ++i)
873 cluster->array[i].chain = cparent->array[i].chain;
874 cluster->nchains = cparent->nchains;
877 * Independently lock (this will also give cluster 1 ref)
879 if (flags & HAMMER2_LOOKUP_SHARED) {
880 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
881 HAMMER2_RESOLVE_SHARED);
883 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
/*
 * Terminate a lookup sequence started with hammer2_cluster_lookup_init(),
 * unlocking (and via the embedded ref, eventually freeing) the copy.
 */
889 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
892 hammer2_cluster_unlock(cparent);
896 * Locate first match or overlap under parent, return a new cluster
899 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
900 hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
903 hammer2_cluster_t *cluster;
904 hammer2_chain_t *chain;
905 hammer2_key_t key_accum;
906 hammer2_key_t key_next;
907 hammer2_key_t bref_key;
914 pmp = cparent->pmp; /* can be NULL */
/* key_accum tracks the minimum next-key over all per-node lookups */
915 key_accum = *key_nextp;
922 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
923 cluster->pmp = pmp; /* can be NULL */
925 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
926 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
/* perform the lookup independently on each node's chain */
928 for (i = 0; i < cparent->nchains; ++i) {
929 key_next = *key_nextp;
930 if (cparent->array[i].chain == NULL) {
934 chain = hammer2_chain_lookup(&cparent->array[i].chain,
937 &cparent->array[i].cache_index,
939 cluster->array[i].chain = chain;
942 } else if (chain->error) {
944 * Leave errored chain in cluster, but it cannot be
945 * the cluster's focus.
949 int ddflag = (chain->bref.type ==
950 HAMMER2_BREF_TYPE_INODE);
/* first valid result establishes the reference bref/ddflag */
955 if (cluster->focus == NULL) {
956 bref_type = chain->bref.type;
957 bref_key = chain->bref.key;
958 bref_keybits = chain->bref.keybits;
959 bytes = chain->bytes;
960 cluster->ddflag = ddflag;
961 cluster->focus = chain;
965 * Override default focus to follow the parent.
967 if (cparent->focus == cparent->array[i].chain)
968 cluster->focus = chain;
/* all nodes must agree on the element's identity and size */
970 KKASSERT(bref_type == chain->bref.type);
971 KKASSERT(bref_key == chain->bref.key);
972 KKASSERT(bref_keybits == chain->bref.keybits);
973 KKASSERT(bytes == chain->bytes);
974 KKASSERT(cluster->ddflag == ddflag);
976 if (key_accum > key_next)
977 key_accum = key_next;
979 *key_nextp = key_accum;
980 cluster->nchains = i;
981 hammer2_cluster_resolve(cluster);
/* every node came up empty: no match, drop the new cluster */
983 if (null_count == i) {
984 hammer2_cluster_drop(cluster);
992 * Locate next match or overlap under parent, replace cluster
995 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
996 hammer2_key_t *key_nextp,
997 hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
999 hammer2_chain_t *chain;
1000 hammer2_key_t key_accum;
1001 hammer2_key_t key_next;
1002 hammer2_key_t bref_key;
/* key_accum tracks the minimum next-key over all per-node iterations */
1009 key_accum = *key_nextp;
/* reset focus state; it is re-established from the new results */
1011 cluster->focus = NULL;
1012 cparent->focus = NULL;
1018 cluster->ddflag = 0;
1020 for (i = 0; i < cparent->nchains; ++i) {
1021 key_next = *key_nextp;
1022 chain = cluster->array[i].chain;
1023 if (chain == NULL) {
/* parent slot is gone: release this node's chain and skip it */
1027 if (cparent->array[i].chain == NULL) {
1028 if (flags & HAMMER2_LOOKUP_NOLOCK)
1029 hammer2_chain_drop(chain);
1031 hammer2_chain_unlock(chain);
/* advance the per-node iteration */
1035 chain = hammer2_chain_next(&cparent->array[i].chain, chain,
1036 &key_next, key_beg, key_end,
1037 &cparent->array[i].cache_index,
1039 cluster->array[i].chain = chain;
1040 if (chain == NULL) {
1042 } else if (chain->error) {
1044 * Leave errored chain in cluster, but it cannot be
1045 * the cluster's focus.
1049 int ddflag = (chain->bref.type ==
1050 HAMMER2_BREF_TYPE_INODE);
/* first valid result establishes the reference bref/ddflag */
1051 if (cluster->focus == NULL) {
1052 bref_type = chain->bref.type;
1053 bref_key = chain->bref.key;
1054 bref_keybits = chain->bref.keybits;
1055 bytes = chain->bytes;
1056 cluster->ddflag = ddflag;
1057 cluster->focus = chain;
1061 * Override default focus to follow the parent.
1063 if (cparent->focus == cparent->array[i].chain)
1064 cluster->focus = chain;
/* all nodes must agree on the element's identity and size */
1066 KKASSERT(bref_type == chain->bref.type);
1067 KKASSERT(bref_key == chain->bref.key);
1068 KKASSERT(bref_keybits == chain->bref.keybits);
1069 KKASSERT(bytes == chain->bytes);
1070 KKASSERT(cluster->ddflag == ddflag);
1072 if (key_accum > key_next)
1073 key_accum = key_next;
1075 cluster->nchains = i;
1076 hammer2_cluster_resolve(cluster);
/* every node came up empty: iteration is exhausted */
1078 if (null_count == i) {
1079 hammer2_cluster_drop(cluster);
1086 * Create a new cluster using the specified key
1089 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1090 hammer2_cluster_t **clusterp,
1091 hammer2_key_t key, int keybits,
1092 int type, size_t bytes, int flags)
1094 hammer2_cluster_t *cluster;
1099 pmp = trans->pmp; /* can be NULL */
/* allocate a fresh cluster if the caller did not supply one */
1101 if ((cluster = *clusterp) == NULL) {
1102 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1104 cluster->pmp = pmp; /* can be NULL */
1106 cluster->flags = HAMMER2_CLUSTER_LOCKED;
1108 cluster->focus = NULL;
1111 * NOTE: cluster->array[] entries can initially be NULL. If
1112 * *clusterp is supplied, skip NULL entries, otherwise
1113 * create new chains.
1115 for (i = 0; i < cparent->nchains; ++i) {
1116 if (*clusterp && cluster->array[i].chain == NULL) {
1119 error = hammer2_chain_create(trans, &cparent->array[i].chain,
1120 &cluster->array[i].chain, pmp,
1122 type, bytes, flags);
1123 KKASSERT(error == 0);
/* default focus is the first created chain; parent focus wins */
1124 if (cluster->focus == NULL)
1125 cluster->focus = cluster->array[i].chain;
1126 if (cparent->focus == cparent->array[i].chain)
1127 cluster->focus = cluster->array[i].chain;
1129 cluster->nchains = i;
1130 *clusterp = cluster;
1131 hammer2_cluster_resolve(cluster);
1137 * Rename a cluster to a new parent.
1139 * WARNING! Any passed-in bref is probably from hammer2_cluster_bref(),
1140 * So the data_off field is not relevant. Only the key and
1144 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
1145 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1148 hammer2_chain_t *chain;
1149 hammer2_blockref_t xbref;
/* focus is reset; it is re-established by a later resolve */
1152 cluster->focus = NULL;
1153 cparent->focus = NULL;
1155 for (i = 0; i < cluster->nchains; ++i) {
1156 chain = cluster->array[i].chain;
/* build a per-chain bref carrying only the caller's key/keybits */
1159 xbref = chain->bref;
1160 xbref.key = bref->key;
1161 xbref.keybits = bref->keybits;
1162 hammer2_chain_rename(trans, &xbref,
1163 &cparent->array[i].chain,
/* no bref supplied: rename in place under the new parent */
1166 hammer2_chain_rename(trans, NULL,
1167 &cparent->array[i].chain,
1170 KKASSERT(cluster->array[i].chain == chain); /*remove*/
1176 * Mark a cluster deleted
1179 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1180 hammer2_cluster_t *cluster, int flags)
1182 hammer2_chain_t *chain;
1183 hammer2_chain_t *parent;
/* a NULL parent cluster is unexpected; report and bail */
1186 if (cparent == NULL) {
1187 kprintf("cparent is NULL\n");
1191 for (i = 0; i < cluster->nchains; ++i) {
1192 parent = (i < cparent->nchains) ?
1193 cparent->array[i].chain : NULL;
1194 chain = cluster->array[i].chain;
/* diagnostic: each chain's actual parent should match cparent's slot */
1197 if (chain->parent != parent) {
1198 kprintf("hammer2_cluster_delete: parent "
1199 "mismatch chain=%p parent=%p against=%p\n",
1200 chain, chain->parent, parent);
1202 hammer2_chain_delete(trans, parent, chain, flags);
1208 * Create a snapshot of the specified {parent, ochain} with the specified
1209 * label. The originating hammer2_inode must be exclusively locked for
1212 * The ioctl code has already synced the filesystem.
1215 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1216 hammer2_ioc_pfs_t *pfs)
1219 hammer2_cluster_t *ncluster;
1220 const hammer2_inode_data_t *ripdata;
1221 hammer2_inode_data_t *wipdata;
1222 hammer2_chain_t *nchain;
1223 hammer2_inode_t *nip;
1233 kprintf("snapshot %s\n", pfs->name);
/* hash the snapshot label into a directory key */
1235 name_len = strlen(pfs->name);
1236 lhc = hammer2_dirhash(pfs->name, name_len);
/* remember the originating PFS cluster id before creating anything */
1241 ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1243 opfs_clid = ripdata->pfs_clid;
1245 hmp = ocluster->focus->hmp; /* XXX find synchronized local disk */
1248 * Create the snapshot directory under the super-root
1250 * Set PFS type, generate a unique filesystem id, and generate
1251 * a cluster id. Use the same clid when snapshotting a PFS root,
1252 * which theoretically allows the snapshot to be used as part of
1253 * the same cluster (perhaps as a cache).
1255 * Copy the (flushed) blockref array. Theoretically we could use
1256 * chain_duplicate() but it becomes difficult to disentangle
1257 * the shared core so for now just brute-force it.
1263 nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1264 proc0.p_ucred, pfs->name, name_len,
1266 HAMMER2_INSERT_PFSROOT, &error);
/* mark the new inode as a master PFS root of subtype SNAPSHOT */
1269 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1270 wipdata->pfs_type = HAMMER2_PFSTYPE_MASTER;
1271 wipdata->pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1272 wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT;
1273 kern_uuidgen(&wipdata->pfs_fsid, 1);
1276 * Give the snapshot its own private cluster. As a snapshot
1277 * no further synchronization with the original cluster will
/* reuse the original clid only when snapshotting a PFS boundary */
1281 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1282 wipdata->pfs_clid = opfs_clid;
1284 kern_uuidgen(&wipdata->pfs_clid, 1);
1286 kern_uuidgen(&wipdata->pfs_clid, 1);
1288 for (i = 0; i < ncluster->nchains; ++i) {
1289 nchain = ncluster->array[i].chain;
1291 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1294 /* XXX can't set this unless we do an explicit flush, which
1295 we also need a pmp assigned to do, else the flush code
1296 won't flush ncluster because it thinks it is crossing a
1298 hammer2_cluster_set_chainflags(ncluster,
1299 HAMMER2_CHAIN_PFSBOUNDARY);
1302 /* XXX hack blockset copy */
1303 /* XXX doesn't work with real cluster */
1304 KKASSERT(ocluster->nchains == 1);
1305 wipdata->u.blockset = ripdata->u.blockset;
/* propagate the focus edits to the other chains, then flush */
1306 hammer2_cluster_modsync(ncluster);
1307 for (i = 0; i < ncluster->nchains; ++i) {
1308 nchain = ncluster->array[i].chain;
1310 hammer2_flush(trans, nchain);
1312 hammer2_inode_unlock(nip, ncluster);
1318 * Return locked parent cluster given a locked child. The child remains
1319 * locked on return. The new parent's focus follows the child's focus
1320 * and the parent is always resolved.
1323 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1325 hammer2_cluster_t *cparent;
/* start from a ref'd (unlocked) copy of the child cluster */
1328 cparent = hammer2_cluster_copy(cluster);
1330 for (i = 0; i < cparent->nchains; ++i) {
1331 hammer2_chain_t *chain;
1332 hammer2_chain_t *rchain;
1335 * Calculate parent for each element. Old chain has an extra
1336 * ref for cparent but the lock remains with cluster.
1338 chain = cparent->array[i].chain;
/*
 * Lock-order dance: ref the parent, drop the child lock, lock
 * parent then child, and retry if the parent changed underneath us.
 */
1341 while ((rchain = chain->parent) != NULL) {
1342 hammer2_chain_ref(rchain);
1343 hammer2_chain_unlock(chain);
1344 hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1345 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1346 hammer2_chain_drop(rchain);
/* stable: the locked rchain is still the child's parent */
1347 if (chain->parent == rchain)
1349 hammer2_chain_unlock(rchain);
/* parent focus follows the child's focus */
1351 if (cluster->focus == chain)
1352 cparent->focus = rchain;
1353 cparent->array[i].chain = rchain;
1354 hammer2_chain_drop(chain);
1356 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1357 hammer2_cluster_resolve(cparent);
1362 /************************************************************************
1364 ************************************************************************
1367 * WARNING! blockref[] array data is not universal. These functions should
1368 * only be used to access universal data.
1370 * NOTE! The rdata call will wait for at least one of the chain I/Os to
1371 * complete if necessary. The I/Os should have already been
1372 * initiated by the cluster_lock/chain_lock operation.
1374 * The cluster must already be in a modified state before wdata
1375 * is called. The data will already be available for this case.
1377 const hammer2_media_data_t *
1378 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1380 return(cluster->focus->data);
1383 hammer2_media_data_t *
1384 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1386 KKASSERT(hammer2_cluster_modified(cluster));
1387 return(cluster->focus->data);
1391 * Load cluster data asynchronously with callback.
1393 * The callback is made for the first validated data found, or NULL
1394 * if no valid data is available.
1396 * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1397 * in the inode with an exclusive lock held), the chain structure may be
1401 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1402 void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1404 hammer2_chain_t *chain;
1405 hammer2_iocb_t *iocb;
1407 hammer2_blockref_t *bref;
1411 * Try to find a chain whose data is already resolved. If none can
1412 * be found, start with the first chain.
1415 for (i = 0; i < cluster->nchains; ++i) {
1416 chain = cluster->array[i].chain;
1417 if (chain && chain->data)
	/* fall back to element 0 when no chain had resolved data */
1420 if (i == cluster->nchains) {
1421 chain = cluster->array[0].chain;
	/*
	 * Initialize the embedded iocb that carries this request through
	 * the I/O layer and back to the caller's callback.
	 */
1425 iocb = &cluster->iocb;
1426 iocb->callback = callback;
1427 iocb->dio = NULL; /* for already-validated case */
1428 iocb->cluster = cluster;
1429 iocb->chain = chain;
	/*
	 * NOTE(review): lbase is loaded with the scan index here rather
	 * than a byte offset — confirm how the iocb consumer interprets
	 * this field.
	 */
1431 iocb->lbase = (off_t)i;
1436 * Data already validated
1444 * We must resolve to a device buffer, either by issuing I/O or
1445 * by creating a zero-fill element. We do not mark the buffer
1446 * dirty when creating a zero-fill element (the hammer2_chain_modify()
1447 * API must still be used to do that).
1449 * The device buffer is variable-sized in powers of 2 down
1450 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage
1451 * chunk always contains buffers of the same size. (XXX)
1453 * The minimum physical IO size may be larger than the variable
1456 * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
1457 * matches hammer2_devblksize()? Or does the freemap's
1458 * pre-zeroing handle the case for us?
1460 bref = &chain->bref;
1464 /* handled by callback? <- TODO XXX even needed for loads? */
1466 * The getblk() optimization for a 100% overwrite can only be used
1467 * if the physical block size matches the request.
1469 if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1470 chain->bytes == hammer2_devblksize(chain->bytes)) {
	/* zero-fill path: create a fresh device buffer, no read needed */
1471 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1472 KKASSERT(error == 0);
1480 * Otherwise issue a read
1482 hammer2_adjreadcounter(&chain->bref, chain->bytes);
	/* asynchronous: the iocb callback fires when the buffer is ready */
1483 hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
1486 /************************************************************************
1488 ************************************************************************
1490 * A node failure can occur for numerous reasons.
1492 * - A read I/O may fail
1493 * - A write I/O may fail
1494 * - An unexpected chain might be found (or be missing)
1495 * - A node might disconnect temporarily and reconnect later
1496 * (for example, a USB stick could get pulled, or a node might
1497 * be programmatically disconnected).
1498 * - A node might run out of space during a modifying operation.
1500 * When a read failure or an unexpected chain state is found, the chain and
1501 * parent chain at the failure point for the nodes involved (the nodes
1502 * which we determine to be in error) are flagged as failed and removed
1503 * from the cluster. The node itself is allowed to remain active. The
1504 * highest common point (usually a parent chain) is queued to the
1505 * resynchronization thread for action.
1507 * When a write I/O fails or a node runs out of space, we first adjust
1508 * as if a read failure occurs but we further disable flushes on the
1509 * ENTIRE node. Concurrent modifying transactions are allowed to complete
1510 * but any new modifying transactions will automatically remove the node
1511 * from consideration in all related cluster structures and not generate
1512 * any new modified chains. The ROOT chain for the failed node(s) is queued
1513 * to the resynchronization thread for action.
1515 * A temporary disconnect is handled as if a write failure occurred.
1517 * Any of these failures might or might not stall related high level VNOPS,
1518 * depending on what has failed, what nodes remain, the type of cluster,
1519 * and the operating state of the cluster.
1521 * FLUSH ON WRITE-DISABLED NODES
1523 * A flush on a write-disabled node is not allowed to write anything because
1524 * we cannot safely update the mirror_tid anywhere on the failed node. The
1525 * synchronization thread uses mirror_tid to calculate incremental resyncs.
1526 * Dirty meta-data related to the failed node is thrown away.
1528 * Dirty buffer cache buffers and inodes are only thrown away if they can be
1529 * retired... that is, if the filesystem still has enough nodes to complete
1533 /************************************************************************
1534 * SYNCHRONIZATION THREAD *
1535 ************************************************************************
1537 * This thread is responsible for [re]synchronizing the cluster representing
1538 * a PFS. Any out-of-sync or failed node starts this thread on a
1539 * node-by-node basis when the failure is detected.
1541 * Clusters needing resynchronization are queued at the highest point
1542 * where the parent on the failed node is still valid, or a special
1543 * incremental scan from the ROOT is queued if no parent exists. This
1544 * thread is also responsible for waiting for reconnections of the failed
1545 * node if the cause was due to a disconnect, and waiting for space to be
1546 * freed up if the cause was due to running out of space.
1548 * If the cause is due to a node running out of space, this thread will also
1549 * remove older (unlocked) snapshots to make new space, recover space, and
1550 * then start resynchronization.
1552 * Each resynchronization pass virtually snapshots the PFS on the good nodes
1553 * and synchronizes using that snapshot against the target node. This
1554 * ensures a consistent chain topology and also avoids interference between
1555 * the resynchronization thread and frontend operations.
1557 * Since these are per-node threads it is possible to resynchronize several