2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * The cluster module collects multiple chains representing the same
36 * information from different nodes into a single entity. It allows direct
37 * access to media data as long as it is not blockref array data (which
38 * will obviously have to be different at each node).
40 * This module also handles I/O dispatch, status rollup, and various
41 * mastership arrangements including quorum operations. It effectively
42 * presents one topology to the vnops layer.
44 * Many of the API calls mimic chain API calls but operate on clusters
45 * instead of chains. Please see hammer2_chain.c for more complete code
46 * documentation of the API functions.
48 * WARNING! This module is *extremely* complex. It must issue asynchronous
49 * locks and I/O, do quorum and/or master-slave processing, and
50 * it must operate properly even if some nodes are broken (which
51 * can also mean indefinite locks).
55 * Cluster operations can be broken down into three pieces:
57 * (1) Chain locking and data retrieval.
58 * hammer2_cluster_lock()
59 * hammer2_cluster_parent()
61 * - Most complex functions, quorum management on transaction ids.
63 * - Locking and data accesses must be internally asynchronous.
65 * - Validate and manage cache coherency primitives (cache state
66 * is stored in chain topologies but must be validated by these
69 * (2) Lookups and Scans
70 * hammer2_cluster_lookup()
71 * hammer2_cluster_next()
73 * - Depend on locking & data retrieval functions, but still complex.
75 * - Must do quorum management on transaction ids.
77 * - Lookup and Iteration ops Must be internally asynchronous.
79 * (3) Modifying Operations
80 * hammer2_cluster_create()
81 * hammer2_cluster_rename()
82 * hammer2_cluster_delete()
83 * hammer2_cluster_modify()
84 * hammer2_cluster_modsync()
86 * - Can usually punt on failures, operation continues unless quorum
87 * is lost. If quorum is lost, must wait for resynchronization
88 * (depending on the management mode).
90 * - Must disconnect node on failures (also not flush), remount, and
93 * - Network links (via kdmsg) are relatively easy to issue as the
94 * complex underworkings of hammer2_chain.c don't have to messed
95 * with (the protocol is at a higher level than block-level).
97 * - Multiple local disk nodes (i.e. block devices) are another matter.
98 * Chain operations have to be dispatched to per-node threads (xN)
99 * because we can't asynchronize potentially very complex chain
100 * operations in hammer2_chain.c (it would be a huge mess).
102 * (these threads are also used to terminate incoming kdmsg ops from
105 * - Single-node filesystems do not use threads and will simply call
106 * hammer2_chain.c functions directly. This short-cut is handled
107 * at the base of each cluster function.
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
119 * Returns the bref type of the cluster's foucs.
121 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
122 * The cluster must be locked.
125 hammer2_cluster_type(hammer2_cluster_t *cluster)
127 if (cluster->error == 0) {
128 KKASSERT(cluster->focus != NULL);
129 return(cluster->focus->bref.type);
135 * Returns non-zero if the cluster's focus is flagged as being modified.
137 * If the cluster is errored, returns 0.
141 hammer2_cluster_modified(hammer2_cluster_t *cluster)
143 if (cluster->error == 0) {
144 KKASSERT(cluster->focus != NULL);
145 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
151 * Returns the bref of the cluster's focus, sans any data-offset information
152 * (since offset information is per-node and wouldn't be useful).
154 * Callers use this function to access modify_tid, mirror_tid, type,
157 * If the cluster is errored, returns an empty bref.
158 * The cluster must be locked.
161 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
163 if (cluster->error == 0) {
164 KKASSERT(cluster->focus != NULL);
165 *bref = cluster->focus->bref;
168 bzero(bref, sizeof(*bref));
173 * Create a degenerate cluster with one ref from a single locked chain.
174 * The returned cluster will be focused on the chain and inherit its
177 * The chain's lock and reference are transfered to the new cluster, so
178 * the caller should not try to unlock the chain separately.
183 hammer2_cluster_from_chain(hammer2_chain_t *chain)
185 hammer2_cluster_t *cluster;
187 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
188 cluster->array[0].chain = chain;
189 cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
190 cluster->nchains = 1;
191 cluster->focus = chain;
192 cluster->focus_index = 0;
193 cluster->pmp = chain->pmp;
195 cluster->error = chain->error;
196 cluster->flags = HAMMER2_CLUSTER_LOCKED |
197 HAMMER2_CLUSTER_WRHARD |
198 HAMMER2_CLUSTER_RDHARD |
199 HAMMER2_CLUSTER_MSYNCED |
200 HAMMER2_CLUSTER_SSYNCED;
206 * Add a reference to a cluster and its underlying chains.
208 * We must also ref the underlying chains in order to allow ref/unlock
209 * sequences to later re-lock.
212 hammer2_cluster_ref(hammer2_cluster_t *cluster)
214 atomic_add_int(&cluster->refs, 1);
218 * Drop the caller's reference to the cluster. When the ref count drops to
219 * zero this function frees the cluster and drops all underlying chains.
221 * In-progress read I/Os are typically detached from the cluster once the
222 * first one returns (the remaining stay attached to the DIOs but are then
223 * ignored and drop naturally).
226 hammer2_cluster_drop(hammer2_cluster_t *cluster)
228 hammer2_chain_t *chain;
231 KKASSERT(cluster->refs > 0);
232 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
233 cluster->focus = NULL; /* safety XXX chg to assert */
234 cluster->focus_index = 0;
236 for (i = 0; i < cluster->nchains; ++i) {
237 chain = cluster->array[i].chain;
239 hammer2_chain_drop(chain);
240 cluster->array[i].chain = NULL; /* safety */
243 cluster->nchains = 0; /* safety */
245 kfree(cluster, M_HAMMER2);
246 /* cluster is invalid */
251 * Lock a cluster. Cluster must already be referenced. Focus is maintained.
253 * WARNING! This function expects the caller to handle resolution of the
254 * cluster. We never re-resolve the cluster in this function,
255 * because it might be used to temporarily unlock/relock a cparent
256 * in an iteration or recursrion, and the cparents elements do not
260 hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how)
262 hammer2_chain_t *chain;
265 /* cannot be on inode-embedded cluster template, must be on copy */
266 KKASSERT(cluster->refs > 0);
267 KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
268 if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
269 panic("hammer2_cluster_lock: cluster %p already locked!\n",
272 atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
275 * Lock chains and resolve state.
277 for (i = 0; i < cluster->nchains; ++i) {
280 chain = cluster->array[i].chain;
283 hammer2_chain_lock(chain, how);
288 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
290 hammer2_cluster_lock_except(cluster, -1, how);
294 * Calculate the clustering state for the cluster and set its focus.
295 * This routine must be called with care. For example, it should not
296 * normally be called after relocking a non-leaf cluster because parent
297 * clusters help iterations and each element might be at a slightly different
298 * indirect node (each node's topology is independently indexed).
300 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
301 * operations. Typically this is only set on a quorum of MASTERs or
302 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER
303 * is present, this bit is *not* set on a quorum of MASTERs. The
304 * synchronization code ignores this bit, but all hammer2_cluster_*() calls
305 * that create/modify/delete elements use it.
307 * The chains making up the cluster may be narrowed down based on quorum
308 * acceptability, and if RESOLVE_RDONLY is specified the chains can be
309 * narrowed down to a single chain as long as the entire subtopology is known
310 * to be intact. So, for example, we can narrow a read-only op to a single
311 * fast SLAVE but if we focus a CACHE chain we must still retain at least
312 * a SLAVE to ensure that the subtopology can be accessed.
314 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
315 * to be maintained once the topology is validated as-of the top level of
318 * If a failure occurs the operation must be aborted by higher-level code and
322 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
324 hammer2_chain_t *chain;
325 hammer2_chain_t *focus;
327 hammer2_tid_t quorum_tid;
328 hammer2_tid_t last_best_quorum_tid;
340 cluster->focus = NULL;
353 KKASSERT(pmp != NULL || cluster->nchains == 0);
354 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
360 * NOTE: A NULL chain is not necessarily an error, it could be
361 * e.g. a lookup failure or the end of an iteration.
364 for (i = 0; i < cluster->nchains; ++i) {
365 chain = cluster->array[i].chain;
366 if (chain && chain->error) {
367 if (cluster->focus == NULL || cluster->focus == chain) {
368 /* error will be overridden by valid focus */
369 cluster->error = chain->error;
373 * Must count total masters and slaves whether the
374 * chain is errored or not.
376 switch (cluster->pmp->pfs_types[i]) {
377 case HAMMER2_PFSTYPE_MASTER:
380 case HAMMER2_PFSTYPE_SLAVE:
386 switch (cluster->pmp->pfs_types[i]) {
387 case HAMMER2_PFSTYPE_MASTER:
390 case HAMMER2_PFSTYPE_SLAVE:
393 case HAMMER2_PFSTYPE_SOFT_MASTER:
394 nflags |= HAMMER2_CLUSTER_WRSOFT;
395 nflags |= HAMMER2_CLUSTER_RDSOFT;
398 case HAMMER2_PFSTYPE_SOFT_SLAVE:
399 nflags |= HAMMER2_CLUSTER_RDSOFT;
401 case HAMMER2_PFSTYPE_SUPROOT:
403 * Degenerate cluster representing the super-root
404 * topology on a single device. Fake stuff so
405 * cluster ops work as expected.
407 nflags |= HAMMER2_CLUSTER_WRHARD;
408 nflags |= HAMMER2_CLUSTER_RDHARD;
409 cluster->focus_index = i;
410 cluster->focus = chain;
411 cluster->error = chain ? chain->error : 0;
421 * Resolve masters. Calculate nmasters for the highest matching
422 * TID, if a quorum cannot be attained try the next lower matching
423 * TID until we exhaust TIDs.
425 * NOTE: A NULL chain is not necessarily an error, it could be
426 * e.g. a lookup failure or the end of an iteration.
429 last_best_quorum_tid = HAMMER2_TID_MAX;
430 quorum_tid = 0; /* fix gcc warning */
432 while (nmasters < nquorum && last_best_quorum_tid != 0) {
436 for (i = 0; i < cluster->nchains; ++i) {
437 if (cluster->pmp->pfs_types[i] !=
438 HAMMER2_PFSTYPE_MASTER) {
441 chain = cluster->array[i].chain;
443 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
445 * Invalid as in unsynchronized, cannot be
446 * used to calculate the quorum.
448 } else if (chain == NULL && quorum_tid == 0) {
450 * NULL chain on master matches NULL chains
454 } else if (quorum_tid < last_best_quorum_tid &&
456 (quorum_tid < chain->bref.modify_tid ||
459 * Better TID located, reset nmasters count.
462 quorum_tid = chain->bref.modify_tid;
464 quorum_tid == chain->bref.modify_tid) {
466 * TID matches current collection.
471 if (nmasters >= nquorum)
473 last_best_quorum_tid = quorum_tid;
479 * NOTE: A NULL chain is not necessarily an error, it could be
480 * e.g. a lookup failure or the end of an iteration.
483 for (i = 0; i < cluster->nchains; ++i) {
484 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
485 chain = cluster->array[i].chain;
486 if (chain && chain->error) {
487 if (cluster->focus == NULL || cluster->focus == chain) {
488 /* error will be overridden by valid focus */
489 cluster->error = chain->error;
494 switch (cluster->pmp->pfs_types[i]) {
495 case HAMMER2_PFSTYPE_MASTER:
497 * We must have enough up-to-date masters to reach
498 * a quorum and the master modify_tid must match
499 * the quorum's modify_tid.
501 * Do not select an errored or out-of-sync master.
503 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
504 nflags |= HAMMER2_CLUSTER_UNHARD;
505 } else if (nmasters >= nquorum &&
506 (chain == NULL || chain->error == 0) &&
507 ((chain == NULL && quorum_tid == 0) ||
508 (chain != NULL && quorum_tid ==
509 chain->bref.modify_tid))) {
510 nflags |= HAMMER2_CLUSTER_WRHARD;
511 nflags |= HAMMER2_CLUSTER_RDHARD;
513 cluster->array[i].flags |=
516 if (cluster->focus == NULL ||
517 focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
518 focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
519 cluster->focus_index = i;
520 cluster->focus = chain; /* NULL ok */
521 cluster->error = chain ? chain->error :
524 } else if (chain == NULL || chain->error == 0) {
525 nflags |= HAMMER2_CLUSTER_UNHARD;
528 case HAMMER2_PFSTYPE_SLAVE:
530 * We must have enough up-to-date masters to reach
531 * a quorum and the slave modify_tid must match the
532 * quorum's modify_tid.
534 * Do not select an errored slave.
536 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
537 nflags |= HAMMER2_CLUSTER_UNHARD;
538 } else if (nmasters >= nquorum &&
539 (chain == NULL || chain->error == 0) &&
540 ((chain == NULL && quorum_tid == 0) ||
541 (chain && quorum_tid ==
542 chain->bref.modify_tid))) {
544 nflags |= HAMMER2_CLUSTER_RDHARD;
546 /* XXX optimize for RESOLVE_RDONLY */
547 if (cluster->focus == NULL) {
548 focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
549 cluster->focus_index = i;
550 cluster->focus = chain; /* NULL ok */
551 cluster->error = chain ? chain->error :
555 } else if (chain == NULL || chain->error == 0) {
556 nflags |= HAMMER2_CLUSTER_UNSOFT;
559 case HAMMER2_PFSTYPE_SOFT_MASTER:
561 * Directly mounted soft master always wins. There
562 * should be only one.
564 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
565 cluster->focus_index = i;
566 cluster->focus = chain;
567 cluster->error = chain ? chain->error : 0;
568 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
569 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
571 case HAMMER2_PFSTYPE_SOFT_SLAVE:
573 * Directly mounted soft slave always wins. There
574 * should be only one.
576 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
577 if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
578 cluster->focus_index = i;
579 cluster->focus = chain;
580 cluster->error = chain ? chain->error : 0;
581 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
584 case HAMMER2_PFSTYPE_SUPROOT:
586 * spmp (degenerate case)
589 cluster->focus_index = i;
590 cluster->focus = chain;
591 cluster->error = chain ? chain->error : 0;
592 focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
593 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
601 * Focus now set, adjust ddflag. Skip this pass if the focus
602 * is bad or if we are at the PFS root (the bref won't match at
603 * the PFS root, obviously).
605 focus = cluster->focus;
608 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
613 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
619 * Validate the elements that were not marked invalid. They should
622 for (i = 0; i < cluster->nchains; ++i) {
625 chain = cluster->array[i].chain;
631 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
634 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
635 if (chain->bref.type != focus->bref.type ||
636 chain->bref.key != focus->bref.key ||
637 chain->bref.keybits != focus->bref.keybits ||
638 chain->bref.modify_tid != focus->bref.modify_tid ||
639 chain->bytes != focus->bytes ||
640 ddflag != cluster->ddflag) {
641 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
642 if (hammer2_debug & 1)
643 kprintf("cluster_resolve: matching modify_tid failed "
644 "bref test: idx=%d type=%02x/%02x "
645 "key=%016jx/%d-%016jx/%d "
646 "mod=%016jx/%016jx bytes=%u/%u\n",
648 chain->bref.type, focus->bref.type,
649 chain->bref.key, chain->bref.keybits,
650 focus->bref.key, focus->bref.keybits,
651 chain->bref.modify_tid, focus->bref.modify_tid,
652 chain->bytes, focus->bytes);
653 if (hammer2_debug & 0x4000)
654 panic("cluster_resolve");
655 /* flag issue and force resync? */
661 nflags |= HAMMER2_CLUSTER_NOSOFT;
663 nflags |= HAMMER2_CLUSTER_NOHARD;
666 * Set SSYNCED or MSYNCED for slaves and masters respectively if
667 * all available nodes (even if 0 are available) are fully
668 * synchronized. This is used by the synchronization thread to
669 * determine if there is work it could potentially accomplish.
671 if (nslaves == ttlslaves)
672 nflags |= HAMMER2_CLUSTER_SSYNCED;
673 if (nmasters == ttlmasters)
674 nflags |= HAMMER2_CLUSTER_MSYNCED;
677 * Determine if the cluster was successfully locked for the
678 * requested operation and generate an error code. The cluster
679 * will not be locked (or ref'd) if an error is returned.
681 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
682 * to determine if reading or writing is possible. If writing, the
683 * cluster still requires a call to hammer2_cluster_modify() first.
685 atomic_set_int(&cluster->flags, nflags);
686 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
690 * This is used by the XOPS subsystem to calculate the state of
691 * the collection and tell hammer2_xop_collect() what to do with it.
692 * The collection can be in various states of desynchronization, the
693 * caller specifically wants to resolve the passed-in key.
696 * 0 - Quorum agreement, key is valid
698 * ENOENT - Quorum agreement, end of scan
700 * ESRCH - Quorum agreement, key is INVALID (caller should
703 * EIO - Quorum agreement but all elements had errors.
705 * EDEADLK - No quorum agreement possible for key, a repair
706 * may be needed. Caller has to decide what to do,
707 * possibly iterating the key or generating an EIO.
709 * EINPROGRESS - No quorum agreement yet, but agreement is still
710 * possible if caller waits for more responses. Caller
711 * should not iterate key.
713 * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
716 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
718 hammer2_chain_t *chain;
719 hammer2_chain_t *focus;
721 hammer2_tid_t quorum_tid;
722 hammer2_tid_t last_best_quorum_tid;
727 int nmasters_keymatch;
730 int umasters; /* unknown masters (still in progress) */
735 cluster->focus = NULL;
741 nmasters_keymatch = 0;
749 KKASSERT(pmp != NULL || cluster->nchains == 0);
750 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
756 * NOTE: A NULL chain is not necessarily an error, it could be
757 * e.g. a lookup failure or the end of an iteration.
760 for (i = 0; i < cluster->nchains; ++i) {
761 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
762 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
764 chain = cluster->array[i].chain;
765 if (chain && chain->error) {
766 if (cluster->focus == NULL || cluster->focus == chain) {
767 /* error will be overridden by valid focus */
768 cluster->error = chain->error;
772 * Must count total masters and slaves whether the
773 * chain is errored or not.
775 switch (cluster->pmp->pfs_types[i]) {
776 case HAMMER2_PFSTYPE_MASTER:
779 case HAMMER2_PFSTYPE_SLAVE:
785 switch (cluster->pmp->pfs_types[i]) {
786 case HAMMER2_PFSTYPE_MASTER:
789 case HAMMER2_PFSTYPE_SLAVE:
792 case HAMMER2_PFSTYPE_SOFT_MASTER:
793 nflags |= HAMMER2_CLUSTER_WRSOFT;
794 nflags |= HAMMER2_CLUSTER_RDSOFT;
797 case HAMMER2_PFSTYPE_SOFT_SLAVE:
798 nflags |= HAMMER2_CLUSTER_RDSOFT;
800 case HAMMER2_PFSTYPE_SUPROOT:
802 * Degenerate cluster representing the super-root
803 * topology on a single device. Fake stuff so
804 * cluster ops work as expected.
806 nflags |= HAMMER2_CLUSTER_WRHARD;
807 nflags |= HAMMER2_CLUSTER_RDHARD;
808 cluster->focus_index = i;
809 cluster->focus = chain;
810 cluster->error = chain ? chain->error : 0;
820 * Resolve nmasters - master nodes fully match
822 * Resolve umasters - master nodes operation still
825 * Resolve nmasters_keymatch - master nodes match the passed-in
826 * key and may or may not match
827 * the quorum-agreed tid.
829 * The quorum-agreed TID is the highest matching TID.
831 last_best_quorum_tid = HAMMER2_TID_MAX;
832 quorum_tid = 0; /* fix gcc warning */
834 while (nmasters < nquorum && last_best_quorum_tid != 0) {
838 for (i = 0; i < cluster->nchains; ++i) {
839 /* XXX SOFT smpresent handling */
840 if (cluster->pmp->pfs_types[i] !=
841 HAMMER2_PFSTYPE_MASTER) {
845 chain = cluster->array[i].chain;
848 * Skip elements still in progress. umasters keeps
849 * track of masters that might still be in-progress.
851 if (chain == NULL && (cluster->array[i].flags &
852 HAMMER2_CITEM_NULL) == 0) {
860 if (flags & HAMMER2_CHECK_NULL) {
866 (key == (hammer2_key_t)-1 ||
867 chain->bref.key == key)) {
869 if (quorum_tid < last_best_quorum_tid &&
870 (quorum_tid < chain->bref.modify_tid ||
873 * Better TID located, reset
877 quorum_tid = chain->bref.modify_tid;
879 if (quorum_tid == chain->bref.modify_tid) {
881 * TID matches current collection.
884 if (chain->error == 0) {
885 cluster->focus = chain;
886 cluster->focus_index = i;
891 if (nmasters >= nquorum)
893 last_best_quorum_tid = quorum_tid;
897 kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
898 nmasters, nquorum, nmasters_keymatch, umasters);
902 * Early return if we do not have enough masters.
904 if (nmasters < nquorum) {
905 if (nmasters + umasters >= nquorum)
907 if (nmasters_keymatch < nquorum)
913 * Validated end of scan.
915 if (flags & HAMMER2_CHECK_NULL)
919 * If we have a NULL focus at this point the agreeing quorum all
922 if (cluster->focus == NULL)
928 * We have quorum agreement, validate elements, not end of scan.
930 for (i = 0; i < cluster->nchains; ++i) {
931 chain = cluster->array[i].chain;
933 chain->bref.key != key ||
934 chain->bref.modify_tid != quorum_tid) {
938 switch (cluster->pmp->pfs_types[i]) {
939 case HAMMER2_PFSTYPE_MASTER:
940 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
941 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
942 nflags |= HAMMER2_CLUSTER_WRHARD;
943 nflags |= HAMMER2_CLUSTER_RDHARD;
945 case HAMMER2_PFSTYPE_SLAVE:
947 * We must have enough up-to-date masters to reach
948 * a quorum and the slave modify_tid must match the
949 * quorum's modify_tid.
951 * Do not select an errored slave.
953 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
954 nflags |= HAMMER2_CLUSTER_RDHARD;
957 case HAMMER2_PFSTYPE_SOFT_MASTER:
959 * Directly mounted soft master always wins. There
960 * should be only one.
962 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
963 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
965 case HAMMER2_PFSTYPE_SOFT_SLAVE:
967 * Directly mounted soft slave always wins. There
968 * should be only one.
972 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
974 case HAMMER2_PFSTYPE_SUPROOT:
976 * spmp (degenerate case)
978 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
979 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
987 * Focus now set, adjust ddflag. Skip this pass if the focus
988 * is bad or if we are at the PFS root (the bref won't match at
989 * the PFS root, obviously).
991 focus = cluster->focus;
994 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
999 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1005 * Validate the elements that were not marked invalid. They should
1008 for (i = 0; i < cluster->nchains; ++i) {
1011 chain = cluster->array[i].chain;
1017 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1020 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1021 if (chain->bref.type != focus->bref.type ||
1022 chain->bref.key != focus->bref.key ||
1023 chain->bref.keybits != focus->bref.keybits ||
1024 chain->bref.modify_tid != focus->bref.modify_tid ||
1025 chain->bytes != focus->bytes ||
1026 ddflag != cluster->ddflag) {
1027 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1028 if (hammer2_debug & 1)
1029 kprintf("cluster_resolve: matching modify_tid failed "
1030 "bref test: idx=%d type=%02x/%02x "
1031 "key=%016jx/%d-%016jx/%d "
1032 "mod=%016jx/%016jx bytes=%u/%u\n",
1034 chain->bref.type, focus->bref.type,
1035 chain->bref.key, chain->bref.keybits,
1036 focus->bref.key, focus->bref.keybits,
1037 chain->bref.modify_tid, focus->bref.modify_tid,
1038 chain->bytes, focus->bytes);
1039 if (hammer2_debug & 0x4000)
1040 panic("cluster_resolve");
1041 /* flag issue and force resync? */
1047 nflags |= HAMMER2_CLUSTER_NOSOFT;
1048 if (ttlmasters == 0)
1049 nflags |= HAMMER2_CLUSTER_NOHARD;
1052 * Set SSYNCED or MSYNCED for slaves and masters respectively if
1053 * all available nodes (even if 0 are available) are fully
1054 * synchronized. This is used by the synchronization thread to
1055 * determine if there is work it could potentially accomplish.
1057 if (nslaves == ttlslaves)
1058 nflags |= HAMMER2_CLUSTER_SSYNCED;
1059 if (nmasters == ttlmasters)
1060 nflags |= HAMMER2_CLUSTER_MSYNCED;
1063 * Determine if the cluster was successfully locked for the
1064 * requested operation and generate an error code. The cluster
1065 * will not be locked (or ref'd) if an error is returned.
1067 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
1068 * to determine if reading or writing is possible. If writing, the
1069 * cluster still requires a call to hammer2_cluster_modify() first.
1071 atomic_set_int(&cluster->flags, nflags);
1072 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
1078 * This is used by the sync thread to force non-NULL elements of a copy
1079 * of the pmp->iroot cluster to be good which is required to prime the
1083 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
1087 for (i = 0; i < cluster->nchains; ++i) {
1088 if (cluster->array[i].chain)
1089 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1094 * Copy a cluster, returned a ref'd cluster. All underlying chains
1095 * are also ref'd, but not locked. Focus state is also copied.
1097 * Original cluster does not have to be locked but usually is.
1098 * New cluster will not be flagged as locked.
1100 * Callers using this function to initialize a new cluster from an inode
1101 * generally lock and resolve the resulting cluster.
1103 * Callers which use this function to save/restore a cluster structure
1104 * generally retain the focus state and do not re-resolve it. Caller should
1105 * not try to re-resolve internal (cparent) node state during an iteration
1106 * as the individual tracking elements of cparent in an iteration may not
1107 * match even though they are correct.
1110 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
1112 hammer2_pfs_t *pmp = ocluster->pmp;
1113 hammer2_cluster_t *ncluster;
1114 hammer2_chain_t *chain;
1117 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
1118 ncluster->pmp = pmp;
1119 ncluster->nchains = ocluster->nchains;
1122 for (i = 0; i < ocluster->nchains; ++i) {
1123 chain = ocluster->array[i].chain;
1124 ncluster->array[i].chain = chain;
1125 ncluster->array[i].flags = ocluster->array[i].flags;
1127 hammer2_chain_ref(chain);
1129 ncluster->focus_index = ocluster->focus_index;
1130 ncluster->focus = ocluster->focus;
1131 ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED |
1132 HAMMER2_CLUSTER_INODE);
1138 * Unlock a cluster. Refcount and focus is maintained.
1141 hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx)
1143 hammer2_chain_t *chain;
1146 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
1147 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1150 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
1151 KKASSERT(cluster->refs > 0);
1152 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
1154 for (i = 0; i < cluster->nchains; ++i) {
1157 chain = cluster->array[i].chain;
1159 hammer2_chain_unlock(chain);
1164 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
1166 hammer2_cluster_unlock_except(cluster, -1);
1170 * Set an inode's cluster modified, marking the related chains RW and
1171 * duplicating them if necessary.
1173 * The passed-in chain is a localized copy of the chain previously acquired
1174 * when the inode was locked (and possilby replaced in the mean time), and
1175 * must also be updated. In fact, we update it first and then synchronize
1176 * the inode's cluster cache.
1178 hammer2_inode_data_t *
1179 hammer2_cluster_modify_ip(hammer2_inode_t *ip,
1180 hammer2_cluster_t *cluster, int flags)
1182 hammer2_inode_modify(ip);
1183 hammer2_cluster_modify(cluster, flags);
1184 hammer2_inode_repoint(ip, NULL, cluster);
1185 return (&hammer2_cluster_wdata(cluster)->ipdata);
1189 * Adjust the cluster's chains to allow modification and adjust the
1190 * focus. Data will be accessible on return.
1192 * If our focused master errors on modify, re-resolve the cluster to
1193 * try to select a different master.
1196 hammer2_cluster_modify(hammer2_cluster_t *cluster, int flags)
1198 hammer2_chain_t *chain;
1203 for (i = 0; i < cluster->nchains; ++i) {
1204 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1205 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1208 chain = cluster->array[i].chain;
1213 hammer2_chain_modify(chain, flags);
1214 if (cluster->focus == chain && chain->error) {
1215 cluster->error = chain->error;
1220 hammer2_cluster_resolve(cluster);
1224 * Synchronize modifications from the focus to other chains in a cluster.
1225 * Convenient because nominal API users can just modify the contents of the
1226 * focus (at least for non-blockref data).
1228 * Nominal front-end operations only edit non-block-table data in a single
1229 * chain. This code copies such modifications to the other chains in the
1230 * cluster. Blocktable modifications are handled on a chain-by-chain basis
1231 * by both the frontend and the backend and will explode in fireworks if
1235 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
1237 hammer2_chain_t *focus;
1238 hammer2_chain_t *scan;
1239 const hammer2_inode_data_t *ripdata;
1240 hammer2_inode_data_t *wipdata;
1243 focus = cluster->focus;
1244 KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
1246 for (i = 0; i < cluster->nchains; ++i) {
1247 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1249 scan = cluster->array[i].chain;
1250 if (scan == NULL || scan == focus)
1254 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
1255 KKASSERT(focus->bytes == scan->bytes &&
1256 focus->bref.type == scan->bref.type);
1257 switch(focus->bref.type) {
1258 case HAMMER2_BREF_TYPE_INODE:
1259 ripdata = &focus->data->ipdata;
1260 wipdata = &scan->data->ipdata;
1261 if ((ripdata->meta.op_flags &
1262 HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1263 bcopy(ripdata, wipdata,
1264 offsetof(hammer2_inode_data_t, u));
1267 /* fall through to full copy */
1268 case HAMMER2_BREF_TYPE_DATA:
1269 bcopy(focus->data, scan->data, focus->bytes);
1271 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1272 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1273 case HAMMER2_BREF_TYPE_FREEMAP:
1274 case HAMMER2_BREF_TYPE_VOLUME:
1275 panic("hammer2_cluster_modsync: illegal node type");
1279 panic("hammer2_cluster_modsync: unknown node type");
1286 * Lookup initialization/completion API. Returns a locked, fully resolved
1287 * cluster with one ref.
1290 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
1292 hammer2_cluster_t *cluster;
1294 cluster = hammer2_cluster_copy(cparent);
1295 if (flags & HAMMER2_LOOKUP_SHARED) {
1296 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
1297 HAMMER2_RESOLVE_SHARED);
1299 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1301 hammer2_cluster_resolve(cluster);
1307 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
1310 hammer2_cluster_unlock(cparent);
1311 hammer2_cluster_drop(cparent);
1316 * Locate first match or overlap under parent, return a new, locked, resolved
1317 * cluster with one ref.
1319 * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1322 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
1323 hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1326 hammer2_cluster_t *cluster;
1327 hammer2_chain_t *chain;
1328 hammer2_key_t key_accum;
1329 hammer2_key_t key_next;
1334 KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1336 pmp = cparent->pmp; /* can be NULL */
1337 key_accum = *key_nextp;
1339 if (flags & HAMMER2_LOOKUP_SHARED)
1340 rflags = HAMMER2_RESOLVE_SHARED;
1344 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
1345 cluster->pmp = pmp; /* can be NULL */
1347 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1348 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1351 * Iterating earlier cluster elements with later elements still
1352 * locked is a problem, so we have to unlock the parent and then
1355 hammer2_cluster_unlock(cparent);
1356 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1359 * Pass-1, issue lookups.
1361 for (i = 0; i < cparent->nchains; ++i) {
1362 cluster->array[i].flags = cparent->array[i].flags;
1363 key_next = *key_nextp;
1366 * Always relock the parent as we go.
1368 if (cparent->array[i].chain) {
1369 hammer2_chain_lock(cparent->array[i].chain, rflags);
1373 * Nothing to base the lookup, or parent was not synchronized.
1375 if (cparent->array[i].chain == NULL ||
1376 (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
1381 chain = hammer2_chain_lookup(&cparent->array[i].chain,
1384 &cparent->array[i].cache_index,
1386 cluster->array[i].chain = chain;
1387 if (chain == NULL) {
1390 if (key_accum > key_next)
1391 key_accum = key_next;
1397 cluster->nchains = i;
1398 *key_nextp = key_accum;
1401 * The cluster must be resolved, out of sync elements may be present.
1403 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1405 if (null_count != i)
1406 hammer2_cluster_resolve(cluster);
1407 if (null_count == i ||
1408 (cluster->focus == NULL &&
1409 (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1410 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1411 hammer2_cluster_unlock(cluster);
1412 hammer2_cluster_drop(cluster);
1420 * Locate next match or overlap under parent, replace the passed-in cluster.
1421 * The returned cluster is a new, locked, resolved cluster with one ref.
1423 * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1426 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1427 hammer2_key_t *key_nextp,
1428 hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1430 hammer2_chain_t *ochain;
1431 hammer2_chain_t *nchain;
1432 hammer2_key_t key_accum;
1433 hammer2_key_t key_next;
1440 KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1442 key_accum = *key_nextp;
1444 parent_index = cparent->focus_index; /* save prior focus */
1445 cluster_index = cluster->focus_index;
1446 if (flags & HAMMER2_LOOKUP_SHARED)
1447 rflags = HAMMER2_RESOLVE_SHARED;
1451 cluster->focus = NULL; /* XXX needed any more? */
1452 /*cparent->focus = NULL;*/
1453 cluster->focus_index = 0; /* XXX needed any more? */
1454 /*cparent->focus_index = 0;*/
1456 cluster->ddflag = 0;
1459 * The parent is always locked on entry, the iterator may be locked
1460 * depending on flags.
1462 * We must temporarily unlock the passed-in clusters to avoid a
1463 * deadlock between elements of the cluster with other threads.
1464 * We will fixup the lock in the loop.
1466 * Note that this will clear the focus.
1468 * Reflag the clusters as locked, because we will relock them
1471 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1472 hammer2_cluster_unlock(cluster);
1473 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1475 hammer2_cluster_unlock(cparent);
1476 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1478 for (i = 0; i < cparent->nchains; ++i) {
1479 key_next = *key_nextp;
1480 ochain = cluster->array[i].chain;
1483 * Always relock the parent as we go.
1485 if (cparent->array[i].chain)
1486 hammer2_chain_lock(cparent->array[i].chain, rflags);
1489 * Nothing to iterate from. These cases can occur under
1490 * normal operations. For example, during synchronization
1491 * a slave might reach the end of its scan while records
1492 * are still left on the master(s).
1494 if (ochain == NULL) {
1498 if (cparent->array[i].chain == NULL ||
1499 (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1500 (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
1501 /* ochain has not yet been relocked */
1502 hammer2_chain_drop(ochain);
1503 cluster->array[i].chain = NULL;
1509 * Relock the child if necessary. Parent and child will then
1510 * be locked as expected by hammer2_chain_next() and flags.
1512 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1513 hammer2_chain_lock(ochain, rflags);
1514 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1515 &key_next, key_beg, key_end,
1516 &cparent->array[i].cache_index,
1518 /* ochain now invalid but can still be used for focus check */
1519 if (parent_index == i) {
1520 cparent->focus_index = i;
1521 cparent->focus = cparent->array[i].chain;
1524 cluster->array[i].chain = nchain;
1525 if (nchain == NULL) {
1528 if (key_accum > key_next)
1529 key_accum = key_next;
1535 cluster->nchains = i;
1536 *key_nextp = key_accum;
1539 * The cluster must be resolved, out of sync elements may be present.
1541 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1543 if (null_count != i)
1544 hammer2_cluster_resolve(cluster);
1545 if (null_count == i ||
1546 (cluster->focus == NULL &&
1547 (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1548 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1549 hammer2_cluster_unlock(cluster);
1550 hammer2_cluster_drop(cluster);
1557 * Advance just one chain in the cluster and recalculate the invalid bit.
1558 * The cluster index is allowed to be flagged invalid on input and is
1559 * recalculated on return.
1561 * (used during synchronization to advance past a chain being deleted).
1563 * The chain being advanced must not be the focus and the clusters in
1564 * question must have already passed normal cluster_lookup/cluster_next
1567 * The cluster always remains intact on return, so void function.
1570 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1571 hammer2_cluster_t *cluster,
1572 hammer2_key_t *key_nextp,
1573 hammer2_key_t key_beg,
1574 hammer2_key_t key_end,
1577 hammer2_chain_t *ochain;
1578 hammer2_chain_t *nchain;
1579 hammer2_chain_t *focus;
1580 hammer2_key_t key_accum;
1581 hammer2_key_t key_next;
1584 key_accum = *key_nextp;
1585 key_next = *key_nextp;
1586 ochain = cluster->array[i].chain;
1589 KKASSERT(ochain != cluster->focus);
1591 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1592 &key_next, key_beg, key_end,
1593 &cparent->array[i].cache_index,
1595 /* ochain now invalid */
1596 if (cparent->focus_index == i)
1597 cparent->focus = cparent->array[i].chain;
1600 * Install nchain. Note that nchain can be NULL, and can also
1601 * be in an unlocked state depending on flags.
1603 cluster->array[i].chain = nchain;
1604 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1606 if (key_accum > key_next)
1607 key_accum = key_next;
1609 focus = cluster->focus;
1615 if (nchain == focus) /* ASSERTED NOT TRUE */
1618 ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1619 if (nchain->bref.type != focus->bref.type ||
1620 nchain->bref.key != focus->bref.key ||
1621 nchain->bref.keybits != focus->bref.keybits ||
1622 nchain->bref.modify_tid != focus->bref.modify_tid ||
1623 nchain->bytes != focus->bytes ||
1624 ddflag != cluster->ddflag) {
1625 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1629 *key_nextp = key_accum;
1632 * For now don't re-resolve cluster->flags.
1634 hammer2_cluster_resolve(cluster);
1639 * Mark a cluster deleted
1642 hammer2_cluster_delete(hammer2_cluster_t *cparent,
1643 hammer2_cluster_t *cluster, int flags)
1645 hammer2_chain_t *chain;
1646 hammer2_chain_t *parent;
1649 if (cparent == NULL) {
1650 kprintf("cparent is NULL\n");
1654 for (i = 0; i < cluster->nchains; ++i) {
1655 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1656 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1659 parent = cparent->array[i].chain;
1660 chain = cluster->array[i].chain;
1663 if (chain->parent != parent) {
1664 kprintf("hammer2_cluster_delete: parent "
1665 "mismatch chain=%p parent=%p against=%p\n",
1666 chain, chain->parent, parent);
1668 hammer2_chain_delete(parent, chain, flags);
1674 * Create a snapshot of the specified {parent, ochain} with the specified
1675 * label. The originating hammer2_inode must be exclusively locked for
1678 * The ioctl code has already synced the filesystem.
1681 hammer2_cluster_snapshot(hammer2_cluster_t *ocluster,
1682 hammer2_ioc_pfs_t *pmp)
1685 const hammer2_inode_data_t *ripdata;
1686 hammer2_inode_data_t *wipdata;
1687 hammer2_chain_t *nchain;
1688 hammer2_inode_t *nip;
1697 kprintf("snapshot %s\n", pmp->name);
1699 name_len = strlen(pmp->name);
1700 lhc = hammer2_dirhash(pmp->name, name_len);
1705 ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1707 opfs_clid = ripdata->meta.pfs_clid;
1709 hmp = ocluster->focus->hmp; /* XXX find synchronized local disk */
1712 * Create the snapshot directory under the super-root
1714 * Set PFS type, generate a unique filesystem id, and generate
1715 * a cluster id. Use the same clid when snapshotting a PFS root,
1716 * which theoretically allows the snapshot to be used as part of
1717 * the same cluster (perhaps as a cache).
1719 * Copy the (flushed) blockref array. Theoretically we could use
1720 * chain_duplicate() but it becomes difficult to disentangle
1721 * the shared core so for now just brute-force it.
1726 nip = hammer2_inode_create(hmp->spmp->iroot, &vat, proc0.p_ucred,
1727 pmp->name, name_len, 0,
1729 HAMMER2_INSERT_PFSROOT, &error);
1732 hammer2_inode_modify(nip);
1733 nchain = hammer2_inode_chain(nip, 0, HAMMER2_RESOLVE_ALWAYS);
1734 hammer2_chain_modify(nchain, 0);
1735 wipdata = &nchain->data->ipdata;
1737 nip->meta.pfs_type = HAMMER2_PFSTYPE_MASTER;
1738 nip->meta.pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1739 nip->meta.op_flags |= HAMMER2_OPFLAG_PFSROOT;
1740 kern_uuidgen(&nip->meta.pfs_fsid, 1);
1743 * Give the snapshot its own private cluster id. As a
1744 * snapshot no further synchronization with the original
1745 * cluster will be done.
1748 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1749 nip->meta.pfs_clid = opfs_clid;
1751 kern_uuidgen(&nip->meta.pfs_clid, 1);
1753 kern_uuidgen(&nip->meta.pfs_clid, 1);
1754 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1756 /* XXX hack blockset copy */
1757 /* XXX doesn't work with real cluster */
1758 KKASSERT(ocluster->nchains == 1);
1759 wipdata->meta = nip->meta;
1760 wipdata->u.blockset = ripdata->u.blockset;
1761 hammer2_flush(nchain, 1);
1762 hammer2_chain_unlock(nchain);
1763 hammer2_chain_drop(nchain);
1764 hammer2_inode_unlock(nip);
1769 /************************************************************************
1771 ************************************************************************
1774 * WARNING! blockref[] array data is not universal. These functions should
1775 * only be used to access universal data.
1777 * NOTE! The rdata call will wait for at least one of the chain I/Os to
1778 * complete if necessary. The I/O's should have already been
1779 * initiated by the cluster_lock/chain_lock operation.
1781 * The cluster must already be in a modified state before wdata
1782 * is called. The data will already be available for this case.
1784 const hammer2_media_data_t *
1785 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1787 KKASSERT(cluster->focus != NULL);
1788 return(cluster->focus->data);
1791 const hammer2_media_data_t *
1792 hammer2_cluster_rdata_bytes(hammer2_cluster_t *cluster, size_t *bytesp)
1794 KKASSERT(cluster->focus != NULL);
1795 *bytesp = cluster->focus->bytes;
1796 return(cluster->focus->data);
1799 hammer2_media_data_t *
1800 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1802 KKASSERT(cluster->focus != NULL);
1803 KKASSERT(hammer2_cluster_modified(cluster));
1804 return(cluster->focus->data);