2 * Copyright (c) 2013-2014 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * The cluster module collects multiple chains representing the same
36 * information into a single entity. It allows direct access to media
37 * data as long as it is not blockref array data. Meaning, basically,
38 * just inode and file data.
40 * This module also handles I/O dispatch, status rollup, and various
41 * mastership arrangements including quorum operations. It effectively
42 * presents one topology to the vnops layer.
44 * Many of the API calls mimic chain API calls but operate on clusters
45 * instead of chains. Please see hammer2_chain.c for more complete code
46 * documentation of the API functions.
48 #include <sys/cdefs.h>
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/types.h>
58 hammer2_cluster_bytes(hammer2_cluster_t *cluster)
60 return(cluster->focus->bytes);
64 hammer2_cluster_type(hammer2_cluster_t *cluster)
66 return(cluster->focus->bref.type);
70 * NOTE: When modifying a cluster object via hammer2_cluster_wdata()
71 * and hammer2_cluster_modsync(), remember that block array
72 * entries are not copied to the elements of the cluster.
74 const hammer2_media_data_t *
75 hammer2_cluster_data(hammer2_cluster_t *cluster)
77 return(cluster->focus->data);
80 hammer2_media_data_t *
81 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
83 return(cluster->focus->data);
87 hammer2_cluster_modified(hammer2_cluster_t *cluster)
89 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
93 hammer2_cluster_unlinked(hammer2_cluster_t *cluster)
95 return((cluster->focus->flags & HAMMER2_CHAIN_UNLINKED) != 0);
99 * Return a bref representative of the cluster. Any data offset is removed
100 * (since it would only be applicable to a particular chain in the cluster).
102 * However, the radix portion of data_off is used for many purposes and will
106 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
108 *bref = cluster->focus->bref;
109 bref->data_off &= HAMMER2_OFF_MASK_RADIX;
113 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
115 hammer2_chain_t *chain;
118 for (i = 0; i < cluster->nchains; ++i) {
119 chain = cluster->array[i];
121 atomic_set_int(&chain->flags, flags);
126 hammer2_cluster_setsubmod(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
128 hammer2_chain_t *chain;
131 for (i = 0; i < cluster->nchains; ++i) {
132 chain = cluster->array[i];
134 hammer2_chain_setsubmod(trans, chain);
139 * Create a cluster with one ref from the specified chain. The chain
140 * is not further referenced. The caller typically supplies a locked
141 * chain and transfers ownership to the cluster.
144 hammer2_cluster_from_chain(hammer2_chain_t *chain)
146 hammer2_cluster_t *cluster;
148 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
149 cluster->array[0] = chain;
150 cluster->nchains = 1;
151 cluster->focus = chain;
152 cluster->pmp = chain->pmp;
/*
 * NOTE(review): garbled numbered listing.  The leading integers are the
 * original file's line numbers; gaps in the numbering show dropped lines
 * (braces, `switch(bref->type) {`, `break;`, `default:`, declarations such
 * as `int i;`, and the function's return-type/return-value lines).  Code
 * below is left byte-identical; restore from the original hammer2_cluster.c.
 *
 * Purpose (from visible code): allocate a cluster mirroring the PFS root
 * cluster (pmp->iroot->cluster), allocating one chain per root chain via
 * hammer2_chain_alloc() and initializing bytes/flags/delete_xid on each.
 */
159 * Allocates a cluster and its underlying chain structures. The underlying
160 * chains will be locked. The cluster and underlying chains will have one
164 hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
165 hammer2_trans_t *trans, hammer2_blockref_t *bref)
167 hammer2_cluster_t *cluster;
168 hammer2_cluster_t *rcluster;
169 hammer2_chain_t *chain;
170 u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
173 KKASSERT(pmp != NULL);
/* a `switch(bref->type) {` statement was dropped before the cases below */
176 * Construct the appropriate system structure.
179 case HAMMER2_BREF_TYPE_INODE:
180 case HAMMER2_BREF_TYPE_INDIRECT:
181 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
182 case HAMMER2_BREF_TYPE_DATA:
183 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
185 * Chain's are really only associated with the hmp but we
186 * maintain a pmp association for per-mount memory tracking
187 * purposes. The pmp can be NULL.
190 case HAMMER2_BREF_TYPE_VOLUME:
191 case HAMMER2_BREF_TYPE_FREEMAP:
193 panic("hammer2_cluster_alloc volume type illegal for op")
196 panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
200 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
/* one chain is allocated per chain of the PFS root cluster */
203 rcluster = &pmp->iroot->cluster;
204 for (i = 0; i < rcluster->nchains; ++i) {
205 chain = hammer2_chain_alloc(rcluster->array[i]->hmp,
207 chain->hmp = rcluster->array[i]->hmp;
209 chain->bytes = bytes;
211 chain->flags = HAMMER2_CHAIN_ALLOCATED;
/* delete_xid = MAX means "not deleted" */
212 chain->delete_xid = HAMMER2_XID_MAX;
215 * Set modify_tid if a transaction is creating the inode.
216 * Enforce update_xlo = 0 so nearby transactions do not think
217 * it has been flushed when it hasn't.
219 * NOTE: When loading a chain from backing store or creating a
220 * snapshot, trans will be NULL and the caller is
221 * responsible for setting these fields.
/* presumably guarded by `if (trans)` on the dropped line 223 -- TODO confirm */
224 chain->modify_xid = trans->sync_xid;
225 chain->update_xlo = 0;
227 cluster->array[i] = chain;
229 cluster->nchains = i;
231 cluster->focus = cluster->array[0];
237 * Associate an existing core with the chain or allocate a new core.
239 * The core is not locked. No additional refs on the chain are made.
240 * (trans) must not be NULL if (core) is not NULL.
242 * When chains are delete-duplicated during flushes we insert nchain on
243 * the ownerq after ochain instead of at the end in order to give the
244 * drop code visibility in the correct order, otherwise drops can be missed.
247 hammer2_cluster_core_alloc(hammer2_trans_t *trans,
248 hammer2_cluster_t *ncluster,
249 hammer2_cluster_t *ocluster)
253 for (i = 0; i < ocluster->nchains; ++i) {
254 if (ncluster->array[i]) {
255 hammer2_chain_core_alloc(trans,
263 * Add a reference to a cluster.
265 * We must also ref the underlying chains in order to allow ref/unlock
266 * sequences to later re-lock.
269 hammer2_cluster_ref(hammer2_cluster_t *cluster)
271 hammer2_chain_t *chain;
274 atomic_add_int(&cluster->refs, 1);
275 for (i = 0; i < cluster->nchains; ++i) {
276 chain = cluster->array[i];
278 hammer2_chain_ref(chain);
283 * Drop the caller's reference to the cluster. When the ref count drops to
284 * zero this function frees the cluster and drops all underlying chains.
287 hammer2_cluster_drop(hammer2_cluster_t *cluster)
289 hammer2_chain_t *chain;
292 KKASSERT(cluster->refs > 0);
293 for (i = 0; i < cluster->nchains; ++i) {
294 chain = cluster->array[i];
296 hammer2_chain_drop(chain);
297 if (cluster->refs == 1)
298 cluster->array[i] = NULL;
301 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
302 cluster->focus = NULL;
303 kfree(cluster, M_HAMMER2);
304 /* cluster = NULL; safety */
309 hammer2_cluster_wait(hammer2_cluster_t *cluster)
311 tsleep(cluster->focus, 0, "h2clcw", 1);
/*
 * NOTE(review): garbled numbered listing -- the dropped lines (gaps in the
 * embedded numbering) include the return type, `int error;`, the
 * chain-NULL guard, and the error-unwind loop header.  Visible code shows:
 * the cluster is ref'd, each chain is locked via hammer2_chain_lock(how),
 * and on a lock error the already-locked chains appear to be unlocked and
 * the cluster ref dropped -- TODO confirm the unwind against the original.
 */
315 * Lock and ref a cluster. This adds a ref to the cluster and its chains
316 * and then locks them.
319 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
321 hammer2_chain_t *chain;
326 atomic_add_int(&cluster->refs, 1);
327 for (i = 0; i < cluster->nchains; ++i) {
328 chain = cluster->array[i];
330 error = hammer2_chain_lock(chain, how);
/* error path: unwind locks taken so far, drop the cluster ref */
333 hammer2_chain_unlock(cluster->array[i]);
334 atomic_add_int(&cluster->refs, -1);
343 * Replace the contents of dst with src, adding a reference to src's chains.
344 * dst is assumed to already have a ref and any chains present in dst are
345 * assumed to be locked and will be unlocked.
347 * If the chains in src are locked, only one of (src) or (dst) should be
348 * considered locked by the caller after return, not both.
351 hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src)
353 hammer2_chain_t *chain;
356 KKASSERT(dst->refs == 1);
359 for (i = 0; i < src->nchains; ++i) {
360 chain = src->array[i];
362 hammer2_chain_ref(chain);
363 if (i < dst->nchains && dst->array[i])
364 hammer2_chain_unlock(dst->array[i]);
365 dst->array[i] = chain;
366 if (dst->focus == NULL)
370 while (i < dst->nchains) {
371 chain = dst->array[i];
373 hammer2_chain_unlock(chain);
374 dst->array[i] = NULL;
378 dst->nchains = src->nchains;
382 * Replace the contents of the locked destination with the contents of the
383 * locked source. Destination must have one ref.
385 * Returns with the destination still with one ref and the copied chains
386 * with an additional lock (representing their state on the destination).
387 * The original chains associated with the destination are unlocked.
390 hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
392 hammer2_chain_t *chain;
395 KKASSERT(dst->refs == 1);
398 for (i = 0; i < src->nchains; ++i) {
399 chain = src->array[i];
401 hammer2_chain_lock(chain, 0);
402 if (i < dst->nchains && dst->array[i])
403 hammer2_chain_unlock(dst->array[i]);
404 dst->array[i] = src->array[i];
405 if (dst->focus == NULL)
409 while (i < dst->nchains) {
410 chain = dst->array[i];
412 hammer2_chain_unlock(chain);
413 dst->array[i] = NULL;
417 dst->nchains = src->nchains;
421 * Copy a cluster, returned a ref'd cluster. All underlying chains
422 * are also ref'd, but not locked.
424 * If HAMMER2_CLUSTER_COPY_CHAINS is specified, the chains are copied
425 * to the new cluster and a reference is nominally added to them and to
426 * the cluster. The cluster will have 1 ref.
428 * If HAMMER2_CLUSTER_COPY_NOREF is specified along with CHAINS, the chains
429 * are copied but no additional references are made and the cluster will have
430 * 0 refs. Callers must ref the cluster and the chains before dropping it
431 * (typically by locking it).
433 * If flags are passed as 0 the copy is setup as if it contained the chains
434 * but the chains will not be copied over, and the cluster will have 0 refs.
435 * Callers must ref the cluster before dropping it (typically by locking it).
438 hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags)
440 hammer2_pfsmount_t *pmp = ocluster->pmp;
441 hammer2_cluster_t *ncluster;
442 hammer2_chain_t *chain;
445 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
447 ncluster->nchains = ocluster->nchains;
448 ncluster->focus = ocluster->focus;
449 ncluster->refs = (copy_flags & HAMMER2_CLUSTER_COPY_NOREF) ? 0 : 1;
450 if ((copy_flags & HAMMER2_CLUSTER_COPY_NOCHAINS) == 0) {
451 for (i = 0; i < ocluster->nchains; ++i) {
452 chain = ocluster->array[i];
453 ncluster->array[i] = chain;
454 if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0 &&
456 hammer2_chain_ref(chain);
464 * Unlock and deref a cluster. The cluster is destroyed if this is the
468 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
470 hammer2_chain_t *chain;
473 KKASSERT(cluster->refs > 0);
474 for (i = 0; i < cluster->nchains; ++i) {
475 chain = cluster->array[i];
477 hammer2_chain_unlock(chain);
478 if (cluster->refs == 1)
479 cluster->array[i] = NULL; /* safety */
482 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
483 cluster->focus = NULL;
484 kfree(cluster, M_HAMMER2);
485 /* cluster = NULL; safety */
490 * Refactor the locked chains of a cluster.
493 hammer2_cluster_refactor(hammer2_cluster_t *cluster)
497 cluster->focus = NULL;
498 for (i = 0; i < cluster->nchains; ++i) {
499 if (cluster->array[i]) {
500 hammer2_chain_refactor(&cluster->array[i]);
501 if (cluster->focus == NULL)
502 cluster->focus = cluster->array[i];
/*
 * NOTE(review): garbled numbered listing -- the dropped lines include the
 * return type, braces, `int i;`, and (critically) the remaining argument
 * lines of the hammer2_chain_resize() call at original lines 526-528.
 * Visible code shows: for each non-NULL chain (with a matching non-NULL
 * parent chain asserted), the chain is resized to 2^nradix bytes and the
 * focus is re-selected as the first resized chain.  Restore the dropped
 * call arguments from the original hammer2_cluster.c.
 */
508 * Resize the cluster's physical storage allocation in-place. This may
509 * replace the cluster's chains.
512 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
513 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
514 int nradix, int flags)
518 KKASSERT(cparent->pmp == cluster->pmp); /* can be NULL */
519 KKASSERT(cparent->nchains == cluster->nchains);
521 cluster->focus = NULL;
522 for (i = 0; i < cluster->nchains; ++i) {
523 if (cluster->array[i]) {
524 KKASSERT(cparent->array[i]);
/* remaining arguments of this call were dropped by the extraction */
525 hammer2_chain_resize(trans, ip,
529 if (cluster->focus == NULL)
530 cluster->focus = cluster->array[i];
536 * Set an inode's cluster modified, marking the related chains RW and
537 * duplicating them if necessary.
539 * The passed-in chain is a localized copy of the chain previously acquired
540 * when the inode was locked (and possilby replaced in the mean time), and
541 * must also be updated. In fact, we update it first and then synchronize
542 * the inode's cluster cache.
544 hammer2_inode_data_t *
545 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
546 hammer2_cluster_t *cluster, int flags)
548 atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
549 hammer2_cluster_modify(trans, cluster, flags);
551 hammer2_inode_repoint(ip, NULL, cluster);
554 return (&hammer2_cluster_wdata(cluster)->ipdata);
558 * Adjust the cluster's chains to allow modification.
561 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
566 cluster->focus = NULL;
567 for (i = 0; i < cluster->nchains; ++i) {
568 if (cluster->array[i]) {
569 hammer2_chain_modify(trans, &cluster->array[i], flags);
570 if (cluster->focus == NULL)
571 cluster->focus = cluster->array[i];
577 * Synchronize modifications with other chains in a cluster.
579 * Nominal front-end operations only edit non-block-table data in a single
580 * chain. This code copies such modifications to the other chains in the
583 /* hammer2_cluster_modsync() */
586 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
588 hammer2_chain_t *focus;
589 hammer2_chain_t *scan;
590 const hammer2_inode_data_t *ripdata;
591 hammer2_inode_data_t *wipdata;
594 focus = cluster->focus;
595 KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
597 for (i = 0; i < cluster->nchains; ++i) {
598 scan = cluster->array[i];
599 if (scan == NULL || scan == focus)
601 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
602 KKASSERT(focus->bytes == scan->bytes &&
603 focus->bref.type == scan->bref.type);
604 switch(focus->bref.type) {
605 case HAMMER2_BREF_TYPE_INODE:
606 ripdata = &focus->data->ipdata;
607 wipdata = &scan->data->ipdata;
608 if ((ripdata->op_flags &
609 HAMMER2_OPFLAG_DIRECTDATA) == 0) {
610 bcopy(ripdata, wipdata,
611 offsetof(hammer2_inode_data_t, u));
615 case HAMMER2_BREF_TYPE_DATA:
616 bcopy(focus->data, scan->data, focus->bytes);
618 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
619 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
620 case HAMMER2_BREF_TYPE_FREEMAP:
621 case HAMMER2_BREF_TYPE_VOLUME:
622 panic("hammer2_cluster_modsync: illegal node type");
626 panic("hammer2_cluster_modsync: unknown node type");
633 * Lookup initialization/completion API
636 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
638 hammer2_cluster_t *cluster;
641 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
642 cluster->pmp = cparent->pmp; /* can be NULL */
643 /* cluster->focus = NULL; already null */
645 for (i = 0; i < cparent->nchains; ++i) {
646 cluster->array[i] = cparent->array[i];
647 if (cluster->focus == NULL)
648 cluster->focus = cluster->array[i];
650 cluster->nchains = cparent->nchains;
653 * Independently lock (this will also give cluster 1 ref)
655 if (flags & HAMMER2_LOOKUP_SHARED) {
656 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
657 HAMMER2_RESOLVE_SHARED);
659 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
665 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
668 hammer2_cluster_unlock(cparent);
/*
 * NOTE(review): garbled numbered listing -- dropped lines include the
 * return type, braces, short declarations (`int i;`, `int ddflag;`,
 * `int null_count;`), `++null_count;`/`continue;` statements, the trailing
 * argument of the hammer2_chain_lookup() call, the `if (chain == NULL)`
 * / `else` structure around the focus block, and the final
 * `return (cluster);`.  Code is left byte-identical.
 *
 * Purpose (from visible code): allocate a result cluster, run a per-chain
 * hammer2_chain_lookup() under each parent chain, assert that all found
 * chains agree on bref type/key/keybits/bytes/ddflag against the focus,
 * accumulate the minimum key_next, and drop/NULL the result cluster when
 * every element came up empty (null_count == i).
 */
672 * Locate first match or overlap under parent, return a new cluster
675 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
676 hammer2_key_t key_beg, hammer2_key_t key_end,
677 int flags, int *ddflagp)
679 hammer2_pfsmount_t *pmp;
680 hammer2_cluster_t *cluster;
681 hammer2_chain_t *chain;
682 hammer2_key_t key_accum;
683 hammer2_key_t key_next;
684 hammer2_key_t bref_key;
692 pmp = cparent->pmp; /* can be NULL */
693 key_accum = *key_nextp;
700 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
701 cluster->pmp = pmp; /* can be NULL */
703 /* cluster->focus = NULL; already null */
704 cparent->focus = NULL;
707 for (i = 0; i < cparent->nchains; ++i) {
708 key_next = *key_nextp;
/* NULL parent slot: count as a null element and skip (dropped lines) */
709 if (cparent->array[i] == NULL) {
713 chain = hammer2_chain_lookup(&cparent->array[i], &key_next,
715 &cparent->cache_index[i],
717 if (cparent->focus == NULL)
718 cparent->focus = cparent->array[i];
719 cluster->array[i] = chain;
/* first non-NULL result becomes the focus and sets the reference values */
723 if (cluster->focus == NULL) {
724 bref_type = chain->bref.type;
725 bref_key = chain->bref.key;
726 bref_keybits = chain->bref.keybits;
727 bytes = chain->bytes;
729 cluster->focus = chain;
/* all other results must agree with the focus */
731 KKASSERT(bref_type == chain->bref.type);
732 KKASSERT(bref_key == chain->bref.key);
733 KKASSERT(bref_keybits == chain->bref.keybits);
734 KKASSERT(bytes == chain->bytes);
735 KKASSERT(*ddflagp == ddflag);
/* propagate the most restrictive (smallest) key_next */
737 if (key_accum > key_next)
738 key_accum = key_next;
740 *key_nextp = key_accum;
741 cluster->nchains = i;
/* all elements empty: nothing found, release the result cluster */
743 if (null_count == i) {
744 hammer2_cluster_drop(cluster);
/*
 * NOTE(review): garbled numbered listing -- dropped lines include the
 * return type, braces, short declarations, `continue;` statements, the
 * `if (chain)` guards, null_count increments, and the final
 * `return (cluster);`.  Code is left byte-identical.
 *
 * Purpose (from visible code): iterate each element of (cluster) to the
 * next match under the matching (cparent) element via
 * hammer2_chain_next(); elements whose parent slot has gone NULL have
 * their old chain dropped/unlocked instead.  The minimum key_next is
 * accumulated and the cluster is dropped when every element is NULL.
 */
752 * Locate next match or overlap under parent, replace cluster
755 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
756 hammer2_key_t *key_nextp,
757 hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
759 hammer2_chain_t *chain;
760 hammer2_key_t key_accum;
761 hammer2_key_t key_next;
765 key_accum = *key_nextp;
767 cluster->focus = NULL;
768 cparent->focus = NULL;
770 for (i = 0; i < cparent->nchains; ++i) {
771 key_next = *key_nextp;
772 chain = cluster->array[i];
774 if (cparent->focus == NULL)
775 cparent->focus = cparent->array[i];
/* parent slot went NULL: release the old element chain and skip */
779 if (cparent->array[i] == NULL) {
780 if (flags & HAMMER2_LOOKUP_NOLOCK)
781 hammer2_chain_drop(chain);
783 hammer2_chain_unlock(chain);
787 chain = hammer2_chain_next(&cparent->array[i], chain,
788 &key_next, key_beg, key_end,
789 &cparent->cache_index[i], flags);
790 if (cparent->focus == NULL)
791 cparent->focus = cparent->array[i];
792 cluster->array[i] = chain;
795 } else if (cluster->focus == NULL) {
796 cluster->focus = chain;
/* propagate the most restrictive (smallest) key_next */
798 if (key_accum > key_next)
799 key_accum = key_next;
/* all elements empty: iteration terminated, release the cluster */
802 if (null_count == i) {
803 hammer2_cluster_drop(cluster);
/*
 * NOTE(review): garbled numbered listing -- dropped lines include the
 * return type, braces, short declarations, `if (chain)` guards,
 * null_count handling, and the final return.  Code is left byte-identical.
 *
 * Purpose (from visible code): raw blockref iteration without seeking to
 * a key -- each element is advanced via hammer2_chain_scan(); elements
 * whose parent slot has gone NULL have their old chain dropped/unlocked.
 * The cluster is dropped when every element is NULL.
 */
811 * XXX initial NULL cluster needs reworking (pass **clusterp ?)
813 * The raw scan function is similar to lookup/next but does not seek to a key.
814 * Blockrefs are iterated via first_chain = (parent, NULL) and
815 * next_chain = (parent, chain).
817 * The passed-in parent must be locked and its data resolved. The returned
818 * chain will be locked. Pass chain == NULL to acquire the first sub-chain
819 * under parent and then iterate with the passed-in chain (which this
820 * function will unlock).
823 hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
826 hammer2_chain_t *chain;
832 for (i = 0; i < cparent->nchains; ++i) {
833 chain = cluster->array[i];
/* parent slot went NULL: release the old element chain and skip */
838 if (cparent->array[i] == NULL) {
839 if (flags & HAMMER2_LOOKUP_NOLOCK)
840 hammer2_chain_drop(chain);
842 hammer2_chain_unlock(chain);
847 chain = hammer2_chain_scan(cparent->array[i], chain,
848 &cparent->cache_index[i], flags);
849 cluster->array[i] = chain;
/* all elements empty: scan terminated, release the cluster */
854 if (null_count == i) {
855 hammer2_cluster_drop(cluster);
/*
 * NOTE(review): garbled numbered listing -- dropped lines include the
 * return type, braces, short declarations (`int error;`, `int i;`),
 * `continue;`, the `*clusterp = cluster;`/return tail, and part of the
 * comment at original lines 888-890.  Code is left byte-identical.
 *
 * Purpose (from visible code): create a new cluster element-by-element
 * under (cparent) via hammer2_chain_create(), reusing *clusterp if the
 * caller supplied one (skipping its NULL slots), asserting each create
 * succeeds, and re-selecting both parent and new-cluster foci.
 */
864 * Create a new cluster using the specified key
867 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
868 hammer2_cluster_t **clusterp,
869 hammer2_key_t key, int keybits, int type, size_t bytes)
871 hammer2_cluster_t *cluster;
872 hammer2_pfsmount_t *pmp;
876 pmp = trans->pmp; /* can be NULL */
/* allocate a fresh result cluster only if the caller did not supply one */
878 if ((cluster = *clusterp) == NULL) {
879 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
881 cluster->pmp = pmp; /* can be NULL */
884 cluster->focus = NULL;
885 cparent->focus = NULL;
888 * NOTE: cluster->array[] entries can initially be NULL. If
889 * *clusterp is supplied, skip NULL entries, otherwise
892 for (i = 0; i < cparent->nchains; ++i) {
893 if (*clusterp && cluster->array[i] == NULL) {
894 if (cparent->focus == NULL)
895 cparent->focus = cparent->array[i];
898 error = hammer2_chain_create(trans, &cparent->array[i],
899 &cluster->array[i], pmp,
900 key, keybits, type, bytes);
901 KKASSERT(error == 0);
902 if (cparent->focus == NULL)
903 cparent->focus = cparent->array[i];
904 if (cluster->focus == NULL)
905 cluster->focus = cluster->array[i];
907 cluster->nchains = i;
/*
 * NOTE(review): garbled numbered listing -- dropped lines include the
 * return type, braces, `if (chain)` / `if (bref)` / `else` structure, and
 * (critically) the argument lines of the two hammer2_chain_duplicate()
 * calls at original lines 941-945 and 947-951.  Code is left
 * byte-identical; restore the dropped call arguments from the original.
 *
 * Purpose (from visible code): duplicate each chain of (cluster) under
 * (cparent); when a bref is supplied only its key/keybits are used (copied
 * into xbref), otherwise the chain's own bref is used.
 */
914 * Duplicate a cluster under a new parent.
916 * WARNING! Unlike hammer2_chain_duplicate(), only the key and keybits fields
917 * are used from a passed-in non-NULL bref pointer. All other fields
918 * are extracted from the original chain for each chain in the
922 hammer2_cluster_duplicate(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
923 hammer2_cluster_t *cluster, hammer2_blockref_t *bref,
924 int snapshot, int duplicate_reason)
926 hammer2_chain_t *chain;
927 hammer2_blockref_t xbref;
930 cluster->focus = NULL;
931 cparent->focus = NULL;
933 for (i = 0; i < cluster->nchains; ++i) {
934 chain = cluster->array[i];
/* bref supplied: seed xbref from the chain, override only key/keybits */
938 xbref.key = bref->key;
939 xbref.keybits = bref->keybits;
940 hammer2_chain_duplicate(trans,
/* no bref supplied: duplicate with the chain's own bref */
946 hammer2_chain_duplicate(trans,
952 cluster->array[i] = chain;
953 if (cluster->focus == NULL)
954 cluster->focus = chain;
955 if (cparent->focus == NULL)
956 cparent->focus = cparent->array[i];
958 if (cparent->focus == NULL)
959 cparent->focus = cparent->array[i];
965 * Delete-duplicate a cluster in-place.
968 hammer2_cluster_delete_duplicate(hammer2_trans_t *trans,
969 hammer2_cluster_t *cluster, int flags)
971 hammer2_chain_t *chain;
974 cluster->focus = NULL;
975 for (i = 0; i < cluster->nchains; ++i) {
976 chain = cluster->array[i];
978 hammer2_chain_delete_duplicate(trans, &chain, flags);
979 cluster->array[i] = chain;
980 if (cluster->focus == NULL)
981 cluster->focus = chain;
987 * Mark a cluster deleted
990 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
993 hammer2_chain_t *chain;
996 for (i = 0; i < cluster->nchains; ++i) {
997 chain = cluster->array[i];
999 hammer2_chain_delete(trans, chain, flags);
/*
 * NOTE(review): garbled numbered listing -- dropped lines include the
 * return type, braces, short declarations (name_len, lhc, error, uuids,
 * vattr), the vattr initialization, the trailing arguments of the
 * hammer2_inode_create() call at original lines 1052-1053, the
 * `else` keyword before line 1061, and the final return of the error
 * code.  Code is left byte-identical; restore from the original.
 *
 * Purpose (from visible code): create a snapshot directory entry under
 * the super-root named pfs->name, mark it PFSTYPE_SNAPSHOT, generate a
 * fresh fsid (and a fresh clid unless snapshotting a PFS root, which
 * reuses the old clid), then brute-force copy the flushed blockset of
 * the (single-chain) source cluster into the new inode and sync it.
 */
1004 * Create a snapshot of the specified {parent, ochain} with the specified
1005 * label. The originating hammer2_inode must be exclusively locked for
1008 * The ioctl code has already synced the filesystem.
1011 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1012 hammer2_ioc_pfs_t *pfs)
1014 hammer2_mount_t *hmp;
1015 hammer2_cluster_t *ncluster;
1016 const hammer2_inode_data_t *ipdata;
1017 hammer2_inode_data_t *wipdata;
1018 hammer2_inode_t *nip;
1025 kprintf("snapshot %s\n", pfs->name);
1027 name_len = strlen(pfs->name);
1028 lhc = hammer2_dirhash(pfs->name, name_len);
1030 ipdata = &hammer2_cluster_data(ocluster)->ipdata;
1031 opfs_clid = ipdata->pfs_clid;
1032 hmp = ocluster->focus->hmp;
1035 * Create the snapshot directory under the super-root
1037 * Set PFS type, generate a unique filesystem id, and generate
1038 * a cluster id. Use the same clid when snapshotting a PFS root,
1039 * which theoretically allows the snapshot to be used as part of
1040 * the same cluster (perhaps as a cache).
1042 * Copy the (flushed) blockref array. Theoretically we could use
1043 * chain_duplicate() but it becomes difficult to disentangle
1044 * the shared core so for now just brute-force it.
/* trailing arguments of this call were dropped by the extraction */
1050 nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1051 proc0.p_ucred, pfs->name, name_len,
1055 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1056 wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
1057 kern_uuidgen(&wipdata->pfs_fsid, 1);
/* PFS root: reuse the old cluster id so the snapshot can join the cluster */
1058 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSROOT)
1059 wipdata->pfs_clid = opfs_clid;
1061 kern_uuidgen(&wipdata->pfs_clid, 1);
1062 hammer2_cluster_set_chainflags(ncluster, HAMMER2_CHAIN_PFSROOT);
1064 /* XXX hack blockset copy */
1065 /* XXX doesn't work with real cluster */
1066 KKASSERT(ocluster->nchains == 1);
1067 wipdata->u.blockset = ocluster->focus->data->ipdata.u.blockset;
1068 hammer2_cluster_modsync(ncluster);
1069 hammer2_inode_unlock_ex(nip, ncluster);
1074 /************************************************************************
1076 ************************************************************************
1078 * A node failure can occur for numerous reasons.
1080 * - A read I/O may fail
1081 * - A write I/O may fail
1082 * - An unexpected chain might be found (or be missing)
1083 * - A node might disconnect temporarily and reconnect later
1084 * (for example, a USB stick could get pulled, or a node might
1085 * be programmatically disconnected).
1086 * - A node might run out of space during a modifying operation.
1088 * When a read failure or an unexpected chain state is found, the chain and
1089 * parent chain at the failure point for the nodes involved (the nodes
1090 * which we determine to be in error) are flagged as failed and removed
1091 * from the cluster. The node itself is allowed to remain active. The
1092 * highest common point (usually a parent chain) is queued to the
1093 * resynchronization thread for action.
1095 * When a write I/O fails or a node runs out of space, we first adjust
1096 * as if a read failure occurs but we further disable flushes on the
1097 * ENTIRE node. Concurrent modifying transactions are allowed to complete
1098 * but any new modifying transactions will automatically remove the node
1099 * from consideration in all related cluster structures and not generate
1100 * any new modified chains. The ROOT chain for the failed node(s) is queued
1101 * to the resynchronization thread for action.
1103 * A temporary disconnect is handled as if a write failure occurred.
1105 * Any of these failures might or might not stall related high level VNOPS,
1106 * depending on what has failed, what nodes remain, the type of cluster,
1107 * and the operating state of the cluster.
1109 * FLUSH ON WRITE-DISABLED NODES
1111 * A flush on a write-disabled node is not allowed to write anything because
1112 * we cannot safely update the mirror_tid anywhere on the failed node. The
1113 * synchronization thread uses mirror_tid to calculate incremental resyncs.
1114 * Dirty meta-data related to the failed node is thrown away.
1116 * Dirty buffer cache buffers and inodes are only thrown away if they can be
1117 * retired... that is, if the filesystem still has enough nodes to complete
1121 /************************************************************************
1122 * SYNCHRONIZATION THREAD *
1123 ************************************************************************
1125 * This thread is responsible for [re]synchronizing the cluster representing
1126 * a PFS. Any out-of-sync or failed node starts this thread on a
1127 * node-by-node basis when the failure is detected.
1129 * Clusters needing resynchronization are queued at the highest point
1130 * where the parent on the failed node is still valid, or a special
1131 * incremental scan from the ROOT is queued if no parent exists. This
1132 * thread is also responsible for waiting for reconnections of the failed
1133 * node if the cause was due to a disconnect, and waiting for space to be
1134 * freed up if the cause was due to running out of space.
1136 * If the cause is due to a node running out of space, this thread will also
1137 * remove older (unlocked) snapshots to make new space, recover space, and
1138 * then start resynchronization.
1140 * Each resynchronization pass virtually snapshots the PFS on the good nodes
1141 * and synchronizes using that snapshot against the target node. This
1142 * ensures a consistent chain topology and also avoids interference between
1143 * the resynchronization thread and frontend operations.
1145 * Since these are per-node threads it is possible to resynchronize several