2 * Copyright (c) 2013-2014 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * The cluster module collects multiple chains representing the same
36 * information into a single entity. It allows direct access to media
37 * data as long as it is not blockref array data. Meaning, basically,
38 * just inode and file data.
40 * This module also handles I/O dispatch, status rollup, and various
41 * mastership arrangements including quorum operations. It effectively
42 * presents one topology to the vnops layer.
44 * Many of the API calls mimic chain API calls but operate on clusters
45 * instead of chains. Please see hammer2_chain.c for more complete code
46 * documentation of the API functions.
48 #include <sys/cdefs.h>
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/types.h>
58 * Returns TRUE if any chain in the cluster needs to be resized.
61 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
63 hammer2_chain_t *chain;
/* Compare each live chain's current allocation against the requested size. */
66 for (i = 0; i < cluster->nchains; ++i) {
67 chain = cluster->array[i];
/* NULL slots are skipped; any byte-count mismatch means a resize is needed. */
68 if (chain && chain->bytes != bytes)
/* Return the blockref type of the cluster's focus chain. */
75 hammer2_cluster_type(hammer2_cluster_t *cluster)
77 return(cluster->focus->bref.type);
81 * NOTE: When modifying a cluster object via hammer2_cluster_wdata()
82 * and hammer2_cluster_modsync(), remember that block array
83 * entries are not copied to the elements of the cluster.
/* Read-only access to the focus chain's media data. */
85 const hammer2_media_data_t *
86 hammer2_cluster_data(hammer2_cluster_t *cluster)
88 return(cluster->focus->data);
/*
 * Writable access to the focus chain's media data.  See the NOTE on
 * hammer2_cluster_data() regarding block array entries.
 */
91 hammer2_media_data_t *
92 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
94 return(cluster->focus->data);
/* Return non-zero if the focus chain is flagged MODIFIED. */
98 hammer2_cluster_modified(hammer2_cluster_t *cluster)
100 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
104 * Return a bref representative of the cluster. Any data offset is removed
105 * (since it would only be applicable to a particular chain in the cluster).
107 * However, the radix portion of data_off is used for many purposes and will
111 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
113 *bref = cluster->focus->bref;
/* Keep only the radix bits of data_off; the offset itself is chain-local. */
114 bref->data_off &= HAMMER2_OFF_MASK_RADIX;
/* Atomically OR the given flag bits into every chain in the cluster. */
118 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
120 hammer2_chain_t *chain;
123 for (i = 0; i < cluster->nchains; ++i) {
124 chain = cluster->array[i];
126 atomic_set_int(&chain->flags, flags);
/* Propagate a setflush request to every chain in the cluster. */
131 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
133 hammer2_chain_t *chain;
136 for (i = 0; i < cluster->nchains; ++i) {
137 chain = cluster->array[i];
139 hammer2_chain_setflush(trans, chain);
/*
 * Set the check-code algorithm in the bref methods field of every chain
 * in the cluster.  Chains must already be flagged MODIFIED.
 */
144 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
145 hammer2_cluster_t *cluster,
148 hammer2_chain_t *chain;
151 for (i = 0; i < cluster->nchains; ++i) {
152 chain = cluster->array[i];
154 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
/* Clear the old check bits, then encode the new algorithm. */
155 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
156 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
162 * Create a cluster with one ref from the specified chain. The chain
163 * is not further referenced. The caller typically supplies a locked
164 * chain and transfers ownership to the cluster.
167 hammer2_cluster_from_chain(hammer2_chain_t *chain)
169 hammer2_cluster_t *cluster;
171 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
/* Single-chain cluster; the supplied chain doubles as the focus. */
172 cluster->array[0] = chain;
173 cluster->nchains = 1;
174 cluster->focus = chain;
175 cluster->pmp = chain->pmp;
182 * Allocates a cluster and its underlying chain structures. The underlying
183 * chains will be locked. The cluster and underlying chains will have one
187 hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
188 hammer2_trans_t *trans, hammer2_blockref_t *bref)
190 hammer2_cluster_t *cluster;
191 hammer2_cluster_t *rcluster;
192 hammer2_chain_t *chain;
/* Physical allocation size is encoded as a radix in data_off. */
194 u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
198 KKASSERT(pmp != NULL);
201 * Construct the appropriate system structure.
/* Only regular topology types may be allocated through this path. */
204 case HAMMER2_BREF_TYPE_INODE:
205 case HAMMER2_BREF_TYPE_INDIRECT:
206 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
207 case HAMMER2_BREF_TYPE_DATA:
208 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
210 * Chain's are really only associated with the hmp but we
211 * maintain a pmp association for per-mount memory tracking
212 * purposes. The pmp can be NULL.
/* Volume and freemap roots are pre-existing and cannot be allocated. */
215 case HAMMER2_BREF_TYPE_VOLUME:
216 case HAMMER2_BREF_TYPE_FREEMAP:
218 panic("hammer2_cluster_alloc volume type illegal for op");
221 panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
225 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
/*
 * Allocate one chain per node in the PFS root cluster, mirroring its
 * hmp associations.
 */
228 rcluster = &pmp->iroot->cluster;
229 for (i = 0; i < rcluster->nchains; ++i) {
230 chain = hammer2_chain_alloc(rcluster->array[i]->hmp,
233 chain->hmp = rcluster->array[i]->hmp;
235 chain->bytes = bytes;
237 chain->flags = HAMMER2_CHAIN_ALLOCATED;
241 * NOTE: When loading a chain from backing store or creating a
242 * snapshot, trans will be NULL and the caller is
243 * responsible for setting these fields.
245 cluster->array[i] = chain;
247 cluster->nchains = i;
/* First chain becomes the focus. */
249 cluster->focus = cluster->array[0];
255 * Add a reference to a cluster.
257 * We must also ref the underlying chains in order to allow ref/unlock
258 * sequences to later re-lock.
261 hammer2_cluster_ref(hammer2_cluster_t *cluster)
263 hammer2_chain_t *chain;
266 atomic_add_int(&cluster->refs, 1);
/* Ref each underlying chain as well so it survives a later unlock. */
267 for (i = 0; i < cluster->nchains; ++i) {
268 chain = cluster->array[i];
270 hammer2_chain_ref(chain);
275 * Drop the caller's reference to the cluster. When the ref count drops to
276 * zero this function frees the cluster and drops all underlying chains.
279 hammer2_cluster_drop(hammer2_cluster_t *cluster)
281 hammer2_chain_t *chain;
284 KKASSERT(cluster->refs > 0);
285 for (i = 0; i < cluster->nchains; ++i) {
286 chain = cluster->array[i];
288 hammer2_chain_drop(chain);
/* On the final ref, clear the slot so the array holds no stale pointers. */
289 if (cluster->refs == 1)
290 cluster->array[i] = NULL;
/* Last ref gone: release the cluster structure itself. */
293 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
294 cluster->focus = NULL;
295 kfree(cluster, M_HAMMER2);
296 /* cluster = NULL; safety */
/* Brief sleep keyed on the focus chain (one-tick polling wait). */
301 hammer2_cluster_wait(hammer2_cluster_t *cluster)
303 tsleep(cluster->focus, 0, "h2clcw", 1);
307 * Lock and ref a cluster. This adds a ref to the cluster and its chains
308 * and then locks them.
311 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
313 hammer2_chain_t *chain;
318 atomic_add_int(&cluster->refs, 1);
319 for (i = 0; i < cluster->nchains; ++i) {
320 chain = cluster->array[i];
322 error = hammer2_chain_lock(chain, how);
/*
 * NOTE(review): on a chain lock error the already-locked chains are
 * unwound and the cluster ref is dropped — confirm against full source.
 */
325 hammer2_chain_unlock(cluster->array[i]);
326 atomic_add_int(&cluster->refs, -1);
335 * Replace the contents of dst with src, adding a reference to src's chains.
336 * dst is assumed to already have a ref and any chains present in dst are
337 * assumed to be locked and will be unlocked.
339 * If the chains in src are locked, only one of (src) or (dst) should be
340 * considered locked by the caller after return, not both.
343 hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src)
345 hammer2_chain_t *chain;
348 KKASSERT(dst->refs == 1);
/* Transfer src's chains into dst, unlocking any chain being displaced. */
351 for (i = 0; i < src->nchains; ++i) {
352 chain = src->array[i];
354 hammer2_chain_ref(chain);
355 if (i < dst->nchains && dst->array[i])
356 hammer2_chain_unlock(dst->array[i]);
357 dst->array[i] = chain;
358 if (dst->focus == NULL)
/* Unlock and clear any leftover dst slots beyond src's chain count. */
362 while (i < dst->nchains) {
363 chain = dst->array[i];
365 hammer2_chain_unlock(chain);
366 dst->array[i] = NULL;
370 dst->nchains = src->nchains;
374 * Replace the contents of the locked destination with the contents of the
375 * locked source. Destination must have one ref.
377 * Returns with the destination still with one ref and the copied chains
378 * with an additional lock (representing their state on the destination).
379 * The original chains associated with the destination are unlocked.
382 hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
384 hammer2_chain_t *chain;
387 KKASSERT(dst->refs == 1);
/* Take an extra lock on each src chain before installing it into dst. */
390 for (i = 0; i < src->nchains; ++i) {
391 chain = src->array[i];
393 hammer2_chain_lock(chain, 0);
394 if (i < dst->nchains && dst->array[i])
395 hammer2_chain_unlock(dst->array[i]);
396 dst->array[i] = src->array[i];
397 if (dst->focus == NULL)
/* Unlock and clear any leftover dst slots beyond src's chain count. */
401 while (i < dst->nchains) {
402 chain = dst->array[i];
404 hammer2_chain_unlock(chain);
405 dst->array[i] = NULL;
409 dst->nchains = src->nchains;
413 * Copy a cluster, returning a ref'd cluster. All underlying chains
414 * are also ref'd, but not locked.
416 * If HAMMER2_CLUSTER_COPY_CHAINS is specified, the chains are copied
417 * to the new cluster and a reference is nominally added to them and to
418 * the cluster. The cluster will have 1 ref.
420 * If HAMMER2_CLUSTER_COPY_NOREF is specified along with CHAINS, the chains
421 * are copied but no additional references are made and the cluster will have
422 * 0 refs. Callers must ref the cluster and the chains before dropping it
423 * (typically by locking it).
425 * If flags are passed as 0 the copy is setup as if it contained the chains
426 * but the chains will not be copied over, and the cluster will have 0 refs.
427 * Callers must ref the cluster before dropping it (typically by locking it).
430 hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags)
432 hammer2_pfsmount_t *pmp = ocluster->pmp;
433 hammer2_cluster_t *ncluster;
434 hammer2_chain_t *chain;
437 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
439 ncluster->nchains = ocluster->nchains;
/* NOREF leaves the new cluster with 0 refs; otherwise it starts with 1. */
440 ncluster->refs = (copy_flags & HAMMER2_CLUSTER_COPY_NOREF) ? 0 : 1;
441 if ((copy_flags & HAMMER2_CLUSTER_COPY_NOCHAINS) == 0) {
442 ncluster->focus = ocluster->focus;
443 for (i = 0; i < ocluster->nchains; ++i) {
444 chain = ocluster->array[i];
445 ncluster->array[i] = chain;
446 if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0 &&
448 hammer2_chain_ref(chain);
456 * Unlock and deref a cluster. The cluster is destroyed if this is the
460 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
462 hammer2_chain_t *chain;
465 KKASSERT(cluster->refs > 0);
466 for (i = 0; i < cluster->nchains; ++i) {
467 chain = cluster->array[i];
469 hammer2_chain_unlock(chain);
470 if (cluster->refs == 1)
471 cluster->array[i] = NULL; /* safety */
/* Last ref gone: release the cluster structure itself. */
474 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
475 cluster->focus = NULL;
476 kfree(cluster, M_HAMMER2);
477 /* cluster = NULL; safety */
482 * Resize the cluster's physical storage allocation in-place. This may
483 * replace the cluster's chains.
486 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
487 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
488 int nradix, int flags)
492 KKASSERT(cparent->pmp == cluster->pmp); /* can be NULL */
493 KKASSERT(cparent->nchains == cluster->nchains);
/* The focus is re-derived from the first surviving chain after resize. */
495 cluster->focus = NULL;
496 for (i = 0; i < cluster->nchains; ++i) {
497 if (cluster->array[i]) {
498 KKASSERT(cparent->array[i]);
499 hammer2_chain_resize(trans, ip,
503 if (cluster->focus == NULL)
504 cluster->focus = cluster->array[i];
510 * Set an inode's cluster modified, marking the related chains RW and
511 * duplicating them if necessary.
513 * The passed-in chain is a localized copy of the chain previously acquired
514 * when the inode was locked (and possibly replaced in the mean time), and
515 * must also be updated. In fact, we update it first and then synchronize
516 * the inode's cluster cache.
518 hammer2_inode_data_t *
519 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
520 hammer2_cluster_t *cluster, int flags)
/* Flag the inode modified, make the chains writable, then repoint. */
522 atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
523 hammer2_cluster_modify(trans, cluster, flags);
525 hammer2_inode_repoint(ip, NULL, cluster);
/* Return writable inode data from the (now-modifiable) focus chain. */
528 return (&hammer2_cluster_wdata(cluster)->ipdata);
532 * Adjust the cluster's chains to allow modification.
535 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
/* Re-derive the focus from the first chain surviving the modify pass. */
540 cluster->focus = NULL;
541 for (i = 0; i < cluster->nchains; ++i) {
542 if (cluster->array[i]) {
543 hammer2_chain_modify(trans, cluster->array[i], flags);
544 if (cluster->focus == NULL)
545 cluster->focus = cluster->array[i];
551 * Synchronize modifications with other chains in a cluster.
553 * Nominal front-end operations only edit non-block-table data in a single
554 * chain. This code copies such modifications to the other chains in the
555 * cluster. Blocktable modifications are handled on a chain-by-chain basis
556 * by both the frontend and the backend and will explode in fireworks if
560 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
562 hammer2_chain_t *focus;
563 hammer2_chain_t *scan;
564 const hammer2_inode_data_t *ripdata;
565 hammer2_inode_data_t *wipdata;
568 focus = cluster->focus;
569 KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
/* Copy the focus chain's edits to every other (modified) chain. */
571 for (i = 0; i < cluster->nchains; ++i) {
572 scan = cluster->array[i];
573 if (scan == NULL || scan == focus)
575 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
576 KKASSERT(focus->bytes == scan->bytes &&
577 focus->bref.type == scan->bref.type);
578 switch(focus->bref.type) {
579 case HAMMER2_BREF_TYPE_INODE:
580 ripdata = &focus->data->ipdata;
581 wipdata = &scan->data->ipdata;
/*
 * Copy only up to the embedded union (u); DIRECTDATA inodes carry
 * file data there which is handled separately.
 */
582 if ((ripdata->op_flags &
583 HAMMER2_OPFLAG_DIRECTDATA) == 0) {
584 bcopy(ripdata, wipdata,
585 offsetof(hammer2_inode_data_t, u));
589 case HAMMER2_BREF_TYPE_DATA:
590 bcopy(focus->data, scan->data, focus->bytes);
/* Blocktable-bearing types must never be synced this way. */
592 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
593 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
594 case HAMMER2_BREF_TYPE_FREEMAP:
595 case HAMMER2_BREF_TYPE_VOLUME:
596 panic("hammer2_cluster_modsync: illegal node type");
600 panic("hammer2_cluster_modsync: unknown node type");
607 * Lookup initialization/completion API
610 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
612 hammer2_cluster_t *cluster;
615 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
616 cluster->pmp = cparent->pmp; /* can be NULL */
617 /* cluster->focus = NULL; already null */
/* Shadow the parent's chain array; first chain becomes the focus. */
619 for (i = 0; i < cparent->nchains; ++i) {
620 cluster->array[i] = cparent->array[i];
621 if (cluster->focus == NULL)
622 cluster->focus = cluster->array[i];
624 cluster->nchains = cparent->nchains;
627 * Independently lock (this will also give cluster 1 ref)
629 if (flags & HAMMER2_LOOKUP_SHARED) {
630 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
631 HAMMER2_RESOLVE_SHARED);
633 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
/* Terminate a lookup sequence by unlocking the parent cluster. */
639 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
642 hammer2_cluster_unlock(cparent);
646 * Locate first match or overlap under parent, return a new cluster
649 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
650 hammer2_key_t key_beg, hammer2_key_t key_end,
651 int flags, int *ddflagp)
653 hammer2_pfsmount_t *pmp;
654 hammer2_cluster_t *cluster;
655 hammer2_chain_t *chain;
656 hammer2_key_t key_accum;
657 hammer2_key_t key_next;
658 hammer2_key_t bref_key;
666 pmp = cparent->pmp; /* can be NULL */
667 key_accum = *key_nextp;
674 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
675 cluster->pmp = pmp; /* can be NULL */
677 /* cluster->focus = NULL; already null */
678 cparent->focus = NULL;
/* Perform the per-node chain lookup on each element of the parent. */
681 for (i = 0; i < cparent->nchains; ++i) {
682 key_next = *key_nextp;
683 if (cparent->array[i] == NULL) {
687 chain = hammer2_chain_lookup(&cparent->array[i], &key_next,
689 &cparent->cache_index[i],
691 if (cparent->focus == NULL)
692 cparent->focus = cparent->array[i];
693 cluster->array[i] = chain;
/*
 * First hit establishes the expected bref identity; subsequent hits
 * must match it exactly (nodes are supposed to mirror each other).
 */
697 if (cluster->focus == NULL) {
698 bref_type = chain->bref.type;
699 bref_key = chain->bref.key;
700 bref_keybits = chain->bref.keybits;
701 bytes = chain->bytes;
703 cluster->focus = chain;
705 KKASSERT(bref_type == chain->bref.type);
706 KKASSERT(bref_key == chain->bref.key);
707 KKASSERT(bref_keybits == chain->bref.keybits);
708 KKASSERT(bytes == chain->bytes);
709 KKASSERT(*ddflagp == ddflag);
/* Track the most restrictive (smallest) next-key across all nodes. */
711 if (key_accum > key_next)
712 key_accum = key_next;
714 *key_nextp = key_accum;
715 cluster->nchains = i;
/* All-NULL result means nothing was found; discard the cluster. */
717 if (null_count == i) {
718 hammer2_cluster_drop(cluster);
726 * Locate next match or overlap under parent, replace cluster
729 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
730 hammer2_key_t *key_nextp,
731 hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
733 hammer2_chain_t *chain;
734 hammer2_key_t key_accum;
735 hammer2_key_t key_next;
739 key_accum = *key_nextp;
741 cluster->focus = NULL;
742 cparent->focus = NULL;
744 for (i = 0; i < cparent->nchains; ++i) {
745 key_next = *key_nextp;
746 chain = cluster->array[i];
748 if (cparent->focus == NULL)
749 cparent->focus = cparent->array[i];
/*
 * Parent slot disappeared: release the old child chain (drop when
 * NOLOCK, otherwise unlock) and skip this node.
 */
753 if (cparent->array[i] == NULL) {
754 if (flags & HAMMER2_LOOKUP_NOLOCK)
755 hammer2_chain_drop(chain);
757 hammer2_chain_unlock(chain);
/* Advance to the next chain under this node's parent. */
761 chain = hammer2_chain_next(&cparent->array[i], chain,
762 &key_next, key_beg, key_end,
763 &cparent->cache_index[i], flags);
764 if (cparent->focus == NULL)
765 cparent->focus = cparent->array[i];
766 cluster->array[i] = chain;
769 } else if (cluster->focus == NULL) {
770 cluster->focus = chain;
/* Track the most restrictive (smallest) next-key across all nodes. */
772 if (key_accum > key_next)
773 key_accum = key_next;
/* All-NULL result means iteration is exhausted; discard the cluster. */
776 if (null_count == i) {
777 hammer2_cluster_drop(cluster);
785 * XXX initial NULL cluster needs reworking (pass **clusterp ?)
787 * The raw scan function is similar to lookup/next but does not seek to a key.
788 * Blockrefs are iterated via first_chain = (parent, NULL) and
789 * next_chain = (parent, chain).
791 * The passed-in parent must be locked and its data resolved. The returned
792 * chain will be locked. Pass chain == NULL to acquire the first sub-chain
793 * under parent and then iterate with the passed-in chain (which this
794 * function will unlock).
797 hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
800 hammer2_chain_t *chain;
806 for (i = 0; i < cparent->nchains; ++i) {
807 chain = cluster->array[i];
/*
 * Parent slot disappeared: release the old child chain (drop when
 * NOLOCK, otherwise unlock) and skip this node.
 */
812 if (cparent->array[i] == NULL) {
813 if (flags & HAMMER2_LOOKUP_NOLOCK)
814 hammer2_chain_drop(chain);
816 hammer2_chain_unlock(chain);
/* Raw per-node scan; no key seek involved. */
821 chain = hammer2_chain_scan(cparent->array[i], chain,
822 &cparent->cache_index[i], flags);
823 cluster->array[i] = chain;
/* All-NULL result means the scan is exhausted; discard the cluster. */
828 if (null_count == i) {
829 hammer2_cluster_drop(cluster);
838 * Create a new cluster using the specified key
841 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
842 hammer2_cluster_t **clusterp,
843 hammer2_key_t key, int keybits,
844 int type, size_t bytes, int flags)
846 hammer2_cluster_t *cluster;
847 hammer2_pfsmount_t *pmp;
851 pmp = trans->pmp; /* can be NULL */
/* Allocate a fresh cluster unless the caller supplied one via *clusterp. */
853 if ((cluster = *clusterp) == NULL) {
854 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
856 cluster->pmp = pmp; /* can be NULL */
859 cluster->focus = NULL;
860 cparent->focus = NULL;
863 * NOTE: cluster->array[] entries can initially be NULL. If
864 * *clusterp is supplied, skip NULL entries, otherwise
867 for (i = 0; i < cparent->nchains; ++i) {
868 if (*clusterp && cluster->array[i] == NULL) {
869 if (cparent->focus == NULL)
870 cparent->focus = cparent->array[i];
/* Create the chain on this node under the matching parent slot. */
873 error = hammer2_chain_create(trans, &cparent->array[i],
874 &cluster->array[i], pmp,
877 KKASSERT(error == 0);
878 if (cparent->focus == NULL)
879 cparent->focus = cparent->array[i];
880 if (cluster->focus == NULL)
881 cluster->focus = cluster->array[i];
883 cluster->nchains = i;
890 * Rename a cluster to a new parent.
892 * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields
893 * are used from a passed-in non-NULL bref pointer. All other fields
894 * are extracted from the original chain for each chain in the
898 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
899 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
902 hammer2_chain_t *chain;
903 hammer2_blockref_t xbref;
906 cluster->focus = NULL;
907 cparent->focus = NULL;
909 for (i = 0; i < cluster->nchains; ++i) {
910 chain = cluster->array[i];
/* Per the WARNING above: only key/keybits are taken from *bref. */
914 xbref.key = bref->key;
915 xbref.keybits = bref->keybits;
916 hammer2_chain_rename(trans, &xbref,
/* NULL bref: keep the chain's existing key identity. */
920 hammer2_chain_rename(trans, NULL,
924 cluster->array[i] = chain;
925 if (cluster->focus == NULL)
926 cluster->focus = chain;
927 if (cparent->focus == NULL)
928 cparent->focus = cparent->array[i];
930 if (cparent->focus == NULL)
931 cparent->focus = cparent->array[i];
937 * Mark a cluster deleted
940 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
941 hammer2_cluster_t *cluster, int flags)
943 hammer2_chain_t *chain;
944 hammer2_chain_t *parent;
947 if (cparent == NULL) {
948 kprintf("cparent is NULL\n");
952 for (i = 0; i < cluster->nchains; ++i) {
/* Parent slot may be absent when the parent cluster is shorter. */
953 parent = (i < cparent->nchains) ? cparent->array[i] : NULL;
954 chain = cluster->array[i];
/* Diagnostic: the chain should be parented under the matching slot. */
957 if (chain->parent != parent) {
958 kprintf("hammer2_cluster_delete: parent "
959 "mismatch chain=%p parent=%p against=%p\n",
960 chain, chain->parent, parent);
962 hammer2_chain_delete(trans, parent, chain, flags);
968 * Create a snapshot of the specified {parent, ochain} with the specified
969 * label. The originating hammer2_inode must be exclusively locked for
972 * The ioctl code has already synced the filesystem.
975 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
976 hammer2_ioc_pfs_t *pfs)
978 hammer2_mount_t *hmp;
979 hammer2_cluster_t *ncluster;
980 const hammer2_inode_data_t *ipdata;
981 hammer2_inode_data_t *wipdata;
982 hammer2_inode_t *nip;
990 kprintf("snapshot %s\n", pfs->name);
992 name_len = strlen(pfs->name);
993 lhc = hammer2_dirhash(pfs->name, name_len);
/* Remember the original PFS cluster id before creating the snapshot. */
995 ipdata = &hammer2_cluster_data(ocluster)->ipdata;
996 opfs_clid = ipdata->pfs_clid;
997 hmp = ocluster->focus->hmp;
1000 * Create the snapshot directory under the super-root
1002 * Set PFS type, generate a unique filesystem id, and generate
1003 * a cluster id. Use the same clid when snapshotting a PFS root,
1004 * which theoretically allows the snapshot to be used as part of
1005 * the same cluster (perhaps as a cache).
1007 * Copy the (flushed) blockref array. Theoretically we could use
1008 * chain_duplicate() but it becomes difficult to disentangle
1009 * the shared core so for now just brute-force it.
1015 nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1016 proc0.p_ucred, pfs->name, name_len,
1020 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1021 wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
1022 kern_uuidgen(&wipdata->pfs_fsid, 1);
/* PFS-root snapshots reuse the original clid (see comment above). */
1023 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1024 wipdata->pfs_clid = opfs_clid;
1026 kern_uuidgen(&wipdata->pfs_clid, 1);
1028 for (i = 0; i < ncluster->nchains; ++i) {
1029 if (ncluster->array[i]) {
1030 ncluster->array[i]->bref.flags |=
1031 HAMMER2_BREF_FLAG_PFSROOT;
1035 /* XXX can't set this unless we do an explicit flush, which
1036 we also need a pmp assigned to do, else the flush code
1037 won't flush ncluster because it thinks it is crossing a
1039 hammer2_cluster_set_chainflags(ncluster,
1040 HAMMER2_CHAIN_PFSBOUNDARY);
1043 /* XXX hack blockset copy */
1044 /* XXX doesn't work with real cluster */
1045 KKASSERT(ocluster->nchains == 1);
1046 wipdata->u.blockset = ocluster->focus->data->ipdata.u.blockset;
1047 hammer2_cluster_modsync(ncluster);
1048 for (i = 0; i < ncluster->nchains; ++i) {
1049 if (ncluster->array[i])
1050 hammer2_flush(trans, ncluster->array[i]);
1052 hammer2_inode_unlock_ex(nip, ncluster);
1058 * Return locked parent cluster given a locked child. The child remains
1062 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1064 hammer2_cluster_t *cparent;
1067 cparent = hammer2_cluster_copy(cluster, HAMMER2_CLUSTER_COPY_NOCHAINS);
1068 for (i = 0; i < cluster->nchains; ++i) {
1069 hammer2_chain_t *chain;
1070 hammer2_chain_t *rchain;
1072 chain = cluster->array[i];
1075 hammer2_chain_ref(chain);
/*
 * Lock-order dance: the parent must be locked before the child, so
 * unlock the child, lock the parent, relock the child, then verify
 * the parent did not change underneath us; retry until stable.
 */
1076 while ((rchain = chain->parent) != NULL) {
1077 hammer2_chain_ref(rchain);
1078 hammer2_chain_unlock(chain);
1079 hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1080 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1081 hammer2_chain_drop(rchain);
1082 if (chain->parent == rchain)
1084 hammer2_chain_unlock(rchain);
1086 hammer2_chain_drop(chain);
1087 cparent->array[i] = rchain;
1092 /************************************************************************
1094 ************************************************************************
1096 * A node failure can occur for numerous reasons.
1098 * - A read I/O may fail
1099 * - A write I/O may fail
1100 * - An unexpected chain might be found (or be missing)
1101 * - A node might disconnect temporarily and reconnect later
1102 * (for example, a USB stick could get pulled, or a node might
1103 * be programmatically disconnected).
1104 * - A node might run out of space during a modifying operation.
1106 * When a read failure or an unexpected chain state is found, the chain and
1107 * parent chain at the failure point for the nodes involved (the nodes
1108 * which we determine to be in error) are flagged as failed and removed
1109 * from the cluster. The node itself is allowed to remain active. The
1110 * highest common point (usually a parent chain) is queued to the
1111 * resynchronization thread for action.
1113 * When a write I/O fails or a node runs out of space, we first adjust
1114 * as if a read failure occurs but we further disable flushes on the
1115 * ENTIRE node. Concurrent modifying transactions are allowed to complete
1116 * but any new modifying transactions will automatically remove the node
1117 * from consideration in all related cluster structures and not generate
1118 * any new modified chains. The ROOT chain for the failed node(s) is queued
1119 * to the resynchronization thread for action.
1121 * A temporary disconnect is handled as if a write failure occurred.
1123 * Any of these failures might or might not stall related high level VNOPS,
1124 * depending on what has failed, what nodes remain, the type of cluster,
1125 * and the operating state of the cluster.
1127 * FLUSH ON WRITE-DISABLED NODES
1129 * A flush on a write-disabled node is not allowed to write anything because
1130 * we cannot safely update the mirror_tid anywhere on the failed node. The
1131 * synchronization thread uses mirror_tid to calculate incremental resyncs.
1132 * Dirty meta-data related to the failed node is thrown away.
1134 * Dirty buffer cache buffers and inodes are only thrown away if they can be
1135 * retired... that is, if the filesystem still has enough nodes to complete
1139 /************************************************************************
1140 * SYNCHRONIZATION THREAD *
1141 ************************************************************************
1143 * This thread is responsible for [re]synchronizing the cluster representing
1144 * a PFS. Any out-of-sync or failed node starts this thread on a
1145 * node-by-node basis when the failure is detected.
1147 * Clusters needing resynchronization are queued at the highest point
1148 * where the parent on the failed node is still valid, or a special
1149 * incremental scan from the ROOT is queued if no parent exists. This
1150 * thread is also responsible for waiting for reconnections of the failed
1151 * node if the cause was due to a disconnect, and waiting for space to be
1152 * freed up if the cause was due to running out of space.
1154 * If the cause is due to a node running out of space, this thread will also
1155 * remove older (unlocked) snapshots to make new space, recover space, and
1156 * then start resynchronization.
1158 * Each resynchronization pass virtually snapshots the PFS on the good nodes
1159 * and synchronizes using that snapshot against the target node. This
1160 * ensures a consistent chain topology and also avoids interference between
1161 * the resynchronization thread and frontend operations.
1163 * Since these are per-node threads it is possible to resynchronize several