/*
 * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * The cluster module collects multiple chains representing the same
 * information from different nodes into a single entity.  It allows direct
 * access to media data as long as it is not blockref array data (which
 * will obviously have to be different at each node).
 *
 * This module also handles I/O dispatch, status rollup, and various
 * mastership arrangements including quorum operations.  It effectively
 * presents one topology to the vnops layer.
 *
 * Many of the API calls mimic chain API calls but operate on clusters
 * instead of chains.  Please see hammer2_chain.c for more complete code
 * documentation of the API functions.
 *
 * WARNING! This module is *extremely* complex.  It must issue asynchronous
 *          locks and I/O, do quorum and/or master-slave processing, and
 *          it must operate properly even if some nodes are broken (which
 *          can also mean indefinite locks).
 *
 *                              CLUSTER OPERATIONS
 *
 * Cluster operations can be broken down into three pieces:
 *
 * (1) Chain locking and data retrieval.
 *              hammer2_cluster_lock()
 *              hammer2_cluster_parent()
 *
 *      - Most complex functions, quorum management on transaction ids.
 *
 *      - Locking and data accesses must be internally asynchronous.
 *
 *      - Validate and manage cache coherency primitives (cache state
 *        is stored in chain topologies but must be validated by these
 *        functions).
 *
 * (2) Lookups and Scans
 *              hammer2_cluster_lookup()
 *              hammer2_cluster_next()
 *
 *      - Depend on locking & data retrieval functions, but still complex.
 *
 *      - Must do quorum management on transaction ids.
 *
 *      - Lookup and iteration ops must be internally asynchronous.
 *
 * (3) Modifying Operations
 *              hammer2_cluster_create()
 *              hammer2_cluster_rename()
 *              hammer2_cluster_delete()
 *              hammer2_cluster_modify()
 *              hammer2_cluster_modsync()
 *
 *      - Can usually punt on failures; the operation continues unless
 *        quorum is lost.  If quorum is lost, must wait for
 *        resynchronization (depending on the management mode).
 *
 *      - Must disconnect the node on failures (and stop flushing to it),
 *        remount, and resynchronize.
 *
 *      - Network links (via kdmsg) are relatively easy to issue as the
 *        complex underworkings of hammer2_chain.c don't have to be messed
 *        with (the protocol is at a higher level than block-level).
 *
 *      - Multiple local disk nodes (i.e. block devices) are another matter.
 *        Chain operations have to be dispatched to per-node threads (xN)
 *        because we can't asynchronize potentially very complex chain
 *        operations in hammer2_chain.c (it would be a huge mess).
 *
 *        (these threads are also used to terminate incoming kdmsg ops from
 *        other machines).
 *
 *      - Single-node filesystems do not use threads and will simply call
 *        hammer2_chain.c functions directly.  This short-cut is handled
 *        at the base of each cluster function.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

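/*
 * Illustrative sketch (kept under #if 0, not compiled): the nominal
 * single-threaded read path a front-end caller follows against an
 * already-referenced cluster, tying together operation classes (1) and
 * (3) described above.  The caller-supplied cluster and the flag
 * combination are hypothetical; see the individual functions below for
 * the authoritative semantics.
 */
#if 0
static void
example_cluster_read_path(hammer2_cluster_t *cluster)
{
        const hammer2_media_data_t *data;

        /*
         * Lock and resolve data.  The focus is selected by the quorum
         * rules in hammer2_cluster_lock(); on error the cluster is left
         * neither locked nor ref'd.
         */
        if (hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
                                          HAMMER2_RESOLVE_SHARED) != 0)
                return;

        /* Universal (non-blockref) media data is accessed via the focus. */
        data = hammer2_cluster_rdata(cluster);
        (void)data;

        /* Unlock; the ref acquired by the lock is dropped as well. */
        hammer2_cluster_unlock(cluster);
}
#endif
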
/*
 * Returns TRUE if any chain in the cluster needs to be resized.
 */
int
hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
{
        hammer2_chain_t *chain;
        int i;

        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain && chain->bytes != bytes)
                        return 1;
        }
        return 0;
}

uint8_t
hammer2_cluster_type(hammer2_cluster_t *cluster)
{
        return(cluster->focus->bref.type);
}

int
hammer2_cluster_modified(hammer2_cluster_t *cluster)
{
        return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
}

/*
 * Return a bref representative of the cluster.  Any data offset is removed
 * (since it would only be applicable to a particular chain in the cluster).
 *
 * However, the radix portion of data_off is used for many purposes and will
 * be retained.
 */
void
hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
{
        *bref = cluster->focus->bref;
        bref->data_off &= HAMMER2_OFF_MASK_RADIX;
}

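/*
 * (Illustrative note: the low bits of data_off retained by
 * HAMMER2_OFF_MASK_RADIX encode the allocation radix, i.e. the allocation
 * size is (1 << radix), so for example a retained radix of 10 describes
 * a 1KB allocation and 16 describes a 64KB one.)
 */
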
/*
 * Return non-zero if the chain representing an inode has been flagged
 * as having been unlinked.  Allows the vnode reclaim to avoid loading
 * the inode data from disk, e.g. when unmounting or recycling old, clean
 * vnodes.
 */
int
hammer2_cluster_isunlinked(hammer2_cluster_t *cluster)
{
        hammer2_chain_t *chain;
        int flags;
        int i;

        flags = 0;
        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain)
                        flags |= chain->flags;
        }
        return (flags & HAMMER2_CHAIN_UNLINKED);
}

void
hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
{
        hammer2_chain_t *chain;
        int i;

        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain)
                        atomic_set_int(&chain->flags, flags);
        }
}

void
hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
{
        hammer2_chain_t *chain;
        int i;

        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain)
                        atomic_clear_int(&chain->flags, flags);
        }
}

void
hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
{
        hammer2_chain_t *chain;
        int i;

        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain)
                        hammer2_chain_setflush(trans, chain);
        }
}

void
hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
                                hammer2_cluster_t *cluster,
                                int check_algo)
{
        hammer2_chain_t *chain;
        int i;

        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain) {
                        KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
                        chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
                        chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
                }
        }
}

/*
 * Create a cluster with one ref from the specified chain.  The chain
 * is not further referenced.  The caller typically supplies a locked
 * chain and transfers ownership to the cluster.
 *
 * The returned cluster will be focused on the chain (strictly speaking,
 * the focus should be NULL if the chain is not locked, but we do not check
 * for this condition).
 *
 * We fake the flags.
 */
hammer2_cluster_t *
hammer2_cluster_from_chain(hammer2_chain_t *chain)
{
        hammer2_cluster_t *cluster;

        cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
        cluster->array[0].chain = chain;
        cluster->nchains = 1;
        cluster->focus = chain;
        cluster->pmp = chain->pmp;
        cluster->refs = 1;
        cluster->flags = HAMMER2_CLUSTER_LOCKED |
                         HAMMER2_CLUSTER_WRHARD |
                         HAMMER2_CLUSTER_RDHARD |
                         HAMMER2_CLUSTER_MSYNCED |
                         HAMMER2_CLUSTER_SSYNCED;

        return cluster;
}

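/*
 * Illustrative sketch (not compiled): wrapping a single, already-locked
 * chain in a degenerate one-chain cluster and releasing it again.  The
 * chain argument is hypothetical; per the comment above, ownership of the
 * caller's lock transfers to the cluster, so the final
 * hammer2_cluster_unlock() both unlocks the chain and frees the cluster
 * (last ref).
 */
#if 0
static void
example_cluster_from_chain(hammer2_chain_t *locked_chain)
{
        hammer2_cluster_t *cluster;

        cluster = hammer2_cluster_from_chain(locked_chain);
        /* ... operate on the cluster via the normal cluster API ... */
        hammer2_cluster_unlock(cluster);
}
#endif
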
#if 0
/*
 * Allocates a cluster and its underlying chain structures.  The underlying
 * chains will be locked.  The cluster and underlying chains will have one
 * ref and will be focused on the first chain.
 *
 * XXX focus on first chain.
 */
hammer2_cluster_t *
hammer2_cluster_alloc(hammer2_pfs_t *pmp,
                      hammer2_trans_t *trans, hammer2_blockref_t *bref)
{
        hammer2_cluster_t *cluster;
        hammer2_cluster_t *rcluster;
        hammer2_chain_t *chain;
        hammer2_chain_t *rchain;
#if 0
        u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
#endif
        int i;

        KKASSERT(pmp != NULL);

        /*
         * Construct the appropriate system structure.
         */
        switch(bref->type) {
        case HAMMER2_BREF_TYPE_INODE:
        case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
        case HAMMER2_BREF_TYPE_DATA:
        case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
                /*
                 * Chains are really only associated with the hmp but we
                 * maintain a pmp association for per-mount memory tracking
                 * purposes.  The pmp can be NULL.
                 */
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
        case HAMMER2_BREF_TYPE_FREEMAP:
                chain = NULL;
                panic("hammer2_cluster_alloc volume type illegal for op");
        default:
                chain = NULL;
                panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
                      bref->type);
        }

        cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
        cluster->refs = 1;
        cluster->flags = HAMMER2_CLUSTER_LOCKED;

        rcluster = &pmp->iroot->cluster;
        for (i = 0; i < rcluster->nchains; ++i) {
                rchain = rcluster->array[i].chain;
                chain = hammer2_chain_alloc(rchain->hmp, pmp, trans, bref);
#if 0
                chain->hmp = rchain->hmp;
                chain->bref = *bref;
                chain->bytes = bytes;
                chain->refs = 1;
                chain->flags |= HAMMER2_CHAIN_ALLOCATED;
#endif

                /*
                 * NOTE: When loading a chain from backing store or creating a
                 *       snapshot, trans will be NULL and the caller is
                 *       responsible for setting these fields.
                 */
                cluster->array[i].chain = chain;
        }
        cluster->nchains = i;
        cluster->pmp = pmp;
        cluster->focus = cluster->array[0].chain;

        return (cluster);
}
#endif

/*
 * Add a reference to a cluster.
 *
 * We must also ref the underlying chains in order to allow ref/unlock
 * sequences to later re-lock.
 */
void
hammer2_cluster_ref(hammer2_cluster_t *cluster)
{
        hammer2_chain_t *chain;
        int i;

        atomic_add_int(&cluster->refs, 1);
        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain)
                        hammer2_chain_ref(chain);
        }
}

/*
 * Drop the caller's reference to the cluster.  When the ref count drops to
 * zero this function frees the cluster and drops all underlying chains.
 *
 * In-progress read I/Os are typically detached from the cluster once the
 * first one returns (the remaining stay attached to the DIOs but are then
 * ignored and drop naturally).
 */
void
hammer2_cluster_drop(hammer2_cluster_t *cluster)
{
        hammer2_chain_t *chain;
        int i;

        KKASSERT(cluster->refs > 0);
        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain) {
                        hammer2_chain_drop(chain);
                        if (cluster->refs == 1)
                                cluster->array[i].chain = NULL;
                }
        }
        if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
                cluster->focus = NULL;          /* safety XXX chg to assert */
                kfree(cluster, M_HAMMER2);
                /* cluster is invalid */
        }
}

void
hammer2_cluster_wait(hammer2_cluster_t *cluster)
{
        tsleep(cluster->focus, 0, "h2clcw", 1);
}

/*
 * Lock and ref a cluster.  This adds a ref to the cluster and its chains
 * and then locks them.
 *
 * The act of locking a cluster sets its focus if not already set.
 *
 * The chains making up the cluster may be narrowed down based on quorum
 * acceptability, and if RESOLVE_RDONLY is specified the chains can be
 * narrowed down to a single chain as long as the entire subtopology is known
 * to be intact.  So, for example, we can narrow a read-only op to a single
 * fast SLAVE but if we focus a CACHE chain we must still retain at least
 * a SLAVE to ensure that the subtopology can be accessed.
 *
 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
 * to be maintained once the topology is validated as-of the top level of
 * the operation.
 *
 * If a failure occurs the operation must be aborted by higher-level code and
 * retried. XXX
 */
int
hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
{
        hammer2_chain_t *chain;
        hammer2_pfs_t *pmp;
        uint32_t nflags;
        hammer2_tid_t quorum_tid;
        int focus_pfs_type;
        int nmasters;
        int ttlslaves;
        int ttlmasters;
        int nslaves;
        int nquorum;
        int error;
        int i;

        pmp = cluster->pmp;
        KKASSERT(pmp != NULL || cluster->nchains == 0);

        /* cannot be on inode-embedded cluster template, must be on copy */
        KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
        if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
                kprintf("hammer2_cluster_lock: cluster %p already locked!\n",
                        cluster);
        } else {
                KKASSERT(cluster->focus == NULL);
        }
        atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);

        focus_pfs_type = 0;
        quorum_tid = 0;
        nflags = 0;
        ttlslaves = 0;
        ttlmasters = 0;
        nslaves = 0;
        nmasters = 0;

        /*
         * Calculate the quorum count.
         */
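        /*
         * (Illustrative arithmetic: with pfs_nmasters == 3 the quorum is 2,
         *  with 5 it is 3, and a single-master PFS degenerates to a quorum
         *  of 1.)
         */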
        nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;

        if ((how & HAMMER2_RESOLVE_NOREF) == 0)
                atomic_add_int(&cluster->refs, 1);

        /*
         * Lock chains and accumulate statistics.
         */
        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain == NULL)
                        continue;
                error = hammer2_chain_lock(chain, how);
                if (error) {
                        kprintf("hammer2_cluster_lock: cluster %p index %d "
                                "lock failure\n", cluster, i);
                        cluster->array[i].chain = NULL;
                        hammer2_chain_drop(chain);
                        continue;
                }

                /*
                 * We can resolve some things on this array pass but other
                 * things will require a full-accounting of available
                 * masters.
                 */
                switch (cluster->pmp->pfs_types[i]) {
                case HAMMER2_PFSTYPE_MASTER:
                        ++ttlmasters;
                        if (quorum_tid < chain->bref.mirror_tid ||
                            nmasters == 0) {
                                nmasters = 1;
                                quorum_tid = chain->bref.mirror_tid;
                        } else if (quorum_tid == chain->bref.mirror_tid) {
                                ++nmasters;
                        }
                        break;
                case HAMMER2_PFSTYPE_SLAVE:
                        ++ttlslaves;
                        break;
                case HAMMER2_PFSTYPE_SOFT_MASTER:
                        nflags |= HAMMER2_CLUSTER_WRSOFT;
                        nflags |= HAMMER2_CLUSTER_RDSOFT;
                        break;
                case HAMMER2_PFSTYPE_SOFT_SLAVE:
                        nflags |= HAMMER2_CLUSTER_RDSOFT;
                        break;
                case HAMMER2_PFSTYPE_SUPROOT:
                        /*
                         * Degenerate cluster representing the super-root
                         * topology on a single device.
                         */
                        nflags |= HAMMER2_CLUSTER_WRHARD;
                        nflags |= HAMMER2_CLUSTER_RDHARD;
                        cluster->focus = chain;
                        break;
                default:
                        break;
                }
        }

        /*
         * Pass 2 - accumulate statistics.
         */
        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain == NULL)
                        continue;
                switch (cluster->pmp->pfs_types[i]) {
                case HAMMER2_PFSTYPE_MASTER:
                        /*
                         * We must have enough up-to-date masters to reach
                         * a quorum and the master mirror_tid must match
                         * the quorum's mirror_tid.
                         */
                        if (nmasters >= nquorum &&
                            quorum_tid == chain->bref.mirror_tid) {
                                nflags |= HAMMER2_CLUSTER_WRHARD;
                                nflags |= HAMMER2_CLUSTER_RDHARD;
                                if (cluster->focus == NULL ||
                                    focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
                                        focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
                                        cluster->focus = chain;
                                }
                        }
                        break;
                case HAMMER2_PFSTYPE_SLAVE:
                        /*
                         * We must have enough up-to-date masters to reach
                         * a quorum and the slave mirror_tid must match the
                         * quorum's mirror_tid.
                         */
                        if (nmasters >= nquorum &&
                            quorum_tid == chain->bref.mirror_tid) {
                                ++nslaves;
                                nflags |= HAMMER2_CLUSTER_RDHARD;
                                if (cluster->focus == NULL) {
                                        focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
                                        cluster->focus = chain;
                                }
                        }
                        break;
                case HAMMER2_PFSTYPE_SOFT_MASTER:
                        /*
                         * Directly mounted soft master always wins.  There
                         * should be only one.
                         */
                        KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
                        cluster->focus = chain;
                        focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
                        break;
                case HAMMER2_PFSTYPE_SOFT_SLAVE:
                        /*
                         * Directly mounted soft slave always wins.  There
                         * should be only one.
                         */
                        KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
                        if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
                                cluster->focus = chain;
                                focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
                        }
                        break;
                default:
                        break;
                }
        }

        /*
         * Set SSYNCED or MSYNCED for slaves and masters respectively if
         * all available nodes (even if 0 are available) are fully
         * synchronized.  This is used by the synchronization thread to
         * determine if there is work it could potentially accomplish.
         */
        if (nslaves == ttlslaves)
                nflags |= HAMMER2_CLUSTER_SSYNCED;
        if (nmasters == ttlmasters)
                nflags |= HAMMER2_CLUSTER_MSYNCED;

        /*
         * Determine if the cluster was successfully locked for the
         * requested operation and generate an error code.  The cluster
         * will not be locked (or ref'd) if an error is returned.
         */
        atomic_set_int(&cluster->flags, nflags);
        atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
        error = 0;

        if (how & HAMMER2_RESOLVE_RDONLY) {
                if ((nflags & HAMMER2_CLUSTER_RDOK) == 0)
                        error = EIO;
        } else {
                if ((nflags & HAMMER2_CLUSTER_WROK) == 0)
                        error = EIO;
        }

        if (error) {
                kprintf("hammer2_cluster_lock: cluster %p lock failed %d\n",
                        cluster, error);
                for (i = 0; i < cluster->nchains; ++i) {
                        chain = cluster->array[i].chain;
                        if (chain == NULL)
                                continue;
                        hammer2_chain_unlock(chain);
                }
                if ((how & HAMMER2_RESOLVE_NOREF) == 0)
                        atomic_add_int(&cluster->refs, -1);
        }

        return error;
}

#if 0
/*
 * Replace the contents of dst with src, adding a reference to src's chains
 * but not adding any additional locks.
 *
 * dst is assumed to already have a ref and any chains present in dst are
 * assumed to be locked and will be unlocked.
 *
 * If the chains in src are locked, only one of (src) or (dst) should be
 * considered locked by the caller after return, not both.
 */
void
hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
        hammer2_chain_t *chain;
        hammer2_chain_t *tmp;
        int i;

        KKASSERT(dst->refs == 1);
        dst->focus = NULL;

        for (i = 0; i < src->nchains; ++i) {
                chain = src->array[i].chain;
                if (chain) {
                        hammer2_chain_ref(chain);
                        if (i < dst->nchains &&
                            (tmp = dst->array[i].chain) != NULL) {
                                hammer2_chain_unlock(tmp);
                        }
                        dst->array[i].chain = chain;
                        if (dst->focus == NULL)
                                dst->focus = chain;
                }
        }
        while (i < dst->nchains) {
                chain = dst->array[i].chain;
                if (chain) {
                        hammer2_chain_unlock(chain);
                        dst->array[i].chain = NULL;
                }
                ++i;
        }
        dst->nchains = src->nchains;
}

/*
 * Replace the contents of the locked destination with the contents of the
 * locked source.  The destination must have one ref.
 *
 * Returns with the destination still with one ref and the copied chains
 * with an additional lock (representing their state on the destination).
 * The original chains associated with the destination are unlocked.
 *
 * From the point of view of the caller, both src and dst are locked on
 * call and remain locked on return.
 *
 * XXX adjust flag state
 */
void
hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
        hammer2_chain_t *chain;
        hammer2_chain_t *tmp;
        int i;

        KKASSERT(dst->refs == 1);

        dst->focus = NULL;
        for (i = 0; i < src->nchains; ++i) {
                chain = src->array[i].chain;
                if (chain) {
                        hammer2_chain_lock(chain, 0);
                        if (i < dst->nchains &&
                            (tmp = dst->array[i].chain) != NULL) {
                                hammer2_chain_unlock(tmp);
                        }
                        dst->array[i].chain = chain;
                }
        }
        while (i < dst->nchains) {
                chain = dst->array[i].chain;
                if (chain) {
                        hammer2_chain_unlock(chain);
                        dst->array[i].chain = NULL;
                }
                ++i;
        }
        dst->nchains = src->nchains;
        dst->flags = src->flags;
        dst->focus = src->focus;
}
#endif

/*
 * Copy a cluster, returning a ref'd cluster.  All underlying chains
 * are also ref'd, but not locked.
 *
 * The cluster focus is not set because the cluster is not yet locked
 * (and the originating cluster does not have to be locked either).
 */
hammer2_cluster_t *
hammer2_cluster_copy(hammer2_cluster_t *ocluster)
{
        hammer2_pfs_t *pmp = ocluster->pmp;
        hammer2_cluster_t *ncluster;
        hammer2_chain_t *chain;
        int i;

        ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
        ncluster->pmp = pmp;
        ncluster->nchains = ocluster->nchains;
        ncluster->refs = 1;
        ncluster->flags = 0;    /* cluster not locked */

        for (i = 0; i < ocluster->nchains; ++i) {
                chain = ocluster->array[i].chain;
                ncluster->array[i].chain = chain;
                if (chain)
                        hammer2_chain_ref(chain);
        }
        return (ncluster);
}

/*
 * Unlock and deref a cluster.  The cluster is destroyed if this is the
 * last ref.
 */
void
hammer2_cluster_unlock(hammer2_cluster_t *cluster)
{
        hammer2_chain_t *chain;
        int i;

        if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
                kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
                        cluster);
        }
        /* KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); */
        KKASSERT(cluster->refs > 0);
        atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);

        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain) {
                        hammer2_chain_unlock(chain);
                        if (cluster->refs == 1)
                                cluster->array[i].chain = NULL; /* safety */
                }
        }
        cluster->focus = NULL;

        if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
                kfree(cluster, M_HAMMER2);
                /* cluster = NULL; safety */
        }
}

/*
 * Resize the cluster's physical storage allocation in-place.  This may
 * replace the cluster's chains.
 */
void
hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
                       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
                       int nradix, int flags)
{
        hammer2_chain_t *chain;
        int i;

        KKASSERT(cparent->pmp == cluster->pmp);         /* can be NULL */
        KKASSERT(cparent->nchains == cluster->nchains);

        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain) {
                        KKASSERT(cparent->array[i].chain);
                        hammer2_chain_resize(trans, ip,
                                             cparent->array[i].chain, chain,
                                             nradix, flags);
                }
        }
}

/*
 * Set an inode's cluster modified, marking the related chains RW and
 * duplicating them if necessary.
 *
 * The passed-in cluster is a localized copy of the cluster previously
 * acquired when the inode was locked (and possibly replaced in the
 * meantime), and must also be updated.  In fact, we update it first and
 * then synchronize the inode's cluster cache.
 */
hammer2_inode_data_t *
hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
                          hammer2_cluster_t *cluster, int flags)
{
        atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
        hammer2_cluster_modify(trans, cluster, flags);

        hammer2_inode_repoint(ip, NULL, cluster);
        if (ip->vp)
                vsetisdirty(ip->vp);
        return (&hammer2_cluster_wdata(cluster)->ipdata);
}

/*
 * Adjust the cluster's chains to allow modification and adjust the
 * focus.  Data will be accessible on return.
 */
void
hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
                       int flags)
{
        hammer2_chain_t *chain;
        int i;

        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain)
                        hammer2_chain_modify(trans, chain, flags);
        }
}

/*
 * Synchronize modifications from the focus to other chains in a cluster.
 * Convenient because nominal API users can just modify the contents of the
 * focus (at least for non-blockref data).
 *
 * Nominal front-end operations only edit non-block-table data in a single
 * chain.  This code copies such modifications to the other chains in the
 * cluster.  Blocktable modifications are handled on a chain-by-chain basis
 * by both the frontend and the backend and will explode in fireworks if
 * blindly copied.
 */
void
hammer2_cluster_modsync(hammer2_cluster_t *cluster)
{
        hammer2_chain_t *focus;
        hammer2_chain_t *scan;
        const hammer2_inode_data_t *ripdata;
        hammer2_inode_data_t *wipdata;
        int i;

        focus = cluster->focus;
        KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);

        for (i = 0; i < cluster->nchains; ++i) {
                scan = cluster->array[i].chain;
                if (scan == NULL || scan == focus)
                        continue;
                KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
                KKASSERT(focus->bytes == scan->bytes &&
                         focus->bref.type == scan->bref.type);
                switch(focus->bref.type) {
                case HAMMER2_BREF_TYPE_INODE:
                        ripdata = &focus->data->ipdata;
                        wipdata = &scan->data->ipdata;
                        if ((ripdata->op_flags &
                            HAMMER2_OPFLAG_DIRECTDATA) == 0) {
                                bcopy(ripdata, wipdata,
                                      offsetof(hammer2_inode_data_t, u));
                                break;
                        }
                        /* fall through to full copy */
                case HAMMER2_BREF_TYPE_DATA:
                        bcopy(focus->data, scan->data, focus->bytes);
                        break;
                case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
                case HAMMER2_BREF_TYPE_FREEMAP:
                case HAMMER2_BREF_TYPE_VOLUME:
                        panic("hammer2_cluster_modsync: illegal node type");
                        /* NOT REACHED */
                        break;
                default:
                        panic("hammer2_cluster_modsync: unknown node type");
                        break;
                }
        }
}

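/*
 * Illustrative sketch (not compiled): the nominal front-end pattern the
 * two functions above are designed for.  The transaction, inode, and
 * cluster arguments are hypothetical and the edited field is only an
 * example; the point is that the caller edits the focus copy returned by
 * hammer2_cluster_modify_ip() and then propagates it to the other chains
 * with hammer2_cluster_modsync().
 */
#if 0
static void
example_cluster_modify_pattern(hammer2_trans_t *trans, hammer2_inode_t *ip,
                               hammer2_cluster_t *cluster)
{
        hammer2_inode_data_t *wipdata;

        /* Mark the inode's chains modified and get writable focus data. */
        wipdata = hammer2_cluster_modify_ip(trans, ip, cluster, 0);

        /* Edit non-blockref fields on the focus only. */
        wipdata->uflags = 0;

        /* Copy the focus modifications to the other chains. */
        hammer2_cluster_modsync(cluster);
}
#endif
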
/*
 * Lookup initialization/completion API
 */
hammer2_cluster_t *
hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
{
        hammer2_cluster_t *cluster;
        int i;

        cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
        cluster->pmp = cparent->pmp;                    /* can be NULL */
        cluster->flags = 0;     /* cluster not locked (yet) */
        /* cluster->focus = NULL; already null */

        for (i = 0; i < cparent->nchains; ++i)
                cluster->array[i].chain = cparent->array[i].chain;
        cluster->nchains = cparent->nchains;

        /*
         * Independently lock (this will also give cluster 1 ref)
         */
        if (flags & HAMMER2_LOOKUP_SHARED) {
                hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
                                              HAMMER2_RESOLVE_SHARED);
        } else {
                hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
        }
        return (cluster);
}

void
hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
{
        if (cparent)
                hammer2_cluster_unlock(cparent);
}

/*
 * Locate first match or overlap under parent, return a new cluster
 */
hammer2_cluster_t *
hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
                     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
        hammer2_pfs_t *pmp;
        hammer2_cluster_t *cluster;
        hammer2_chain_t *chain;
        hammer2_key_t key_accum;
        hammer2_key_t key_next;
        hammer2_key_t bref_key;
        int null_count;
        int bref_keybits;
        int i;
        uint8_t bref_type;
        u_int bytes;

        pmp = cparent->pmp;                             /* can be NULL */
        key_accum = *key_nextp;
        null_count = 0;
        bref_type = 0;
        bref_key = 0;
        bref_keybits = 0;
        bytes = 0;

        cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
        cluster->pmp = pmp;                             /* can be NULL */
        cluster->refs = 1;
        if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
                cluster->flags |= HAMMER2_CLUSTER_LOCKED;

        for (i = 0; i < cparent->nchains; ++i) {
                key_next = *key_nextp;
                if (cparent->array[i].chain == NULL) {
                        ++null_count;
                        continue;
                }
                chain = hammer2_chain_lookup(&cparent->array[i].chain,
                                             &key_next,
                                             key_beg, key_end,
                                             &cparent->array[i].cache_index,
                                             flags);
                cluster->array[i].chain = chain;
                if (chain == NULL) {
                        ++null_count;
                } else {
                        int ddflag = (chain->bref.type ==
                                      HAMMER2_BREF_TYPE_INODE);

                        /*
                         * Set default focus.
                         */
                        if (cluster->focus == NULL) {
                                bref_type = chain->bref.type;
                                bref_key = chain->bref.key;
                                bref_keybits = chain->bref.keybits;
                                bytes = chain->bytes;
                                cluster->ddflag = ddflag;
                                cluster->focus = chain;
                        }

                        /*
                         * Override default focus to follow the parent.
                         */
                        if (cparent->focus == cparent->array[i].chain)
                                cluster->focus = chain;

                        KKASSERT(bref_type == chain->bref.type);
                        KKASSERT(bref_key == chain->bref.key);
                        KKASSERT(bref_keybits == chain->bref.keybits);
                        KKASSERT(bytes == chain->bytes);
                        KKASSERT(cluster->ddflag == ddflag);
                }
                if (key_accum > key_next)
                        key_accum = key_next;
        }
        *key_nextp = key_accum;
        cluster->nchains = i;

        if (null_count == i) {
                hammer2_cluster_drop(cluster);
                cluster = NULL;
        }

        return (cluster);
}

/*
 * Locate next match or overlap under parent, replace cluster
 */
hammer2_cluster_t *
hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
                     hammer2_key_t *key_nextp,
                     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
        hammer2_chain_t *chain;
        hammer2_key_t key_accum;
        hammer2_key_t key_next;
        hammer2_key_t bref_key;
        int null_count;
        int bref_keybits;
        int i;
        uint8_t bref_type;
        u_int bytes;

        key_accum = *key_nextp;
        null_count = 0;
        cluster->focus = NULL;
        cparent->focus = NULL;

        bref_type = 0;
        bref_key = 0;
        bref_keybits = 0;
        bytes = 0;
        cluster->ddflag = 0;

        for (i = 0; i < cparent->nchains; ++i) {
                key_next = *key_nextp;
                chain = cluster->array[i].chain;
                if (chain == NULL) {
                        ++null_count;
                        continue;
                }
                if (cparent->array[i].chain == NULL) {
                        if (flags & HAMMER2_LOOKUP_NOLOCK)
                                hammer2_chain_drop(chain);
                        else
                                hammer2_chain_unlock(chain);
                        ++null_count;
                        continue;
                }
                chain = hammer2_chain_next(&cparent->array[i].chain, chain,
                                           &key_next, key_beg, key_end,
                                           &cparent->array[i].cache_index,
                                           flags);
                cluster->array[i].chain = chain;
                if (chain == NULL) {
                        ++null_count;
                } else {
                        int ddflag = (chain->bref.type ==
                                      HAMMER2_BREF_TYPE_INODE);
                        if (cluster->focus == NULL) {
                                bref_type = chain->bref.type;
                                bref_key = chain->bref.key;
                                bref_keybits = chain->bref.keybits;
                                bytes = chain->bytes;
                                cluster->ddflag = ddflag;
                                cluster->focus = chain;
                        }

                        /*
                         * Override default focus to follow the parent.
                         */
                        if (cparent->focus == cparent->array[i].chain)
                                cluster->focus = chain;

                        KKASSERT(bref_type == chain->bref.type);
                        KKASSERT(bref_key == chain->bref.key);
                        KKASSERT(bref_keybits == chain->bref.keybits);
                        KKASSERT(bytes == chain->bytes);
                        KKASSERT(cluster->ddflag == ddflag);
                }
                if (key_accum > key_next)
                        key_accum = key_next;
        }
        cluster->nchains = i;

        if (null_count == i) {
                hammer2_cluster_drop(cluster);
                cluster = NULL;
        }
        return(cluster);
}

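/*
 * Illustrative sketch (not compiled): the nominal lookup/next iteration
 * pattern built on the functions above.  The parent cluster and the
 * full-range key bounds are hypothetical.  Each returned cluster is
 * consumed by the following hammer2_cluster_next() call (or dropped
 * internally when the scan is exhausted); a caller breaking out early
 * would instead release the current cluster with hammer2_cluster_unlock().
 */
#if 0
static void
example_cluster_iteration(hammer2_cluster_t *cparent0)
{
        hammer2_cluster_t *cparent;
        hammer2_cluster_t *cluster;
        hammer2_key_t key_next;

        cparent = hammer2_cluster_lookup_init(cparent0, 0);
        cluster = hammer2_cluster_lookup(cparent, &key_next,
                                         0, HAMMER2_KEY_MAX, 0);
        while (cluster) {
                /* ... examine cluster->focus / hammer2_cluster_rdata() ... */
                cluster = hammer2_cluster_next(cparent, cluster, &key_next,
                                               key_next, HAMMER2_KEY_MAX, 0);
        }
        hammer2_cluster_lookup_done(cparent);
}
#endif
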
#if 0
/*
 * XXX initial NULL cluster needs reworking (pass **clusterp ?)
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_chain = (parent, NULL) and
 * next_chain = (parent, chain).
 *
 * The passed-in parent must be locked and its data resolved.  The returned
 * chain will be locked.  Pass chain == NULL to acquire the first sub-chain
 * under parent and then iterate with the passed-in chain (which this
 * function will unlock).
 */
hammer2_cluster_t *
hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
                     int flags)
{
        hammer2_chain_t *chain;
        int null_count;
        int i;

        null_count = 0;

        for (i = 0; i < cparent->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain == NULL) {
                        ++null_count;
                        continue;
                }
                if (cparent->array[i].chain == NULL) {
                        if (flags & HAMMER2_LOOKUP_NOLOCK)
                                hammer2_chain_drop(chain);
                        else
                                hammer2_chain_unlock(chain);
                        ++null_count;
                        continue;
                }

                chain = hammer2_chain_scan(cparent->array[i].chain, chain,
                                           &cparent->array[i].cache_index,
                                           flags);
                cluster->array[i].chain = chain;
                if (chain == NULL)
                        ++null_count;
        }

        if (null_count == i) {
                hammer2_cluster_drop(cluster);
                cluster = NULL;
        }
        return(cluster);
}

#endif

/*
 * Create a new cluster using the specified key
 */
int
hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
                     hammer2_cluster_t **clusterp,
                     hammer2_key_t key, int keybits,
                     int type, size_t bytes, int flags)
{
        hammer2_cluster_t *cluster;
        hammer2_pfs_t *pmp;
        int error;
        int i;

        pmp = trans->pmp;                               /* can be NULL */

        if ((cluster = *clusterp) == NULL) {
                cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
                                  M_WAITOK | M_ZERO);
                cluster->pmp = pmp;                     /* can be NULL */
                cluster->refs = 1;
                cluster->flags = HAMMER2_CLUSTER_LOCKED;
        }
        cluster->focus = NULL;

        /*
         * NOTE: cluster->array[] entries can initially be NULL.  If
         *       *clusterp is supplied, skip NULL entries, otherwise
         *       create new chains.
         */
        for (i = 0; i < cparent->nchains; ++i) {
                if (*clusterp && cluster->array[i].chain == NULL) {
                        continue;
                }
                error = hammer2_chain_create(trans, &cparent->array[i].chain,
                                             &cluster->array[i].chain, pmp,
                                             key, keybits,
                                             type, bytes, flags);
                KKASSERT(error == 0);
                if (cluster->focus == NULL)
                        cluster->focus = cluster->array[i].chain;
                if (cparent->focus == cparent->array[i].chain)
                        cluster->focus = cluster->array[i].chain;
        }
        cluster->nchains = i;
        *clusterp = cluster;

        return error;
}

/*
 * Rename a cluster to a new parent.
 *
 * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields
 *          are used from a passed-in non-NULL bref pointer.  All other fields
 *          are extracted from the original chain for each chain in the
 *          iteration.
 */
void
hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
                       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
                       int flags)
{
        hammer2_chain_t *chain;
        hammer2_blockref_t xbref;
        int i;

        cluster->focus = NULL;
        cparent->focus = NULL;

        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain) {
                        if (bref) {
                                xbref = chain->bref;
                                xbref.key = bref->key;
                                xbref.keybits = bref->keybits;
                                hammer2_chain_rename(trans, &xbref,
                                                     &cparent->array[i].chain,
                                                     chain, flags);
                        } else {
                                hammer2_chain_rename(trans, NULL,
                                                     &cparent->array[i].chain,
                                                     chain, flags);
                        }
                        KKASSERT(cluster->array[i].chain == chain); /*remove*/
                }
        }
}

/*
 * Mark a cluster deleted
 */
void
hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
                       hammer2_cluster_t *cluster, int flags)
{
        hammer2_chain_t *chain;
        hammer2_chain_t *parent;
        int i;

        if (cparent == NULL) {
                kprintf("cparent is NULL\n");
                return;
        }

        for (i = 0; i < cluster->nchains; ++i) {
                parent = (i < cparent->nchains) ?
                         cparent->array[i].chain : NULL;
                chain = cluster->array[i].chain;
                if (chain == NULL)
                        continue;
                if (chain->parent != parent) {
                        kprintf("hammer2_cluster_delete: parent "
                                "mismatch chain=%p parent=%p against=%p\n",
                                chain, chain->parent, parent);
                } else {
                        hammer2_chain_delete(trans, parent, chain, flags);
                }
        }
}

/*
 * Create a snapshot of the specified {parent, ochain} with the specified
 * label.  The originating hammer2_inode must be exclusively locked for
 * safety.
 *
 * The ioctl code has already synced the filesystem.
 */
int
hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
                       hammer2_ioc_pfs_t *pfs)
{
        hammer2_dev_t *hmp;
        hammer2_cluster_t *ncluster;
        const hammer2_inode_data_t *ripdata;
        hammer2_inode_data_t *wipdata;
        hammer2_chain_t *nchain;
        hammer2_inode_t *nip;
        size_t name_len;
        hammer2_key_t lhc;
        struct vattr vat;
#if 0
        uuid_t opfs_clid;
#endif
        int error;
        int i;

        kprintf("snapshot %s\n", pfs->name);

        name_len = strlen(pfs->name);
        lhc = hammer2_dirhash(pfs->name, name_len);

        /*
         * Get the clid
         */
        ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
#if 0
        opfs_clid = ripdata->pfs_clid;
#endif
        hmp = ocluster->focus->hmp;     /* XXX find synchronized local disk */

        /*
         * Create the snapshot directory under the super-root
         *
         * Set PFS type, generate a unique filesystem id, and generate
         * a cluster id.  Use the same clid when snapshotting a PFS root,
         * which theoretically allows the snapshot to be used as part of
         * the same cluster (perhaps as a cache).
         *
         * Copy the (flushed) blockref array.  Theoretically we could use
         * chain_duplicate() but it becomes difficult to disentangle
         * the shared core so for now just brute-force it.
         */
        VATTR_NULL(&vat);
        vat.va_type = VDIR;
        vat.va_mode = 0755;
        ncluster = NULL;
        nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
                                   proc0.p_ucred, pfs->name, name_len,
                                   &ncluster,
                                   HAMMER2_INSERT_PFSROOT, &error);

        if (nip) {
                wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
                wipdata->pfs_type = HAMMER2_PFSTYPE_MASTER;
                wipdata->pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
                wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT;
                kern_uuidgen(&wipdata->pfs_fsid, 1);

                /*
                 * Give the snapshot its own private cluster.  As a snapshot
                 * no further synchronization with the original cluster will
                 * be done.
                 */
#if 0
                if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
                        wipdata->pfs_clid = opfs_clid;
                else
                        kern_uuidgen(&wipdata->pfs_clid, 1);
#endif
                kern_uuidgen(&wipdata->pfs_clid, 1);

                for (i = 0; i < ncluster->nchains; ++i) {
                        nchain = ncluster->array[i].chain;
                        if (nchain)
                                nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
                }
#if 0
                /* XXX can't set this unless we do an explicit flush, which
                   we also need a pmp assigned to do, else the flush code
                   won't flush ncluster because it thinks it is crossing a
                   flush boundary */
                hammer2_cluster_set_chainflags(ncluster,
                                               HAMMER2_CHAIN_PFSBOUNDARY);
#endif

                /* XXX hack blockset copy */
                /* XXX doesn't work with real cluster */
                KKASSERT(ocluster->nchains == 1);
                wipdata->u.blockset = ripdata->u.blockset;
                hammer2_cluster_modsync(ncluster);
                for (i = 0; i < ncluster->nchains; ++i) {
                        nchain = ncluster->array[i].chain;
                        if (nchain)
                                hammer2_flush(trans, nchain);
                }
                hammer2_inode_unlock_ex(nip, ncluster);
        }
        return (error);
}

1418 /*
1419  * Return locked parent cluster given a locked child.  The child remains
1420  * locked on return.  The new parent's focus follows the child's focus
1421  * and the parent is always resolved.
1422  */
1423 hammer2_cluster_t *
1424 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1425 {
1426         hammer2_cluster_t *cparent;
1427         int i;
1428
1429         cparent = hammer2_cluster_copy(cluster);
1430
1431         for (i = 0; i < cparent->nchains; ++i) {
1432                 hammer2_chain_t *chain;
1433                 hammer2_chain_t *rchain;
1434
1435                 /*
1436                  * Calculate parent for each element.  Old chain has an extra
1437                  * ref for cparent but the lock remains with cluster.
1438                  */
1439                 chain = cparent->array[i].chain;
1440                 if (chain == NULL)
1441                         continue;
1442                 while ((rchain = chain->parent) != NULL) {
1443                         hammer2_chain_ref(rchain);
1444                         hammer2_chain_unlock(chain);
1445                         hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1446                         hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1447                         hammer2_chain_drop(rchain);
1448                         if (chain->parent == rchain)
1449                                 break;
1450                         hammer2_chain_unlock(rchain);
1451                 }
1452                 if (cluster->focus == chain)
1453                         cparent->focus = rchain;
1454                 cparent->array[i].chain = rchain;
1455                 hammer2_chain_drop(chain);
1456         }
1457         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1458
1459         return cparent;
1460 }
1461
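/*
 * Illustrative sketch (kept under #if 0, not compiled): typical use of
 * hammer2_cluster_parent() above.  The hammer2_example_show_parent() name
 * is hypothetical and exists only for this example; the parent/unlock
 * calls are the regular cluster API.
 */
#if 0
static void
hammer2_example_show_parent(hammer2_cluster_t *cluster)
{
        hammer2_cluster_t *cparent;

        /*
         * The child cluster stays locked across the call.  The returned
         * parent cluster is locked and must be unlocked by the caller.
         */
        cparent = hammer2_cluster_parent(cluster);
        if (cparent->focus) {
                kprintf("parent bref type %d\n",
                        cparent->focus->bref.type);
        }
        hammer2_cluster_unlock(cparent);
}
#endif
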
1462 /************************************************************************
1463  *                              CLUSTER I/O                             *
1464  ************************************************************************
1465  *
1466  *
1467  * WARNING! blockref[] array data is not universal.  These functions should
1468  *          only be used to access universal data.
1469  *
1470  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1471  *          complete if necessary.  The I/Os should have already been
1472  *          initiated by the cluster_lock/chain_lock operation.
1473  *
1474  *          The cluster must already be in a modified state before wdata
1475  *          is called.  The data will already be available for this case.
1476  */
1477 const hammer2_media_data_t *
1478 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1479 {
1480         return(cluster->focus->data);
1481 }
1482
1483 hammer2_media_data_t *
1484 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1485 {
1486         KKASSERT(hammer2_cluster_modified(cluster));
1487         return(cluster->focus->data);
1488 }
1489
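/*
 * Illustrative sketch (kept under #if 0, not compiled): how the rdata/wdata
 * accessors above are intended to be used on a cluster the caller has
 * already locked.  hammer2_example_peek_poke() is a hypothetical name used
 * only for this example.
 */
#if 0
static void
hammer2_example_peek_poke(hammer2_cluster_t *cluster)
{
        const hammer2_media_data_t *rdata;
        hammer2_media_data_t *wdata;

        /*
         * Read access only requires the lock.  The rdata call may wait
         * for one of the underlying chain I/Os to complete.
         */
        rdata = hammer2_cluster_rdata(cluster);
        kprintf("pfs_type %d\n", rdata->ipdata.pfs_type);

        /*
         * Write access requires the cluster to already be in a modified
         * state (e.g. via hammer2_cluster_modify_ip()), otherwise the
         * assertion in hammer2_cluster_wdata() fires.  Changes are pushed
         * out with hammer2_cluster_modsync().
         */
        if (hammer2_cluster_modified(cluster)) {
                wdata = hammer2_cluster_wdata(cluster);
                /* no-op self-assignment, purely illustrative */
                wdata->ipdata.pfs_subtype = rdata->ipdata.pfs_subtype;
                hammer2_cluster_modsync(cluster);
        }
}
#endif
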
1490 /*
1491  * Load cluster data asynchronously with callback.
1492  *
1493  * The callback is made for the first validated data found, or with NULL
1494  * data if no valid data is available.
1495  *
1496  * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1497  *       in the inode with an exclusive lock held), but the chain structures
1498  *       may be shared.
1499  */
1500 void
1501 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1502                            void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1503 {
1504         hammer2_chain_t *chain;
1505         hammer2_iocb_t *iocb;
1506         hammer2_dev_t *hmp;
1507         hammer2_blockref_t *bref;
1508         int i;
1509
1510         /*
1511          * Try to find a chain whose data is already resolved.  If none can
1512          * be found, start with the first chain.
1513          */
1514         chain = NULL;
1515         for (i = 0; i < cluster->nchains; ++i) {
1516                 chain = cluster->array[i].chain;
1517                 if (chain && chain->data)
1518                         break;
1519         }
1520         if (i == cluster->nchains) {
1521                 chain = cluster->array[0].chain;
1522                 i = 0;
1523         }
1524
1525         iocb = &cluster->iocb;
1526         iocb->callback = callback;
1527         iocb->dio = NULL;               /* for already-validated case */
1528         iocb->cluster = cluster;
1529         iocb->chain = chain;
1530         iocb->ptr = ptr;
1531         iocb->lbase = (off_t)i;
1532         iocb->flags = 0;
1533         iocb->error = 0;
1534
1535         /*
1536          * Data already validated
1537          */
1538         if (chain->data) {
1539                 callback(iocb);
1540                 return;
1541         }
1542
1543         /*
1544          * We must resolve to a device buffer, either by issuing I/O or
1545          * by creating a zero-fill element.  We do not mark the buffer
1546          * dirty when creating a zero-fill element (the hammer2_chain_modify()
1547          * API must still be used to do that).
1548          *
1549          * The device buffer is variable-sized in powers of 2 down
1550          * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
1551          * chunk always contains buffers of the same size. (XXX)
1552          *
1553          * The minimum physical IO size may be larger than the variable
1554          * block size.
1555          *
1556          * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
1557          *            matches hammer2_devblksize()?  Or does the freemap's
1558          *            pre-zeroing handle the case for us?
1559          */
1560         bref = &chain->bref;
1561         hmp = chain->hmp;
1562
1563 #if 0
1564         /* handled by callback? <- TODO XXX even needed for loads? */
1565         /*
1566          * The getblk() optimization for a 100% overwrite can only be used
1567          * if the physical block size matches the request.
1568          */
1569         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1570             chain->bytes == hammer2_devblksize(chain->bytes)) {
1571                 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1572                 KKASSERT(error == 0);
1573                 iocb->dio = dio;
1574                 callback(iocb);
1575                 return;
1576         }
1577 #endif
1578
1579         /*
1580          * Otherwise issue a read
1581          */
1582         hammer2_adjreadcounter(&chain->bref, chain->bytes);
1583         hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
1584 }
1585
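/*
 * Illustrative sketch (kept under #if 0, not compiled): one way a caller
 * could drive hammer2_cluster_load_async() synchronously.  The
 * hammer2_example_*() names and the example_ctx structure are hypothetical
 * and exist only for this example; real callers do their chain data
 * validation inside the callback instead of just recording the error.
 */
#if 0
struct example_ctx {
        int     done;
        int     error;
};

static void
hammer2_example_iodone(hammer2_iocb_t *iocb)
{
        struct example_ctx *ctx = iocb->ptr;

        /*
         * iocb->dio is NULL when the chain data was already valid and the
         * callback was made synchronously; otherwise it references the
         * device buffer that the I/O just brought in.
         */
        ctx->error = iocb->error;
        ctx->done = 1;
        wakeup(ctx);
}

static int
hammer2_example_load(hammer2_cluster_t *cluster)
{
        struct example_ctx ctx = { 0, 0 };

        hammer2_cluster_load_async(cluster, hammer2_example_iodone, &ctx);

        /*
         * Simplified polling wait; a real caller would use a proper
         * sleep/wakeup interlock.
         */
        while (ctx.done == 0)
                tsleep(&ctx, 0, "h2exio", hz);
        return (ctx.error);
}
#endif
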
1586 /************************************************************************
1587  *                          NODE FAILURES                               *
1588  ************************************************************************
1589  *
1590  * A node failure can occur for numerous reasons.
1591  *
1592  *      - A read I/O may fail
1593  *      - A write I/O may fail
1594  *      - An unexpected chain might be found (or be missing)
1595  *      - A node might disconnect temporarily and reconnect later
1596  *        (for example, a USB stick could get pulled, or a node might
1597  *        be programmatically disconnected).
1598  *      - A node might run out of space during a modifying operation.
1599  *
1600  * When a read failure or an unexpected chain state is found, the chain and
1601  * parent chain at the failure point for the nodes involved (the nodes
1602  * which we determine to be in error) are flagged as failed and removed
1603  * from the cluster.  The node itself is allowed to remain active.  The
1604  * highest common point (usually a parent chain) is queued to the
1605  * resynchronization thread for action.
1606  *
1607  * When a write I/O fails or a node runs out of space, we first adjust
1608  * as if a read failure occurred, but we further disable flushes on the
1609  * ENTIRE node.  Concurrent modifying transactions are allowed to complete
1610  * but any new modifying transactions will automatically remove the node
1611  * from consideration in all related cluster structures and not generate
1612  * any new modified chains.  The ROOT chain for the failed node(s) is queued
1613  * to the resynchronization thread for action.
1614  *
1615  * A temporary disconnect is handled as if a write failure occurred.
1616  *
1617  * Any of these failures might or might not stall related high level VNOPS,
1618  * depending on what has failed, what nodes remain, the type of cluster,
1619  * and the operating state of the cluster.
1620  *
1621  *                          FLUSH ON WRITE-DISABLED NODES
1622  *
1623  * A flush on a write-disabled node is not allowed to write anything because
1624  * we cannot safely update the mirror_tid anywhere on the failed node.  The
1625  * synchronization thread uses mirror_tid to calculate incremental resyncs.
1626  * Dirty meta-data related to the failed node is thrown away.
1627  *
1628  * Dirty buffer cache buffers and inodes are only thrown away if they can be
1629  * retired... that is, if the filesystem still has enough nodes to complete
1630  * the operation.
1631  */
1632
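/*
 * Illustrative sketch (kept under #if 0, not compiled): the read-failure
 * policy described above expressed as code.  HAMMER2_CHAIN_FAILED and
 * hammer2_syncthr_queue() are hypothetical placeholders for machinery that
 * does not exist yet; only the cluster/chain fields used here are real.
 */
#if 0
static void
hammer2_example_fail_element(hammer2_cluster_t *cluster, int i)
{
        hammer2_chain_t *chain;
        hammer2_chain_t *parent;

        chain = cluster->array[i].chain;
        if (chain == NULL)
                return;
        parent = chain->parent;

        /*
         * Flag the chain and its parent at the failure point.  The node
         * itself remains active.
         */
        atomic_set_int(&chain->flags, HAMMER2_CHAIN_FAILED);
        if (parent)
                atomic_set_int(&parent->flags, HAMMER2_CHAIN_FAILED);

        /*
         * Queue the highest common point (usually the parent) to the
         * per-node resynchronization thread, then remove the element from
         * the cluster.  Lock/ref handling is glossed over here.
         */
        hammer2_syncthr_queue(chain->hmp, parent);
        cluster->array[i].chain = NULL;
        hammer2_chain_drop(chain);
}
#endif
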
1633 /************************************************************************
1634  *                      SYNCHRONIZATION THREAD                          *
1635  ************************************************************************
1636  *
1637  * This thread is responsible for [re]synchronizing the cluster representing
1638  * a PFS.  Any out-of-sync or failed node starts this thread on a
1639  * node-by-node basis when the failure is detected.
1640  *
1641  * Clusters needing resynchronization are queued at the highest point
1642  * where the parent on the failed node is still valid, or a special
1643  * incremental scan from the ROOT is queued if no parent exists.  This
1644  * thread is also responsible for waiting for the failed node to reconnect
1645  * if the cause was a disconnect, and for waiting for space to be freed
1646  * up if the cause was running out of space.
1647  *
1648  * If the cause is a node running out of space, this thread will also
1649  * remove older (unlocked) snapshots to recover space and then start
1650  * resynchronization.
1651  *
1652  * Each resynchronization pass virtually snapshots the PFS on the good nodes
1653  * and synchronizes using that snapshot against the target node.  This
1654  * ensures a consistent chain topology and also avoids interference between
1655  * the resynchronization thread and frontend operations.
1656  *
1657  * Since these are per-node threads it is possible to resynchronize several
1658  * nodes at once.
1659  */
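
/*
 * Illustrative sketch (kept under #if 0, not compiled): the general shape
 * of the per-node synchronization thread described above.  The
 * hammer2_syncthr_t structure and all hammer2_syncthr_*() helpers are
 * hypothetical names for machinery that is still being designed.
 */
#if 0
static void
hammer2_example_syncthr(void *arg)
{
        hammer2_syncthr_t *thr = arg;
        int error;

        for (;;) {
                /*
                 * Wait for a queued resynchronization request for this
                 * node, or for a shutdown request.
                 */
                error = hammer2_syncthr_wait(thr);
                if (error)
                        break;

                /*
                 * If the failure was a disconnect, wait for the node to
                 * reconnect; if it ran out of space, prune older (unlocked)
                 * snapshots until enough space is recovered.
                 */
                hammer2_syncthr_wait_ready(thr);

                /*
                 * Each pass runs against a virtual snapshot of the PFS
                 * taken on the good nodes, so the resync does not interfere
                 * with frontend operations.
                 */
                hammer2_syncthr_pass(thr);
        }
}
#endif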