hammer2 - update documentation, cleanup
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *          locks and I/O, do quorum and/or master-slave processing, and
50  *          it must operate properly even if some nodes are broken (which
51  *          can also mean indefinite locks).
52  *
53  *                              CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *              hammer2_cluster_lock()
59  *              hammer2_cluster_parent()
60  *
61  *      - Most complex functions, quorum management on transaction ids.
62  *
63  *      - Locking and data accesses must be internally asynchronous.
64  *
65  *      - Validate and manage cache coherency primitives (cache state
66  *        is stored in chain topologies but must be validated by these
67  *        functions).
68  *
69  * (2) Lookups and Scans
70  *              hammer2_cluster_lookup()
71  *              hammer2_cluster_next()
72  *
73  *      - Depend on locking & data retrieval functions, but still complex.
74  *
75  *      - Must do quorum management on transaction ids.
76  *
77  *      - Lookup and Iteration ops Must be internally asynchronous.
78  *
79  * (3) Modifying Operations
80  *              hammer2_cluster_create()
81  *              hammer2_cluster_rename()
82  *              hammer2_cluster_delete()
83  *              hammer2_cluster_modify()
84  *              hammer2_cluster_modsync()
85  *
86  *      - Can usually punt on failures, operation continues unless quorum
87  *        is lost.  If quorum is lost, must wait for resynchronization
88  *        (depending on the management mode).
89  *
90  *      - Must disconnect node on failures (also not flush), remount, and
91  *        resynchronize.
92  *
93  *      - Network links (via kdmsg) are relatively easy to issue as the
94  *        complex underworkings of hammer2_chain.c don't have to messed
95  *        with (the protocol is at a higher level than block-level).
96  *
97  *      - Multiple local disk nodes (i.e. block devices) are another matter.
98  *        Chain operations have to be dispatched to per-node threads (xN)
99  *        because we can't asynchronize potentially very complex chain
100  *        operations in hammer2_chain.c (it would be a huge mess).
101  *
102  *        (these threads are also used to terminate incoming kdmsg ops from
103  *        other machines).
104  *
105  *      - Single-node filesystems do not use threads and will simply call
106  *        hammer2_chain.c functions directly.  This short-cut is handled
107  *        at the base of each cluster function.
108  */
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
115
116 #include "hammer2.h"
117
118 /*
119  * Returns non-zero if any chain in the cluster needs to be resized.
120  * Errored elements are not used in the calculation.
121  */
122 int
123 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
124 {
125         hammer2_chain_t *chain;
126         int i;
127
128         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
129         for (i = 0; i < cluster->nchains; ++i) {
130                 chain = cluster->array[i].chain;
131                 if (chain == NULL)
132                         continue;
133                 if (chain->error)
134                         continue;
135                 if (chain->bytes != bytes)
136                         return 1;
137         }
138         return 0;
139 }
140
141 /*
142  * Returns the bref type of the cluster's foucs.
143  *
144  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
145  * The cluster must be locked.
146  */
147 uint8_t
148 hammer2_cluster_type(hammer2_cluster_t *cluster)
149 {
150         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
151         if (cluster->error == 0)
152                 return(cluster->focus->bref.type);
153         return 0;
154 }
155
156 /*
157  * Returns non-zero if the cluster's focus is flagged as being modified.
158  *
159  * If the cluster is errored, returns 0.
160  */
161 int
162 hammer2_cluster_modified(hammer2_cluster_t *cluster)
163 {
164         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
165         if (cluster->error == 0)
166                 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
167         return 0;
168 }
169
170 /*
171  * Returns the bref of the cluster's focus, sans any data-offset information
172  * (since offset information is per-node and wouldn't be useful).
173  *
174  * Callers use this function to access mirror_tid, key, and keybits.
175  *
176  * If the cluster is errored, returns an empty bref.
177  * The cluster must be locked.
178  */
179 void
180 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
181 {
182         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
183         if (cluster->error == 0) {
184                 *bref = cluster->focus->bref;
185                 bref->data_off = 0;
186         } else {
187                 bzero(bref, sizeof(*bref));
188         }
189 }
190
191 /*
192  * Return non-zero if the chain representing an inode has been flagged
193  * as having been unlinked.  Allows the vnode reclaim to avoid loading
194  * the inode data from disk e.g. when unmount or recycling old, clean
195  * vnodes.
196  *
197  * The cluster does not need to be locked.
198  * The focus cannot be used since the cluster might not be locked.
199  */
200 int
201 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster)
202 {
203         hammer2_chain_t *chain;
204         int flags;
205         int i;
206
207         flags = 0;
208         for (i = 0; i < cluster->nchains; ++i) {
209                 chain = cluster->array[i].chain;
210                 if (chain)
211                         flags |= chain->flags;
212         }
213         return (flags & HAMMER2_CHAIN_UNLINKED);
214 }
215
216 /*
217  * Set a bitmask of flags in all chains related to a cluster.
218  * The cluster should probably be locked.
219  */
220 void
221 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
222 {
223         hammer2_chain_t *chain;
224         int i;
225
226         for (i = 0; i < cluster->nchains; ++i) {
227                 chain = cluster->array[i].chain;
228                 if (chain)
229                         atomic_set_int(&chain->flags, flags);
230         }
231 }
232
233 /*
234  * Set a bitmask of flags in all chains related to a cluster.
235  * The cluster should probably be locked.
236  */
237 void
238 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
239 {
240         hammer2_chain_t *chain;
241         int i;
242
243         for (i = 0; i < cluster->nchains; ++i) {
244                 chain = cluster->array[i].chain;
245                 if (chain)
246                         atomic_clear_int(&chain->flags, flags);
247         }
248 }
249
250 /*
251  * Flag the cluster for flushing recursively up to the root.  Despite the
252  * work it does, this is relatively benign.  It just makes sure that the
253  * flusher has top-down visibility to this cluster.
254  *
255  * Errored chains are not flagged for flushing.
256  *
257  * The cluster should probably be locked.
258  */
259 void
260 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
261 {
262         hammer2_chain_t *chain;
263         int i;
264
265         for (i = 0; i < cluster->nchains; ++i) {
266                 chain = cluster->array[i].chain;
267                 if (chain == NULL)
268                         continue;
269                 if (chain->error)
270                         continue;
271                 hammer2_chain_setflush(trans, chain);
272         }
273 }
274
275 /*
276  * Set the check mode for the cluster.
277  * Errored elements of the cluster are ignored.
278  *
279  * The cluster must be locked.
280  */
281 void
282 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
283                                 hammer2_cluster_t *cluster,
284                                 int check_algo)
285 {
286         hammer2_chain_t *chain;
287         int i;
288
289         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
290         for (i = 0; i < cluster->nchains; ++i) {
291                 chain = cluster->array[i].chain;
292                 if (chain == NULL)
293                         continue;
294                 if (chain->error)
295                         continue;
296                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
297                 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
298                 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
299         }
300 }
301
302 /*
303  * Create a degenerate cluster with one ref from a single locked chain.
304  * The returned cluster will be focused on the chain and inherit its
305  * error state.
306  *
307  * The chain's lock and reference are transfered to the new cluster, so
308  * the caller should not try to unlock the chain separately.
309  *
310  * We fake the flags.
311  */
312 hammer2_cluster_t *
313 hammer2_cluster_from_chain(hammer2_chain_t *chain)
314 {
315         hammer2_cluster_t *cluster;
316
317         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
318         cluster->array[0].chain = chain;
319         cluster->nchains = 1;
320         cluster->focus = chain;
321         cluster->pmp = chain->pmp;
322         cluster->refs = 1;
323         cluster->error = chain->error;
324         cluster->flags = HAMMER2_CLUSTER_LOCKED |
325                          HAMMER2_CLUSTER_WRHARD |
326                          HAMMER2_CLUSTER_RDHARD |
327                          HAMMER2_CLUSTER_MSYNCED |
328                          HAMMER2_CLUSTER_SSYNCED;
329
330         return cluster;
331 }
332
333 /*
334  * Add a reference to a cluster and its underlying chains.
335  *
336  * We must also ref the underlying chains in order to allow ref/unlock
337  * sequences to later re-lock.
338  */
339 void
340 hammer2_cluster_ref(hammer2_cluster_t *cluster)
341 {
342         hammer2_chain_t *chain;
343         int i;
344
345         atomic_add_int(&cluster->refs, 1);
346         for (i = 0; i < cluster->nchains; ++i) {
347                 chain = cluster->array[i].chain;
348                 if (chain)
349                         hammer2_chain_ref(chain);
350         }
351 }
352
353 /*
354  * Drop the caller's reference to the cluster.  When the ref count drops to
355  * zero this function frees the cluster and drops all underlying chains.
356  *
357  * In-progress read I/Os are typically detached from the cluster once the
358  * first one returns (the remaining stay attached to the DIOs but are then
359  * ignored and drop naturally).
360  */
361 void
362 hammer2_cluster_drop(hammer2_cluster_t *cluster)
363 {
364         hammer2_chain_t *chain;
365         int i;
366
367         KKASSERT(cluster->refs > 0);
368         for (i = 0; i < cluster->nchains; ++i) {
369                 chain = cluster->array[i].chain;
370                 if (chain) {
371                         hammer2_chain_drop(chain);
372                         if (cluster->refs == 1)
373                                 cluster->array[i].chain = NULL;
374                 }
375         }
376         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
377                 cluster->focus = NULL;          /* safety XXX chg to assert */
378                 kfree(cluster, M_HAMMER2);
379                 /* cluster is invalid */
380         }
381 }
382
383 void
384 hammer2_cluster_wait(hammer2_cluster_t *cluster)
385 {
386         tsleep(cluster->focus, 0, "h2clcw", 1);
387 }
388
389 /*
390  * Lock and ref a cluster.  This adds a ref to the cluster and its chains
391  * and then locks them, modified by various RESOLVE flags.
392  *
393  * The act of locking a cluster sets its focus.
394  *
395  * The chains making up the cluster may be narrowed down based on quorum
396  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
397  * narrowed down to a single chain as long as the entire subtopology is known
398  * to be intact.  So, for example, we can narrow a read-only op to a single
399  * fast SLAVE but if we focus a CACHE chain we must still retain at least
400  * a SLAVE to ensure that the subtopology can be accessed.
401  *
402  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
403  * to be maintained once the topology is validated as-of the top level of
404  * the operation.
405  *
406  * If a failure occurs the operation must be aborted by higher-level code and
407  * retried. XXX
408  */
409 void
410 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
411 {
412         hammer2_chain_t *chain;
413         int i;
414
415         /* cannot be on inode-embedded cluster template, must be on copy */
416         KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
417         if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
418                 kprintf("hammer2_cluster_lock: cluster %p already locked!\n",
419                         cluster);
420         } else {
421                 KKASSERT(cluster->focus == NULL);
422         }
423         atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
424
425         if ((how & HAMMER2_RESOLVE_NOREF) == 0)
426                 atomic_add_int(&cluster->refs, 1);
427
428         /*
429          * Lock chains and resolve state.
430          */
431         for (i = 0; i < cluster->nchains; ++i) {
432                 chain = cluster->array[i].chain;
433                 if (chain == NULL)
434                         continue;
435                 hammer2_chain_lock(chain, how);
436         }
437
438         hammer2_cluster_resolve(cluster);
439 }
440
441 void
442 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
443 {
444         hammer2_chain_t *chain;
445         hammer2_pfs_t *pmp;
446         hammer2_tid_t quorum_tid;
447         int focus_pfs_type;
448         uint32_t nflags;
449         int ttlmasters;
450         int ttlslaves;
451         int nmasters;
452         int nslaves;
453         int nquorum;
454         int i;
455
456         cluster->error = 0;
457
458         quorum_tid = 0;
459         focus_pfs_type = 0;
460         nflags = 0;
461         ttlmasters = 0;
462         ttlslaves = 0;
463         nmasters = 0;
464         nslaves = 0;
465
466         /*
467          * Calculate quorum
468          */
469         pmp = cluster->pmp;
470         KKASSERT(pmp != NULL || cluster->nchains == 0);
471         nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
472
473         /*
474          * Pass 1
475          */
476         for (i = 0; i < cluster->nchains; ++i) {
477                 chain = cluster->array[i].chain;
478                 if (chain == NULL)
479                         continue;
480                 if (chain->error) {
481                         if (cluster->focus == NULL || cluster->focus == chain) {
482                                 /* error will be overridden by valid focus */
483                                 cluster->error = chain->error;
484                         }
485
486                         /*
487                          * Must count total masters and slaves whether the
488                          * chain is errored or not.
489                          */
490                         switch (cluster->pmp->pfs_types[i]) {
491                         case HAMMER2_PFSTYPE_MASTER:
492                                 ++ttlmasters;
493                                 break;
494                         case HAMMER2_PFSTYPE_SLAVE:
495                                 ++ttlslaves;
496                                 break;
497                         }
498                         continue;
499                 }
500
501                 switch (cluster->pmp->pfs_types[i]) {
502                 case HAMMER2_PFSTYPE_MASTER:
503                         ++ttlmasters;
504                         if (quorum_tid < chain->bref.mirror_tid ||
505                             nmasters == 0) {
506                                 nmasters = 1;
507                                 quorum_tid = chain->bref.mirror_tid;
508                         } else if (quorum_tid == chain->bref.mirror_tid) {
509                                 ++nmasters;
510                         }
511                         break;
512                 case HAMMER2_PFSTYPE_SLAVE:
513                         ++ttlslaves;
514                         break;
515                 case HAMMER2_PFSTYPE_SOFT_MASTER:
516                         nflags |= HAMMER2_CLUSTER_WRSOFT;
517                         nflags |= HAMMER2_CLUSTER_RDSOFT;
518                         break;
519                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
520                         nflags |= HAMMER2_CLUSTER_RDSOFT;
521                         break;
522                 case HAMMER2_PFSTYPE_SUPROOT:
523                         /*
524                          * Degenerate cluster representing the super-root
525                          * topology on a single device.
526                          */
527                         nflags |= HAMMER2_CLUSTER_WRHARD;
528                         nflags |= HAMMER2_CLUSTER_RDHARD;
529                         cluster->focus = chain;
530                         cluster->error = chain->error;
531                         break;
532                 default:
533                         break;
534                 }
535         }
536
537         /*
538          * Pass 2
539          */
540         for (i = 0; i < cluster->nchains; ++i) {
541                 chain = cluster->array[i].chain;
542                 if (chain == NULL)
543                         continue;
544                 if (chain->error) {
545                         if (cluster->focus == NULL || cluster->focus == chain) {
546                                 /* error will be overridden by valid focus */
547                                 cluster->error = chain->error;
548                         }
549                         continue;
550                 }
551
552                 switch (cluster->pmp->pfs_types[i]) {
553                 case HAMMER2_PFSTYPE_MASTER:
554                         /*
555                          * We must have enough up-to-date masters to reach
556                          * a quorum and the master mirror_tid must match
557                          * the quorum's mirror_tid.
558                          *
559                          * Do not select an errored master.
560                          */
561                         if (nmasters >= nquorum &&
562                             chain->error == 0 &&
563                             quorum_tid == chain->bref.mirror_tid) {
564                                 nflags |= HAMMER2_CLUSTER_WRHARD;
565                                 nflags |= HAMMER2_CLUSTER_RDHARD;
566                                 if (cluster->focus == NULL ||
567                                     focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
568                                         focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
569                                         cluster->focus = chain;
570                                         cluster->error = chain->error;
571                                 }
572                         } else if (chain->error == 0) {
573                                 nflags |= HAMMER2_CLUSTER_UNHARD;
574                         }
575                         break;
576                 case HAMMER2_PFSTYPE_SLAVE:
577                         /*
578                          * We must have enough up-to-date masters to reach
579                          * a quorum and the slave mirror_tid must match the
580                          * quorum's mirror_tid.
581                          *
582                          * Do not select an errored slave.
583                          */
584                         if (nmasters >= nquorum &&
585                             chain->error == 0 &&
586                             quorum_tid == chain->bref.mirror_tid) {
587                                 ++nslaves;
588                                 nflags |= HAMMER2_CLUSTER_RDHARD;
589                                 if (cluster->focus == NULL) {
590                                         focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
591                                         cluster->focus = chain;
592                                         cluster->error = chain->error;
593                                 }
594                         } else if (chain->error == 0) {
595                                 nflags |= HAMMER2_CLUSTER_UNSOFT;
596                         }
597                         break;
598                 case HAMMER2_PFSTYPE_SOFT_MASTER:
599                         /*
600                          * Directly mounted soft master always wins.  There
601                          * should be only one.
602                          */
603                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
604                         cluster->focus = chain;
605                         cluster->error = chain->error;
606                         focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
607                         break;
608                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
609                         /*
610                          * Directly mounted soft slave always wins.  There
611                          * should be only one.
612                          */
613                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
614                         if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
615                                 cluster->focus = chain;
616                                 cluster->error = chain->error;
617                                 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
618                         }
619                         break;
620                 default:
621                         break;
622                 }
623         }
624
625         if (ttlslaves == 0)
626                 nflags |= HAMMER2_CLUSTER_NOHARD;
627         if (ttlmasters == 0)
628                 nflags |= HAMMER2_CLUSTER_NOSOFT;
629
630         /*
631          * Set SSYNCED or MSYNCED for slaves and masters respectively if
632          * all available nodes (even if 0 are available) are fully
633          * synchronized.  This is used by the synchronization thread to
634          * determine if there is work it could potentially accomplish.
635          */
636         if (nslaves == ttlslaves)
637                 nflags |= HAMMER2_CLUSTER_SSYNCED;
638         if (nmasters == ttlmasters)
639                 nflags |= HAMMER2_CLUSTER_MSYNCED;
640
641         /*
642          * Determine if the cluster was successfully locked for the
643          * requested operation and generate an error code.  The cluster
644          * will not be locked (or ref'd) if an error is returned.
645          *
646          * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
647          * to determine if reading or writing is possible.  If writing, the
648          * cluster still requires a call to hammer2_cluster_modify() first.
649          */
650         atomic_set_int(&cluster->flags, nflags);
651         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
652 }
653
654 /*
655  * Copy a cluster, returned a ref'd cluster.  All underlying chains
656  * are also ref'd, but not locked.
657  *
658  * The cluster focus is not set because the cluster is not yet locked
659  * (and the originating cluster does not have to be locked either).
660  */
661 hammer2_cluster_t *
662 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
663 {
664         hammer2_pfs_t *pmp = ocluster->pmp;
665         hammer2_cluster_t *ncluster;
666         hammer2_chain_t *chain;
667         int i;
668
669         ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
670         ncluster->pmp = pmp;
671         ncluster->nchains = ocluster->nchains;
672         ncluster->refs = 1;
673         ncluster->flags = 0;    /* cluster not locked */
674
675         for (i = 0; i < ocluster->nchains; ++i) {
676                 chain = ocluster->array[i].chain;
677                 ncluster->array[i].chain = chain;
678                 if (chain)
679                         hammer2_chain_ref(chain);
680         }
681         return (ncluster);
682 }
683
684 /*
685  * Unlock and deref a cluster.  The cluster is destroyed if this is the
686  * last ref.
687  */
688 void
689 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
690 {
691         hammer2_chain_t *chain;
692         int i;
693
694         if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
695                 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
696                         cluster);
697         }
698         /* KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); */
699         KKASSERT(cluster->refs > 0);
700         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
701
702         for (i = 0; i < cluster->nchains; ++i) {
703                 chain = cluster->array[i].chain;
704                 if (chain) {
705                         hammer2_chain_unlock(chain);
706                         if (cluster->refs == 1)
707                                 cluster->array[i].chain = NULL; /* safety */
708                 }
709         }
710         cluster->focus = NULL;
711
712         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
713                 kfree(cluster, M_HAMMER2);
714                 /* cluster = NULL; safety */
715         }
716 }
717
718 /*
719  * Resize the cluster's physical storage allocation in-place.  This may
720  * replace the cluster's chains.
721  */
722 void
723 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
724                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
725                        int nradix, int flags)
726 {
727         hammer2_chain_t *chain;
728         int i;
729
730         KKASSERT(cparent->pmp == cluster->pmp);         /* can be NULL */
731         KKASSERT(cparent->nchains == cluster->nchains);
732
733         for (i = 0; i < cluster->nchains; ++i) {
734                 chain = cluster->array[i].chain;
735                 if (chain) {
736                         KKASSERT(cparent->array[i].chain);
737                         hammer2_chain_resize(trans, ip,
738                                              cparent->array[i].chain, chain,
739                                              nradix, flags);
740                 }
741         }
742 }
743
744 /*
745  * Set an inode's cluster modified, marking the related chains RW and
746  * duplicating them if necessary.
747  *
748  * The passed-in chain is a localized copy of the chain previously acquired
749  * when the inode was locked (and possilby replaced in the mean time), and
750  * must also be updated.  In fact, we update it first and then synchronize
751  * the inode's cluster cache.
752  */
753 hammer2_inode_data_t *
754 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
755                           hammer2_cluster_t *cluster, int flags)
756 {
757         atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
758         hammer2_cluster_modify(trans, cluster, flags);
759
760         hammer2_inode_repoint(ip, NULL, cluster);
761         if (ip->vp)
762                 vsetisdirty(ip->vp);
763         return (&hammer2_cluster_wdata(cluster)->ipdata);
764 }
765
766 /*
767  * Adjust the cluster's chains to allow modification and adjust the
768  * focus.  Data will be accessible on return.
769  *
770  * If our focused master errors on modify, re-resolve the cluster to
771  * try to select a different master.
772  */
773 void
774 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
775                        int flags)
776 {
777         hammer2_chain_t *chain;
778         int resolve_again;
779         int i;
780
781         resolve_again = 0;
782         for (i = 0; i < cluster->nchains; ++i) {
783                 chain = cluster->array[i].chain;
784                 if (chain == NULL)
785                         continue;
786                 if (chain->error)
787                         continue;
788                 hammer2_chain_modify(trans, chain, flags);
789                 if (cluster->focus == chain && chain->error) {
790                         cluster->error = chain->error;
791                         resolve_again = 1;
792                 }
793         }
794         if (resolve_again)
795                 hammer2_cluster_resolve(cluster);
796 }
797
798 /*
799  * Synchronize modifications from the focus to other chains in a cluster.
800  * Convenient because nominal API users can just modify the contents of the
801  * focus (at least for non-blockref data).
802  *
803  * Nominal front-end operations only edit non-block-table data in a single
804  * chain.  This code copies such modifications to the other chains in the
805  * cluster.  Blocktable modifications are handled on a chain-by-chain basis
806  * by both the frontend and the backend and will explode in fireworks if
807  * blindly copied.
808  */
809 void
810 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
811 {
812         hammer2_chain_t *focus;
813         hammer2_chain_t *scan;
814         const hammer2_inode_data_t *ripdata;
815         hammer2_inode_data_t *wipdata;
816         int i;
817
818         focus = cluster->focus;
819         KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
820
821         for (i = 0; i < cluster->nchains; ++i) {
822                 scan = cluster->array[i].chain;
823                 if (scan == NULL || scan == focus)
824                         continue;
825                 if (scan->error)
826                         continue;
827                 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
828                 KKASSERT(focus->bytes == scan->bytes &&
829                          focus->bref.type == scan->bref.type);
830                 switch(focus->bref.type) {
831                 case HAMMER2_BREF_TYPE_INODE:
832                         ripdata = &focus->data->ipdata;
833                         wipdata = &scan->data->ipdata;
834                         if ((ripdata->op_flags &
835                             HAMMER2_OPFLAG_DIRECTDATA) == 0) {
836                                 bcopy(ripdata, wipdata,
837                                       offsetof(hammer2_inode_data_t, u));
838                                 break;
839                         }
840                         /* fall through to full copy */
841                 case HAMMER2_BREF_TYPE_DATA:
842                         bcopy(focus->data, scan->data, focus->bytes);
843                         break;
844                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
845                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
846                 case HAMMER2_BREF_TYPE_FREEMAP:
847                 case HAMMER2_BREF_TYPE_VOLUME:
848                         panic("hammer2_cluster_modsync: illegal node type");
849                         /* NOT REACHED */
850                         break;
851                 default:
852                         panic("hammer2_cluster_modsync: unknown node type");
853                         break;
854                 }
855         }
856 }
857
858 /*
859  * Lookup initialization/completion API
860  */
861 hammer2_cluster_t *
862 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
863 {
864         hammer2_cluster_t *cluster;
865         int i;
866
867         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
868         cluster->pmp = cparent->pmp;                    /* can be NULL */
869         cluster->flags = 0;     /* cluster not locked (yet) */
870         /* cluster->focus = NULL; already null */
871
872         for (i = 0; i < cparent->nchains; ++i)
873                 cluster->array[i].chain = cparent->array[i].chain;
874         cluster->nchains = cparent->nchains;
875
876         /*
877          * Independently lock (this will also give cluster 1 ref)
878          */
879         if (flags & HAMMER2_LOOKUP_SHARED) {
880                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
881                                               HAMMER2_RESOLVE_SHARED);
882         } else {
883                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
884         }
885         return (cluster);
886 }
887
888 void
889 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
890 {
891         if (cparent)
892                 hammer2_cluster_unlock(cparent);
893 }
894
895 /*
896  * Locate first match or overlap under parent, return a new cluster
897  */
898 hammer2_cluster_t *
899 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
900                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
901 {
902         hammer2_pfs_t *pmp;
903         hammer2_cluster_t *cluster;
904         hammer2_chain_t *chain;
905         hammer2_key_t key_accum;
906         hammer2_key_t key_next;
907         hammer2_key_t bref_key;
908         int null_count;
909         int bref_keybits;
910         int i;
911         uint8_t bref_type;
912         u_int bytes;
913
914         pmp = cparent->pmp;                             /* can be NULL */
915         key_accum = *key_nextp;
916         null_count = 0;
917         bref_type = 0;
918         bref_key = 0;
919         bref_keybits = 0;
920         bytes = 0;
921
922         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
923         cluster->pmp = pmp;                             /* can be NULL */
924         cluster->refs = 1;
925         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
926                 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
927
928         for (i = 0; i < cparent->nchains; ++i) {
929                 key_next = *key_nextp;
930                 if (cparent->array[i].chain == NULL) {
931                         ++null_count;
932                         continue;
933                 }
934                 chain = hammer2_chain_lookup(&cparent->array[i].chain,
935                                              &key_next,
936                                              key_beg, key_end,
937                                              &cparent->array[i].cache_index,
938                                              flags);
939                 cluster->array[i].chain = chain;
940                 if (chain == NULL) {
941                         ++null_count;
942                 } else if (chain->error) {
943                         /*
944                          * Leave errored chain in cluster, but it cannot be
945                          * the cluster's focus.
946                          */
947                         ;
948                 } else {
949                         int ddflag = (chain->bref.type ==
950                                       HAMMER2_BREF_TYPE_INODE);
951
952                         /*
953                          * Set default focus.
954                          */
955                         if (cluster->focus == NULL) {
956                                 bref_type = chain->bref.type;
957                                 bref_key = chain->bref.key;
958                                 bref_keybits = chain->bref.keybits;
959                                 bytes = chain->bytes;
960                                 cluster->ddflag = ddflag;
961                                 cluster->focus = chain;
962                         }
963
964                         /*
965                          * Override default focus to follow the parent.
966                          */
967                         if (cparent->focus == cparent->array[i].chain)
968                                 cluster->focus = chain;
969
970                         KKASSERT(bref_type == chain->bref.type);
971                         KKASSERT(bref_key == chain->bref.key);
972                         KKASSERT(bref_keybits == chain->bref.keybits);
973                         KKASSERT(bytes == chain->bytes);
974                         KKASSERT(cluster->ddflag == ddflag);
975                 }
976                 if (key_accum > key_next)
977                         key_accum = key_next;
978         }
979         *key_nextp = key_accum;
980         cluster->nchains = i;
981         hammer2_cluster_resolve(cluster);
982
983         if (null_count == i) {
984                 hammer2_cluster_drop(cluster);
985                 cluster = NULL;
986         }
987
988         return (cluster);
989 }
990
991 /*
992  * Locate next match or overlap under parent, replace cluster
993  */
994 hammer2_cluster_t *
995 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
996                      hammer2_key_t *key_nextp,
997                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
998 {
999         hammer2_chain_t *chain;
1000         hammer2_key_t key_accum;
1001         hammer2_key_t key_next;
1002         hammer2_key_t bref_key;
1003         int null_count;
1004         int bref_keybits;
1005         int i;
1006         uint8_t bref_type;
1007         u_int bytes;
1008
1009         key_accum = *key_nextp;
1010         null_count = 0;
1011         cluster->focus = NULL;
1012         cparent->focus = NULL;
1013
1014         bref_type = 0;
1015         bref_key = 0;
1016         bref_keybits = 0;
1017         bytes = 0;
1018         cluster->ddflag = 0;
1019
1020         for (i = 0; i < cparent->nchains; ++i) {
1021                 key_next = *key_nextp;
1022                 chain = cluster->array[i].chain;
1023                 if (chain == NULL) {
1024                         ++null_count;
1025                         continue;
1026                 }
1027                 if (cparent->array[i].chain == NULL) {
1028                         if (flags & HAMMER2_LOOKUP_NOLOCK)
1029                                 hammer2_chain_drop(chain);
1030                         else
1031                                 hammer2_chain_unlock(chain);
1032                         ++null_count;
1033                         continue;
1034                 }
1035                 chain = hammer2_chain_next(&cparent->array[i].chain, chain,
1036                                            &key_next, key_beg, key_end,
1037                                            &cparent->array[i].cache_index,
1038                                            flags);
1039                 cluster->array[i].chain = chain;
1040                 if (chain == NULL) {
1041                         ++null_count;
1042                 } else if (chain->error) {
1043                         /*
1044                          * Leave errored chain in cluster, but it cannot be
1045                          * the cluster's focus.
1046                          */
1047                         ;
1048                 } else {
1049                         int ddflag = (chain->bref.type ==
1050                                       HAMMER2_BREF_TYPE_INODE);
1051                         if (cluster->focus == NULL) {
1052                                 bref_type = chain->bref.type;
1053                                 bref_key = chain->bref.key;
1054                                 bref_keybits = chain->bref.keybits;
1055                                 bytes = chain->bytes;
1056                                 cluster->ddflag = ddflag;
1057                                 cluster->focus = chain;
1058                         }
1059
1060                         /*
1061                          * Override default focus to follow the parent.
1062                          */
1063                         if (cparent->focus == cparent->array[i].chain)
1064                                 cluster->focus = chain;
1065
1066                         KKASSERT(bref_type == chain->bref.type);
1067                         KKASSERT(bref_key == chain->bref.key);
1068                         KKASSERT(bref_keybits == chain->bref.keybits);
1069                         KKASSERT(bytes == chain->bytes);
1070                         KKASSERT(cluster->ddflag == ddflag);
1071                 }
1072                 if (key_accum > key_next)
1073                         key_accum = key_next;
1074         }
1075         cluster->nchains = i;
1076         hammer2_cluster_resolve(cluster);
1077
1078         if (null_count == i) {
1079                 hammer2_cluster_drop(cluster);
1080                 cluster = NULL;
1081         }
1082         return(cluster);
1083 }
1084
1085 /*
1086  * Create a new cluster using the specified key
1087  */
1088 int
1089 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1090                      hammer2_cluster_t **clusterp,
1091                      hammer2_key_t key, int keybits,
1092                      int type, size_t bytes, int flags)
1093 {
1094         hammer2_cluster_t *cluster;
1095         hammer2_pfs_t *pmp;
1096         int error;
1097         int i;
1098
1099         pmp = trans->pmp;                               /* can be NULL */
1100
1101         if ((cluster = *clusterp) == NULL) {
1102                 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1103                                   M_WAITOK | M_ZERO);
1104                 cluster->pmp = pmp;                     /* can be NULL */
1105                 cluster->refs = 1;
1106                 cluster->flags = HAMMER2_CLUSTER_LOCKED;
1107         }
1108         cluster->focus = NULL;
1109
1110         /*
1111          * NOTE: cluster->array[] entries can initially be NULL.  If
1112          *       *clusterp is supplied, skip NULL entries, otherwise
1113          *       create new chains.
1114          */
1115         for (i = 0; i < cparent->nchains; ++i) {
1116                 if (*clusterp && cluster->array[i].chain == NULL) {
1117                         continue;
1118                 }
1119                 error = hammer2_chain_create(trans, &cparent->array[i].chain,
1120                                              &cluster->array[i].chain, pmp,
1121                                              key, keybits,
1122                                              type, bytes, flags);
1123                 KKASSERT(error == 0);
1124                 if (cluster->focus == NULL)
1125                         cluster->focus = cluster->array[i].chain;
1126                 if (cparent->focus == cparent->array[i].chain)
1127                         cluster->focus = cluster->array[i].chain;
1128         }
1129         cluster->nchains = i;
1130         *clusterp = cluster;
1131         hammer2_cluster_resolve(cluster);
1132
1133         return error;
1134 }
1135
1136 /*
1137  * Rename a cluster to a new parent.
1138  *
1139  * WARNING! Any passed-in bref is probaly from hammer2_cluster_bref(),
1140  *          So the data_off field is not relevant.  Only the key and
1141  *          keybits are used.
1142  */
1143 void
1144 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
1145                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1146                        int flags)
1147 {
1148         hammer2_chain_t *chain;
1149         hammer2_blockref_t xbref;
1150         int i;
1151
1152         cluster->focus = NULL;
1153         cparent->focus = NULL;
1154
1155         for (i = 0; i < cluster->nchains; ++i) {
1156                 chain = cluster->array[i].chain;
1157                 if (chain) {
1158                         if (bref) {
1159                                 xbref = chain->bref;
1160                                 xbref.key = bref->key;
1161                                 xbref.keybits = bref->keybits;
1162                                 hammer2_chain_rename(trans, &xbref,
1163                                                      &cparent->array[i].chain,
1164                                                      chain, flags);
1165                         } else {
1166                                 hammer2_chain_rename(trans, NULL,
1167                                                      &cparent->array[i].chain,
1168                                                      chain, flags);
1169                         }
1170                         KKASSERT(cluster->array[i].chain == chain); /*remove*/
1171                 }
1172         }
1173 }
1174
1175 /*
1176  * Mark a cluster deleted
1177  */
1178 void
1179 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1180                        hammer2_cluster_t *cluster, int flags)
1181 {
1182         hammer2_chain_t *chain;
1183         hammer2_chain_t *parent;
1184         int i;
1185
1186         if (cparent == NULL) {
1187                 kprintf("cparent is NULL\n");
1188                 return;
1189         }
1190
1191         for (i = 0; i < cluster->nchains; ++i) {
1192                 parent = (i < cparent->nchains) ?
1193                          cparent->array[i].chain : NULL;
1194                 chain = cluster->array[i].chain;
1195                 if (chain == NULL)
1196                         continue;
1197                 if (chain->parent != parent) {
1198                         kprintf("hammer2_cluster_delete: parent "
1199                                 "mismatch chain=%p parent=%p against=%p\n",
1200                                 chain, chain->parent, parent);
1201                 } else {
1202                         hammer2_chain_delete(trans, parent, chain, flags);
1203                 }
1204         }
1205 }
1206
1207 /*
1208  * Create a snapshot of the specified {parent, ochain} with the specified
1209  * label.  The originating hammer2_inode must be exclusively locked for
1210  * safety.
1211  *
1212  * The ioctl code has already synced the filesystem.
1213  */
1214 int
1215 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1216                        hammer2_ioc_pfs_t *pfs)
1217 {
1218         hammer2_dev_t *hmp;
1219         hammer2_cluster_t *ncluster;
1220         const hammer2_inode_data_t *ripdata;
1221         hammer2_inode_data_t *wipdata;
1222         hammer2_chain_t *nchain;
1223         hammer2_inode_t *nip;
1224         size_t name_len;
1225         hammer2_key_t lhc;
1226         struct vattr vat;
1227 #if 0
1228         uuid_t opfs_clid;
1229 #endif
1230         int error;
1231         int i;
1232
1233         kprintf("snapshot %s\n", pfs->name);
1234
1235         name_len = strlen(pfs->name);
1236         lhc = hammer2_dirhash(pfs->name, name_len);
1237
1238         /*
1239          * Get the clid
1240          */
1241         ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1242 #if 0
1243         opfs_clid = ripdata->pfs_clid;
1244 #endif
1245         hmp = ocluster->focus->hmp;     /* XXX find synchronized local disk */
1246
1247         /*
1248          * Create the snapshot directory under the super-root
1249          *
1250          * Set PFS type, generate a unique filesystem id, and generate
1251          * a cluster id.  Use the same clid when snapshotting a PFS root,
1252          * which theoretically allows the snapshot to be used as part of
1253          * the same cluster (perhaps as a cache).
1254          *
1255          * Copy the (flushed) blockref array.  Theoretically we could use
1256          * chain_duplicate() but it becomes difficult to disentangle
1257          * the shared core so for now just brute-force it.
1258          */
1259         VATTR_NULL(&vat);
1260         vat.va_type = VDIR;
1261         vat.va_mode = 0755;
1262         ncluster = NULL;
1263         nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1264                                    proc0.p_ucred, pfs->name, name_len,
1265                                    &ncluster,
1266                                    HAMMER2_INSERT_PFSROOT, &error);
1267
1268         if (nip) {
1269                 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1270                 wipdata->pfs_type = HAMMER2_PFSTYPE_MASTER;
1271                 wipdata->pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1272                 wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT;
1273                 kern_uuidgen(&wipdata->pfs_fsid, 1);
1274
1275                 /*
1276                  * Give the snapshot its own private cluster.  As a snapshot
1277                  * no further synchronization with the original cluster will
1278                  * be done.
1279                  */
1280 #if 0
1281                 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1282                         wipdata->pfs_clid = opfs_clid;
1283                 else
1284                         kern_uuidgen(&wipdata->pfs_clid, 1);
1285 #endif
1286                 kern_uuidgen(&wipdata->pfs_clid, 1);
1287
1288                 for (i = 0; i < ncluster->nchains; ++i) {
1289                         nchain = ncluster->array[i].chain;
1290                         if (nchain)
1291                                 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1292                 }
1293 #if 0
1294                 /* XXX can't set this unless we do an explicit flush, which
1295                    we also need a pmp assigned to do, else the flush code
1296                    won't flush ncluster because it thinks it is crossing a
1297                    flush boundary */
1298                 hammer2_cluster_set_chainflags(ncluster,
1299                                                HAMMER2_CHAIN_PFSBOUNDARY);
1300 #endif
1301
1302                 /* XXX hack blockset copy */
1303                 /* XXX doesn't work with real cluster */
1304                 KKASSERT(ocluster->nchains == 1);
1305                 wipdata->u.blockset = ripdata->u.blockset;
1306                 hammer2_cluster_modsync(ncluster);
1307                 for (i = 0; i < ncluster->nchains; ++i) {
1308                         nchain = ncluster->array[i].chain;
1309                         if (nchain)
1310                                 hammer2_flush(trans, nchain);
1311                 }
1312                 hammer2_inode_unlock(nip, ncluster);
1313         }
1314         return (error);
1315 }
1316
1317 /*
1318  * Return locked parent cluster given a locked child.  The child remains
1319  * locked on return.  The new parent's focus follows the child's focus
1320  * and the parent is always resolved.
1321  */
1322 hammer2_cluster_t *
1323 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1324 {
1325         hammer2_cluster_t *cparent;
1326         int i;
1327
1328         cparent = hammer2_cluster_copy(cluster);
1329
1330         for (i = 0; i < cparent->nchains; ++i) {
1331                 hammer2_chain_t *chain;
1332                 hammer2_chain_t *rchain;
1333
1334                 /*
1335                  * Calculate parent for each element.  Old chain has an extra
1336                  * ref for cparent but the lock remains with cluster.
1337                  */
1338                 chain = cparent->array[i].chain;
1339                 if (chain == NULL)
1340                         continue;
1341                 while ((rchain = chain->parent) != NULL) {
1342                         hammer2_chain_ref(rchain);
1343                         hammer2_chain_unlock(chain);
1344                         hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1345                         hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1346                         hammer2_chain_drop(rchain);
1347                         if (chain->parent == rchain)
1348                                 break;
1349                         hammer2_chain_unlock(rchain);
1350                 }
1351                 if (cluster->focus == chain)
1352                         cparent->focus = rchain;
1353                 cparent->array[i].chain = rchain;
1354                 hammer2_chain_drop(chain);
1355         }
1356         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1357         hammer2_cluster_resolve(cparent);
1358
1359         return cparent;
1360 }
1361
1362 /************************************************************************
1363  *                              CLUSTER I/O                             *
1364  ************************************************************************
1365  *
1366  *
1367  * WARNING! blockref[] array data is not universal.  These functions should
1368  *          only be used to access universal data.
1369  *
1370  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1371  *          complete if necessary.  The I/O's should have already been
1372  *          initiated by the cluster_lock/chain_lock operation.
1373  *
1374  *          The cluster must already be in a modified state before wdata
1375  *          is called.  The data will already be available for this case.
1376  */
1377 const hammer2_media_data_t *
1378 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1379 {
1380         return(cluster->focus->data);
1381 }
1382
1383 hammer2_media_data_t *
1384 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1385 {
1386         KKASSERT(hammer2_cluster_modified(cluster));
1387         return(cluster->focus->data);
1388 }
1389
1390 /*
1391  * Load cluster data asynchronously with callback.
1392  *
1393  * The callback is made for the first validated data found, or NULL
1394  * if no valid data is available.
1395  *
1396  * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1397  *       in the inode with an exclusive lock held), the chain structure may be
1398  *       shared.
1399  */
1400 void
1401 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1402                            void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1403 {
1404         hammer2_chain_t *chain;
1405         hammer2_iocb_t *iocb;
1406         hammer2_dev_t *hmp;
1407         hammer2_blockref_t *bref;
1408         int i;
1409
1410         /*
1411          * Try to find a chain whos data is already resolved.  If none can
1412          * be found, start with the first chain.
1413          */
1414         chain = NULL;
1415         for (i = 0; i < cluster->nchains; ++i) {
1416                 chain = cluster->array[i].chain;
1417                 if (chain && chain->data)
1418                         break;
1419         }
1420         if (i == cluster->nchains) {
1421                 chain = cluster->array[0].chain;
1422                 i = 0;
1423         }
1424
1425         iocb = &cluster->iocb;
1426         iocb->callback = callback;
1427         iocb->dio = NULL;               /* for already-validated case */
1428         iocb->cluster = cluster;
1429         iocb->chain = chain;
1430         iocb->ptr = ptr;
1431         iocb->lbase = (off_t)i;
1432         iocb->flags = 0;
1433         iocb->error = 0;
1434
1435         /*
1436          * Data already validated
1437          */
1438         if (chain->data) {
1439                 callback(iocb);
1440                 return;
1441         }
1442
1443         /*
1444          * We must resolve to a device buffer, either by issuing I/O or
1445          * by creating a zero-fill element.  We do not mark the buffer
1446          * dirty when creating a zero-fill element (the hammer2_chain_modify()
1447          * API must still be used to do that).
1448          *
1449          * The device buffer is variable-sized in powers of 2 down
1450          * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
1451          * chunk always contains buffers of the same size. (XXX)
1452          *
1453          * The minimum physical IO size may be larger than the variable
1454          * block size.
1455          *
1456          * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
1457          *            matches hammer2_devblksize()?  Or does the freemap's
1458          *            pre-zeroing handle the case for us?
1459          */
1460         bref = &chain->bref;
1461         hmp = chain->hmp;
1462
1463 #if 0
1464         /* handled by callback? <- TODO XXX even needed for loads? */
1465         /*
1466          * The getblk() optimization for a 100% overwrite can only be used
1467          * if the physical block size matches the request.
1468          */
1469         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1470             chain->bytes == hammer2_devblksize(chain->bytes)) {
1471                 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1472                 KKASSERT(error == 0);
1473                 iocb->dio = dio;
1474                 callback(iocb);
1475                 return;
1476         }
1477 #endif
1478
1479         /*
1480          * Otherwise issue a read
1481          */
1482         hammer2_adjreadcounter(&chain->bref, chain->bytes);
1483         hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
1484 }
1485
1486 /************************************************************************
1487  *                          NODE FAILURES                               *
1488  ************************************************************************
1489  *
1490  * A node failure can occur for numerous reasons.
1491  *
1492  *      - A read I/O may fail
1493  *      - A write I/O may fail
1494  *      - An unexpected chain might be found (or be missing)
1495  *      - A node might disconnect temporarily and reconnect later
1496  *        (for example, a USB stick could get pulled, or a node might
1497  *        be programmatically disconnected).
1498  *      - A node might run out of space during a modifying operation.
1499  *
1500  * When a read failure or an unexpected chain state is found, the chain and
1501  * parent chain at the failure point for the nodes involved (the nodes
1502  * which we determine to be in error) are flagged as failed and removed
1503  * from the cluster.  The node itself is allowed to remain active.  The
1504  * highest common point (usually a parent chain) is queued to the
1505  * resynchronization thread for action.
1506  *
1507  * When a write I/O fails or a node runs out of space, we first adjust
1508  * as if a read failure occurs but we further disable flushes on the
1509  * ENTIRE node.  Concurrent modifying transactions are allowed to complete
1510  * but any new modifying transactions will automatically remove the node
1511  * from consideration in all related cluster structures and not generate
1512  * any new modified chains.  The ROOT chain for the failed node(s) is queued
1513  * to the resynchronization thread for action.
1514  *
1515  * A temporary disconnect is handled as if a write failure occurred.
1516  *
1517  * Any of these failures might or might not stall related high level VNOPS,
1518  * depending on what has failed, what nodes remain, the type of cluster,
1519  * and the operating state of the cluster.
1520  *
1521  *                          FLUSH ON WRITE-DISABLED NODES
1522  *
1523  * A flush on a write-disabled node is not allowed to write anything because
1524  * we cannot safely update the mirror_tid anywhere on the failed node.  The
1525  * synchronization thread uses mirror_tid to calculate incremental resyncs.
1526  * Dirty meta-data related to the failed node is thrown away.
1527  *
1528  * Dirty buffer cache buffers and inodes are only thrown away if they can be
1529  * retired... that is, if the filesystem still has enough nodes to complete
1530  * the operation.
1531  */
1532
1533 /************************************************************************
1534  *                      SYNCHRONIZATION THREAD                          *
1535  ************************************************************************
1536  *
1537  * This thread is responsible for [re]synchronizing the cluster representing
1538  * a PFS.  Any out-of-sync or failed node starts this thread on a
1539  * node-by-node basis when the failure is detected.
1540  *
1541  * Clusters needing resynchronization are queued at the highest point
1542  * where the parent on the failed node is still valid, or a special
1543  * incremental scan from the ROOT is queued if no parent exists.  This
1544  * thread is also responsible for waiting for reconnections of the failed
1545  * node if the cause was due to a disconnect, and waiting for space to be
1546  * freed up if the cause was due to running out of space.
1547  *
1548  * If the cause is due to a node running out of space, this thread will also
1549  * remove older (unlocked) snapshots to make new space, recover space, and
1550  * then start resynchronization.
1551  *
1552  * Each resynchronization pass virtually snapshots the PFS on the good nodes
1553  * and synchronizes using that snapshot against the target node.  This
1554  * ensures a consistent chain topology and also avoids interference between
1555  * the resynchronization thread and frontend operations.
1556  *
1557  * Since these are per-node threads it is possible to resynchronize several
1558  * nodes at once.
1559  */