hammer2 - slave sync adjustments, doc update
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *          locks and I/O, do quorum and/or master-slave processing, and
50  *          it must operate properly even if some nodes are broken (which
51  *          can also mean indefinite locks).
52  *
53  *                              CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *              hammer2_cluster_lock()
59  *              hammer2_cluster_parent()
60  *
61  *      - Most complex functions, quorum management on transaction ids.
62  *
63  *      - Locking and data accesses must be internally asynchronous.
64  *
65  *      - Validate and manage cache coherency primitives (cache state
66  *        is stored in chain topologies but must be validated by these
67  *        functions).
68  *
69  * (2) Lookups and Scans
70  *              hammer2_cluster_lookup()
71  *              hammer2_cluster_next()
72  *
73  *      - Depend on locking & data retrieval functions, but still complex.
74  *
75  *      - Must do quorum management on transaction ids.
76  *
77  *      - Lookup and iteration ops must be internally asynchronous.
78  *
79  * (3) Modifying Operations
80  *              hammer2_cluster_create()
81  *              hammer2_cluster_rename()
82  *              hammer2_cluster_delete()
83  *              hammer2_cluster_modify()
84  *              hammer2_cluster_modsync()
85  *
86  *      - Can usually punt on failures, operation continues unless quorum
87  *        is lost.  If quorum is lost, must wait for resynchronization
88  *        (depending on the management mode).
89  *
90  *      - Must disconnect node on failures (also not flush), remount, and
91  *        resynchronize.
92  *
93  *      - Network links (via kdmsg) are relatively easy to issue as the
94  *        complex underworkings of hammer2_chain.c don't have to be messed
95  *        with (the protocol is at a higher level than block-level).
96  *
97  *      - Multiple local disk nodes (i.e. block devices) are another matter.
98  *        Chain operations have to be dispatched to per-node threads (xN)
99  *        because we can't asynchronize potentially very complex chain
100  *        operations in hammer2_chain.c (it would be a huge mess).
101  *
102  *        (these threads are also used to terminate incoming kdmsg ops from
103  *        other machines).
104  *
105  *      - Single-node filesystems do not use threads and will simply call
106  *        hammer2_chain.c functions directly.  This short-cut is handled
107  *        at the base of each cluster function.
108  */
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
115
116 #include "hammer2.h"
117
118 /*
119  * Returns non-zero if any chain in the cluster needs to be resized.
120  * Errored elements are not used in the calculation.
121  */
122 int
123 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
124 {
125         hammer2_chain_t *chain;
126         int i;
127
128         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
129         for (i = 0; i < cluster->nchains; ++i) {
130                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
131                         continue;
132                 chain = cluster->array[i].chain;
133                 if (chain == NULL)
134                         continue;
135                 if (chain->error)
136                         continue;
137                 if (chain->bytes != bytes)
138                         return 1;
139         }
140         return 0;
141 }
142
143 /*
144  * Returns the bref type of the cluster's foucs.
145  *
146  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
147  * The cluster must be locked.
148  */
149 uint8_t
150 hammer2_cluster_type(hammer2_cluster_t *cluster)
151 {
152         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
153         if (cluster->error == 0)
154                 return(cluster->focus->bref.type);
155         return 0;
156 }
157
158 /*
159  * Returns non-zero if the cluster's focus is flagged as being modified.
160  *
161  * If the cluster is errored, returns 0.
162  */
163 int
164 hammer2_cluster_modified(hammer2_cluster_t *cluster)
165 {
166         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
167         if (cluster->error == 0)
168                 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
169         return 0;
170 }
171
172 /*
173  * Returns the bref of the cluster's focus, sans any data-offset information
174  * (since offset information is per-node and wouldn't be useful).
175  *
176  * Callers use this function to access modify_tid, mirror_tid, type,
177  * key, and keybits.
178  *
179  * If the cluster is errored, returns an empty bref.
180  * The cluster must be locked.
181  */
182 void
183 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
184 {
185         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
186         if (cluster->error == 0) {
187                 *bref = cluster->focus->bref;
188                 bref->data_off = 0;
189         } else {
190                 bzero(bref, sizeof(*bref));
191         }
192 }
193
194 /*
195  * Return non-zero if the chain representing an inode has been flagged
196  * as having been unlinked.  Allows the vnode reclaim to avoid loading
197  * the inode data from disk e.g. when unmount or recycling old, clean
198  * vnodes.
199  *
200  * The cluster does not need to be locked.
201  * The focus cannot be used since the cluster might not be locked.
202  */
203 int
204 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster)
205 {
206         hammer2_chain_t *chain;
207         int flags;
208         int i;
209
210         flags = 0;
211         for (i = 0; i < cluster->nchains; ++i) {
212                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
213                         continue;
214                 chain = cluster->array[i].chain;
215                 if (chain)
216                         flags |= chain->flags;
217         }
218         return (flags & HAMMER2_CHAIN_UNLINKED);
219 }
220
221 /*
222  * Set a bitmask of flags in all chains related to a cluster.
223  * The cluster should probably be locked.
224  *
225  * XXX Only operate on FEMOD elements?
226  */
227 void
228 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
229 {
230         hammer2_chain_t *chain;
231         int i;
232
233         for (i = 0; i < cluster->nchains; ++i) {
234                 chain = cluster->array[i].chain;
235                 if (chain)
236                         atomic_set_int(&chain->flags, flags);
237         }
238 }
239
240 /*
241  * Set a bitmask of flags in all chains related to a cluster.
242  * The cluster should probably be locked.
243  *
244  * XXX Only operate on FEMOD elements?
245  */
246 void
247 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
248 {
249         hammer2_chain_t *chain;
250         int i;
251
252         for (i = 0; i < cluster->nchains; ++i) {
253                 chain = cluster->array[i].chain;
254                 if (chain)
255                         atomic_clear_int(&chain->flags, flags);
256         }
257 }
258
259 /*
260  * Flag the cluster for flushing recursively up to the root.  Despite the
261  * work it does, this is relatively benign.  It just makes sure that the
262  * flusher has top-down visibility to this cluster.
263  *
264  * Errored chains are not flagged for flushing.
265  *
266  * The cluster should probably be locked.
267  */
268 void
269 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
270 {
271         hammer2_chain_t *chain;
272         int i;
273
274         for (i = 0; i < cluster->nchains; ++i) {
275                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
276                         continue;
277                 chain = cluster->array[i].chain;
278                 if (chain == NULL)
279                         continue;
280                 if (chain->error)
281                         continue;
282                 hammer2_chain_setflush(trans, chain);
283         }
284 }
285
286 /*
287  * Set the check mode for the cluster.
288  * Errored elements of the cluster are ignored.
289  *
290  * The cluster must be locked and modified.
291  */
292 void
293 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
294                                 hammer2_cluster_t *cluster,
295                                 int check_algo)
296 {
297         hammer2_chain_t *chain;
298         int i;
299
300         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
301         for (i = 0; i < cluster->nchains; ++i) {
302                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
303                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
304                         continue;
305                 }
306                 chain = cluster->array[i].chain;
307                 if (chain == NULL)
308                         continue;
309                 if (chain->error)
310                         continue;
311                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
312                 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
313                 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
314         }
315 }
316
317 /*
318  * Create a degenerate cluster with one ref from a single locked chain.
319  * The returned cluster will be focused on the chain and inherit its
320  * error state.
321  *
322  * The chain's lock and reference are transfered to the new cluster, so
323  * the caller should not try to unlock the chain separately.
324  *
325  * We fake the flags.
326  */
327 hammer2_cluster_t *
328 hammer2_cluster_from_chain(hammer2_chain_t *chain)
329 {
330         hammer2_cluster_t *cluster;
331
332         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
333         cluster->array[0].chain = chain;
334         cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
335         cluster->nchains = 1;
336         cluster->focus = chain;
337         cluster->focus_index = 0;
338         cluster->pmp = chain->pmp;
339         cluster->refs = 1;
340         cluster->error = chain->error;
341         cluster->flags = HAMMER2_CLUSTER_LOCKED |
342                          HAMMER2_CLUSTER_WRHARD |
343                          HAMMER2_CLUSTER_RDHARD |
344                          HAMMER2_CLUSTER_MSYNCED |
345                          HAMMER2_CLUSTER_SSYNCED;
346
347         return cluster;
348 }
349
350 /*
351  * Add a reference to a cluster and its underlying chains.
352  *
353  * We must also ref the underlying chains in order to allow ref/unlock
354  * sequences to later re-lock.
355  */
356 void
357 hammer2_cluster_ref(hammer2_cluster_t *cluster)
358 {
359         atomic_add_int(&cluster->refs, 1);
360 #if 0
361         hammer2_chain_t *chain;
362         int i;
363
364         for (i = 0; i < cluster->nchains; ++i) {
365                 chain = cluster->array[i].chain;
366                 if (chain)
367                         hammer2_chain_ref(chain);
368         }
369 #endif
370 }
371
372 /*
373  * Drop the caller's reference to the cluster.  When the ref count drops to
374  * zero this function frees the cluster and drops all underlying chains.
375  *
376  * In-progress read I/Os are typically detached from the cluster once the
377  * first one returns (the remaining stay attached to the DIOs but are then
378  * ignored and drop naturally).
379  */
380 void
381 hammer2_cluster_drop(hammer2_cluster_t *cluster)
382 {
383         hammer2_chain_t *chain;
384         int i;
385
386         KKASSERT(cluster->refs > 0);
387         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
388                 cluster->focus = NULL;          /* safety XXX chg to assert */
389                 cluster->focus_index = 0;
390
391                 for (i = 0; i < cluster->nchains; ++i) {
392                         chain = cluster->array[i].chain;
393                         if (chain) {
394                                 hammer2_chain_drop(chain);
395                                 cluster->array[i].chain = NULL; /* safety */
396                         }
397                 }
398                 cluster->nchains = 0;                           /* safety */
399
400                 kfree(cluster, M_HAMMER2);
401                 /* cluster is invalid */
402         }
403 }
404
405 void
406 hammer2_cluster_wait(hammer2_cluster_t *cluster)
407 {
408         tsleep(cluster->focus, 0, "h2clcw", 1);
409 }
410
411 /*
412  * Lock and ref a cluster.  This adds a ref to the cluster and its chains
413  * and then locks them, modified by various RESOLVE flags.
414  *
415  * The act of locking a cluster sets its focus.  Note that cluster elements
416  * flagged with HAMMER2_CITEM_INVALID cannot be set as a focus.  Locking a
417  * cluster does not adjust this flag since exact matches only matter for leafs
418  * (parents can depend on minor differences in topology).
419  *
420  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
421  * operations.  Typically this is only set on a quorum of MASTERs or
422  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
423  * is present, this bit is *not* set on a quorum of MASTERs.  The
424  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
425  * that create/modify/delete elements use it.
426  *
427  * The chains making up the cluster may be narrowed down based on quorum
428  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
429  * narrowed down to a single chain as long as the entire subtopology is known
430  * to be intact.  So, for example, we can narrow a read-only op to a single
431  * fast SLAVE but if we focus a CACHE chain we must still retain at least
432  * a SLAVE to ensure that the subtopology can be accessed.
433  *
434  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
435  * to be maintained once the topology is validated as-of the top level of
436  * the operation.
437  *
438  * If a failure occurs the operation must be aborted by higher-level code and
439  * retried. XXX
440  */
441 void
442 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
443 {
444         hammer2_chain_t *chain;
445         int i;
446
447         /* cannot be on inode-embedded cluster template, must be on copy */
448         KKASSERT(cluster->refs > 0);
449         KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
450         if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
451                 panic("hammer2_cluster_lock: cluster %p already locked!\n",
452                         cluster);
453         } else {
454                 KKASSERT(cluster->focus == NULL);
455         }
456         atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
457
458         /*
459          * Lock chains and resolve state.
460          */
461         for (i = 0; i < cluster->nchains; ++i) {
462                 chain = cluster->array[i].chain;
463                 if (chain == NULL)
464                         continue;
465                 hammer2_chain_lock(chain, how);
466         }
467
468         hammer2_cluster_resolve(cluster);
469 }
470
471 void
472 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
473 {
474         hammer2_chain_t *chain;
475         hammer2_pfs_t *pmp;
476         hammer2_tid_t quorum_tid;
477         int focus_pfs_type;
478         uint32_t nflags;
479         int ttlmasters;
480         int ttlslaves;
481         int nmasters;
482         int nslaves;
483         int nquorum;
484         int smpresent;
485         int i;
486
487         cluster->error = 0;
488
489         quorum_tid = 0;
490         focus_pfs_type = 0;
491         nflags = 0;
492         ttlmasters = 0;
493         ttlslaves = 0;
494         nmasters = 0;
495         nslaves = 0;
496
497         /*
498          * Calculate quorum
499          */
500         pmp = cluster->pmp;
501         KKASSERT(pmp != NULL || cluster->nchains == 0);
502         nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
503         smpresent = 0;
504
505         /*
506          * Pass 1
507          */
508         for (i = 0; i < cluster->nchains; ++i) {
509                 chain = cluster->array[i].chain;
510                 if (chain == NULL)
511                         continue;
512                 if (chain->error) {
513                         if (cluster->focus == NULL || cluster->focus == chain) {
514                                 /* error will be overridden by valid focus */
515                                 cluster->error = chain->error;
516                         }
517
518                         /*
519                          * Must count total masters and slaves whether the
520                          * chain is errored or not.
521                          */
522                         switch (cluster->pmp->pfs_types[i]) {
523                         case HAMMER2_PFSTYPE_MASTER:
524                                 ++ttlmasters;
525                                 break;
526                         case HAMMER2_PFSTYPE_SLAVE:
527                                 ++ttlslaves;
528                                 break;
529                         }
530                         continue;
531                 }
532                 switch (cluster->pmp->pfs_types[i]) {
533                 case HAMMER2_PFSTYPE_MASTER:
534                         ++ttlmasters;
535                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
536                                 /*
537                                  * Invalid as in unsynchronized, cannot be
538                                  * used to calculate the quorum.
539                                  */
540                         } else if (quorum_tid < chain->bref.modify_tid ||
541                                    nmasters == 0) {
542                                 nmasters = 1;
543                                 quorum_tid = chain->bref.modify_tid;
544                         } else if (quorum_tid == chain->bref.modify_tid) {
545                                 ++nmasters;
546                         }
547                         break;
548                 case HAMMER2_PFSTYPE_SLAVE:
549                         ++ttlslaves;
550                         break;
551                 case HAMMER2_PFSTYPE_SOFT_MASTER:
552                         nflags |= HAMMER2_CLUSTER_WRSOFT;
553                         nflags |= HAMMER2_CLUSTER_RDSOFT;
554                         smpresent = 1;
555                         break;
556                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
557                         nflags |= HAMMER2_CLUSTER_RDSOFT;
558                         break;
559                 case HAMMER2_PFSTYPE_SUPROOT:
560                         /*
561                          * Degenerate cluster representing the super-root
562                          * topology on a single device.  Fake stuff so
563                          * cluster ops work as expected.
564                          */
565                         nflags |= HAMMER2_CLUSTER_WRHARD;
566                         nflags |= HAMMER2_CLUSTER_RDHARD;
567                         cluster->focus_index = i;
568                         cluster->focus = chain;
569                         cluster->error = chain->error;
570                         break;
571                 default:
572                         break;
573                 }
574         }
575
576         /*
577          * Pass 2
578          */
579         for (i = 0; i < cluster->nchains; ++i) {
580                 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
581                 chain = cluster->array[i].chain;
582                 if (chain == NULL)
583                         continue;
584                 if (chain->error) {
585                         if (cluster->focus == NULL || cluster->focus == chain) {
586                                 /* error will be overridden by valid focus */
587                                 cluster->error = chain->error;
588                         }
589                         continue;
590                 }
591
592                 switch (cluster->pmp->pfs_types[i]) {
593                 case HAMMER2_PFSTYPE_MASTER:
594                         /*
595                          * We must have enough up-to-date masters to reach
596                          * a quorum and the master modify_tid must match
597                          * the quorum's modify_tid.
598                          *
599                          * Do not select an errored or out-of-sync master.
600                          */
601                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
602                                 nflags |= HAMMER2_CLUSTER_UNHARD;
603                         } else if (nmasters >= nquorum &&
604                                    chain->error == 0 &&
605                                    quorum_tid == chain->bref.modify_tid) {
606                                 nflags |= HAMMER2_CLUSTER_WRHARD;
607                                 nflags |= HAMMER2_CLUSTER_RDHARD;
608                                 if (!smpresent) {
609                                         cluster->array[i].flags |=
610                                                         HAMMER2_CITEM_FEMOD;
611                                 }
612                                 if (cluster->focus == NULL ||
613                                     focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
614                                         focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
615                                         cluster->focus_index = i;
616                                         cluster->focus = chain;
617                                         cluster->error = chain->error;
618                                 }
619                         } else if (chain->error == 0) {
620                                 nflags |= HAMMER2_CLUSTER_UNHARD;
621                         }
622                         break;
623                 case HAMMER2_PFSTYPE_SLAVE:
624                         /*
625                          * We must have enough up-to-date masters to reach
626                          * a quorum and the slave modify_tid must match the
627                          * quorum's modify_tid.
628                          *
629                          * Do not select an errored slave.
630                          */
631                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
632                                 nflags |= HAMMER2_CLUSTER_UNHARD;
633                         } else if (nmasters >= nquorum &&
634                                    chain->error == 0 &&
635                                    quorum_tid == chain->bref.modify_tid) {
636                                 ++nslaves;
637                                 nflags |= HAMMER2_CLUSTER_RDHARD;
638                                 if (cluster->focus == NULL) {
639                                         focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
640                                         cluster->focus_index = i;
641                                         cluster->focus = chain;
642                                         cluster->error = chain->error;
643                                 }
644                         } else if (chain->error == 0) {
645                                 nflags |= HAMMER2_CLUSTER_UNSOFT;
646                         }
647                         break;
648                 case HAMMER2_PFSTYPE_SOFT_MASTER:
649                         /*
650                          * Directly mounted soft master always wins.  There
651                          * should be only one.
652                          */
653                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
654                         cluster->focus_index = i;
655                         cluster->focus = chain;
656                         cluster->error = chain->error;
657                         focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
658                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
659                         break;
660                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
661                         /*
662                          * Directly mounted soft slave always wins.  There
663                          * should be only one.
664                          */
665                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
666                         if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
667                                 cluster->focus_index = i;
668                                 cluster->focus = chain;
669                                 cluster->error = chain->error;
670                                 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
671                         }
672                         break;
673                 case HAMMER2_PFSTYPE_SUPROOT:
674                         /*
675                          * spmp (degenerate case)
676                          */
677                         KKASSERT(i == 0);
678                         cluster->focus_index = i;
679                         cluster->focus = chain;
680                         cluster->error = chain->error;
681                         focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
682                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
683                         break;
684                 default:
685                         break;
686                 }
687         }
688
689         if (ttlslaves == 0)
690                 nflags |= HAMMER2_CLUSTER_NOSOFT;
691         if (ttlmasters == 0)
692                 nflags |= HAMMER2_CLUSTER_NOHARD;
693
694         /*
695          * Set SSYNCED or MSYNCED for slaves and masters respectively if
696          * all available nodes (even if 0 are available) are fully
697          * synchronized.  This is used by the synchronization thread to
698          * determine if there is work it could potentially accomplish.
699          */
700         if (nslaves == ttlslaves)
701                 nflags |= HAMMER2_CLUSTER_SSYNCED;
702         if (nmasters == ttlmasters)
703                 nflags |= HAMMER2_CLUSTER_MSYNCED;
704
705         /*
706          * Determine if the cluster was successfully locked for the
707          * requested operation and generate an error code.  The cluster
708          * will not be locked (or ref'd) if an error is returned.
709          *
710          * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
711          * to determine if reading or writing is possible.  If writing, the
712          * cluster still requires a call to hammer2_cluster_modify() first.
713          */
714         atomic_set_int(&cluster->flags, nflags);
715         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
716 }
717
718 /*
719  * Copy a cluster, returned a ref'd cluster.  All underlying chains
720  * are also ref'd, but not locked.
721  *
722  * The cluster focus is not set because the cluster is not yet locked
723  * (and the originating cluster does not have to be locked either).
724  */
725 hammer2_cluster_t *
726 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
727 {
728         hammer2_pfs_t *pmp = ocluster->pmp;
729         hammer2_cluster_t *ncluster;
730         hammer2_chain_t *chain;
731         int i;
732
733         ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
734         ncluster->pmp = pmp;
735         ncluster->nchains = ocluster->nchains;
736         ncluster->refs = 1;
737         ncluster->flags = 0;    /* cluster not locked */
738
739         for (i = 0; i < ocluster->nchains; ++i) {
740                 chain = ocluster->array[i].chain;
741                 ncluster->array[i].chain = chain;
742                 if (chain)
743                         hammer2_chain_ref(chain);
744         }
745         return (ncluster);
746 }
747
748 /*
749  * Unlock and deref a cluster.  The cluster is destroyed if this is the
750  * last ref.
751  */
752 void
753 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
754 {
755         hammer2_chain_t *chain;
756         int i;
757
758         if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
759                 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
760                         cluster);
761         }
762         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
763         KKASSERT(cluster->refs > 0);
764         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
765
766         for (i = 0; i < cluster->nchains; ++i) {
767                 chain = cluster->array[i].chain;
768                 if (chain)
769                         hammer2_chain_unlock(chain);
770         }
771         cluster->focus_index = 0;
772         cluster->focus = NULL;
773 }
774
775 /*
776  * Resize the cluster's physical storage allocation in-place.  This may
777  * replace the cluster's chains.
778  */
779 void
780 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
781                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
782                        int nradix, int flags)
783 {
784         hammer2_chain_t *chain;
785         int i;
786
787         KKASSERT(cparent->pmp == cluster->pmp);         /* can be NULL */
788         KKASSERT(cparent->nchains == cluster->nchains);
789
790         for (i = 0; i < cluster->nchains; ++i) {
791                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
792                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
793                         continue;
794                 }
795                 chain = cluster->array[i].chain;
796                 if (chain) {
797                         KKASSERT(cparent->array[i].chain);
798                         hammer2_chain_resize(trans, ip,
799                                              cparent->array[i].chain, chain,
800                                              nradix, flags);
801                 }
802         }
803 }
804
805 /*
806  * Set an inode's cluster modified, marking the related chains RW and
807  * duplicating them if necessary.
808  *
809  * The passed-in chain is a localized copy of the chain previously acquired
810  * when the inode was locked (and possilby replaced in the mean time), and
811  * must also be updated.  In fact, we update it first and then synchronize
812  * the inode's cluster cache.
813  */
814 hammer2_inode_data_t *
815 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
816                           hammer2_cluster_t *cluster, int flags)
817 {
818         atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
819         hammer2_cluster_modify(trans, cluster, flags);
820
821         hammer2_inode_repoint(ip, NULL, cluster);
822         if (ip->vp)
823                 vsetisdirty(ip->vp);
824         return (&hammer2_cluster_wdata(cluster)->ipdata);
825 }
826
827 /*
828  * Adjust the cluster's chains to allow modification and adjust the
829  * focus.  Data will be accessible on return.
830  *
831  * If our focused master errors on modify, re-resolve the cluster to
832  * try to select a different master.
833  */
834 void
835 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
836                        int flags)
837 {
838         hammer2_chain_t *chain;
839         int resolve_again;
840         int i;
841
842         resolve_again = 0;
843         for (i = 0; i < cluster->nchains; ++i) {
844                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
845                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
846                         continue;
847                 }
848                 chain = cluster->array[i].chain;
849                 if (chain == NULL)
850                         continue;
851                 if (chain->error)
852                         continue;
853                 hammer2_chain_modify(trans, chain, flags);
854                 if (cluster->focus == chain && chain->error) {
855                         cluster->error = chain->error;
856                         resolve_again = 1;
857                 }
858         }
859         if (resolve_again)
860                 hammer2_cluster_resolve(cluster);
861 }
862
863 /*
864  * Synchronize modifications from the focus to other chains in a cluster.
865  * Convenient because nominal API users can just modify the contents of the
866  * focus (at least for non-blockref data).
867  *
868  * Nominal front-end operations only edit non-block-table data in a single
869  * chain.  This code copies such modifications to the other chains in the
870  * cluster.  Blocktable modifications are handled on a chain-by-chain basis
871  * by both the frontend and the backend and will explode in fireworks if
872  * blindly copied.
873  */
874 void
875 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
876 {
877         hammer2_chain_t *focus;
878         hammer2_chain_t *scan;
879         const hammer2_inode_data_t *ripdata;
880         hammer2_inode_data_t *wipdata;
881         int i;
882
883         focus = cluster->focus;
884         KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
885
886         for (i = 0; i < cluster->nchains; ++i) {
887                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
888                         continue;
889                 scan = cluster->array[i].chain;
890                 if (scan == NULL || scan == focus)
891                         continue;
892                 if (scan->error)
893                         continue;
894                 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
895                 KKASSERT(focus->bytes == scan->bytes &&
896                          focus->bref.type == scan->bref.type);
897                 switch(focus->bref.type) {
898                 case HAMMER2_BREF_TYPE_INODE:
899                         ripdata = &focus->data->ipdata;
900                         wipdata = &scan->data->ipdata;
901                         if ((ripdata->op_flags &
902                             HAMMER2_OPFLAG_DIRECTDATA) == 0) {
903                                 bcopy(ripdata, wipdata,
904                                       offsetof(hammer2_inode_data_t, u));
905                                 break;
906                         }
907                         /* fall through to full copy */
908                 case HAMMER2_BREF_TYPE_DATA:
909                         bcopy(focus->data, scan->data, focus->bytes);
910                         break;
911                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
912                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
913                 case HAMMER2_BREF_TYPE_FREEMAP:
914                 case HAMMER2_BREF_TYPE_VOLUME:
915                         panic("hammer2_cluster_modsync: illegal node type");
916                         /* NOT REACHED */
917                         break;
918                 default:
919                         panic("hammer2_cluster_modsync: unknown node type");
920                         break;
921                 }
922         }
923 }
924
925 /*
926  * Lookup initialization/completion API.  Returns a locked cluster with 1 ref.
927  */
928 hammer2_cluster_t *
929 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
930 {
931         hammer2_cluster_t *cluster;
932
933         cluster = hammer2_cluster_copy(cparent);
934         if (flags & HAMMER2_LOOKUP_SHARED) {
935                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
936                                               HAMMER2_RESOLVE_SHARED);
937         } else {
938                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
939         }
940         return (cluster);
941 }
942
943 void
944 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
945 {
946         if (cparent) {
947                 hammer2_cluster_unlock(cparent);
948                 hammer2_cluster_drop(cparent);
949         }
950 }
951
952 /*
953  * Locate first match or overlap under parent, return a new cluster
954  */
955 hammer2_cluster_t *
956 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
957                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
958 {
959         hammer2_pfs_t *pmp;
960         hammer2_cluster_t *cluster;
961         hammer2_chain_t *chain;
962         hammer2_chain_t *focus;
963         hammer2_key_t key_accum;
964         hammer2_key_t key_next;
965         int null_count;
966         int i;
967
968         pmp = cparent->pmp;                             /* can be NULL */
969         key_accum = *key_nextp;
970         null_count = 0;
971
972         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
973         cluster->pmp = pmp;                             /* can be NULL */
974         cluster->refs = 1;
975         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
976                 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
977
978         /*
979          * Pass-1, issue lookup and find focus.
980          */
981         for (i = 0; i < cparent->nchains; ++i) {
982                 cluster->array[i].flags = cparent->array[i].flags;
983                 key_next = *key_nextp;
984
985                 /*
986                  * Nothing to base the lookup, or parent was not synchronized.
987                  */
988                 if (cparent->array[i].chain == NULL ||
989                     (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
990                         ++null_count;
991                         continue;
992                 }
993
994                 chain = hammer2_chain_lookup(&cparent->array[i].chain,
995                                              &key_next,
996                                              key_beg, key_end,
997                                              &cparent->array[i].cache_index,
998                                              flags);
999                 cluster->array[i].chain = chain;
1000                 if (chain == NULL) {
1001                         ++null_count;
1002                 } else if (chain->error) {
1003                         /*
1004                          * Leave errored chain in cluster, but it cannot be
1005                          * the cluster's focus.  It is still possible for an
1006                          * error'd chain to be synchronized (since we have
1007                          * the bref), synchronization state will be handled
1008                          * in pass-2.
1009                          */
1010                         ;
1011                 } else if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
1012                         /*
1013                          * Leave unsynchronized chain in cluster, but it cannot
1014                          * be the cluster's focus.
1015                          */
1016                         ;
1017                 } else {
1018                         int ddflag = (chain->bref.type ==
1019                                       HAMMER2_BREF_TYPE_INODE);
1020
1021                         if (cluster->focus == NULL) {
1022                                 cluster->focus_index = i;
1023                                 cluster->focus = chain;
1024                                 cluster->ddflag = ddflag;
1025                         }
1026                         if (cparent->focus == cparent->array[i].chain) {
1027                                 cluster->focus_index = i;
1028                                 cluster->focus = chain;
1029                                 cluster->ddflag = ddflag;
1030                         }
1031                 }
1032                 if (key_accum > key_next)
1033                         key_accum = key_next;
1034         }
1035
1036         /*
1037          * Pass-2 invalidate mismatches
1038          */
1039         focus = cluster->focus;
1040         if (focus == NULL)
1041                 goto done;
1042
1043         for (i = 0; i < cparent->nchains; ++i) {
1044                 int ddflag;
1045
1046                 chain = cluster->array[i].chain;
1047
1048                 if (chain == NULL)
1049                         continue;
1050                 if (chain == focus)
1051                         continue;
1052                 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1053                         continue;
1054
1055                 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1056                 if (chain->bref.type != focus->bref.type ||
1057                     chain->bref.key != focus->bref.key ||
1058                     chain->bref.keybits != focus->bref.keybits ||
1059                     chain->bref.modify_tid != focus->bref.modify_tid ||
1060                     chain->bytes != focus->bytes ||
1061                     ddflag != cluster->ddflag) {
1062                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1063                 }
1064         }
1065
1066         /*
1067          * Resolve cluster flags.  A lookup or locking failure could wind
1068          * up changing the cluster.
1069          */
1070 done:
1071         *key_nextp = key_accum;
1072         cluster->nchains = i;
1073         hammer2_cluster_resolve(cluster);
1074
1075         if (null_count == i) {
1076                 hammer2_cluster_drop(cluster);
1077                 cluster = NULL;
1078         }
1079
1080         return (cluster);
1081 }
1082
1083 /*
1084  * Locate next match or overlap under parent, replace cluster
1085  */
1086 hammer2_cluster_t *
1087 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1088                      hammer2_key_t *key_nextp,
1089                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1090 {
1091         hammer2_chain_t *ochain;
1092         hammer2_chain_t *nchain;
1093         hammer2_chain_t *focus;
1094         hammer2_key_t key_accum;
1095         hammer2_key_t key_next;
1096         int null_count;
1097         int i;
1098
1099         key_accum = *key_nextp;
1100         null_count = 0;
1101         cluster->focus = NULL;
1102         cparent->focus = NULL;
1103         cluster->focus_index = 0;
1104         cparent->focus_index = 0;
1105
1106         cluster->ddflag = 0;
1107
1108         for (i = 0; i < cparent->nchains; ++i) {
1109                 key_next = *key_nextp;
1110                 ochain = cluster->array[i].chain;
1111
1112                 /*
1113                  * Nothing to iterate from.  These cases can occur under
1114                  * normal operations.  For example, during synchronization
1115                  * a slave might reach the end of its scan while records
1116                  * are still left on the master(s).
1117                  */
1118                 if (ochain == NULL) {
1119                         ++null_count;
1120                         continue;
1121                 }
1122                 if (cparent->array[i].chain == NULL ||
1123                     (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1124                     (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
1125                         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1126                                 hammer2_chain_unlock(ochain);
1127                         hammer2_chain_drop(ochain);
1128                         cluster->array[i].chain = NULL;
1129                         ++null_count;
1130                         continue;
1131                 }
1132
1133                 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1134                                             &key_next, key_beg, key_end,
1135                                             &cparent->array[i].cache_index,
1136                                             flags);
1137                 /* ochain now invalid but can still be used for focus check */
1138
1139                 cluster->array[i].chain = nchain;
1140                 if (nchain == NULL) {
1141                         ++null_count;
1142                 } else if (nchain->error) {
1143                         /*
1144                          * Leave errored chain in cluster, but it cannot be
1145                          * the cluster's focus.
1146                          */
1147                         ;
1148                 } else {
1149                         int ddflag = (nchain->bref.type ==
1150                                       HAMMER2_BREF_TYPE_INODE);
1151
1152                         /*
1153                          * Possible new focus.
1154                          */
1155                         if (cluster->focus == NULL) {
1156                                 cluster->ddflag = ddflag;
1157                                 cluster->focus_index = i;
1158                                 cluster->focus = nchain;
1159                         }
1160
1161                         /*
1162                          * Fixup pre-existing focus.
1163                          */
1164                         if (cluster->focus == ochain) {
1165                                 cluster->focus_index = i;
1166                                 cluster->focus = nchain;
1167                         }
1168                 }
1169                 if (key_accum > key_next)
1170                         key_accum = key_next;
1171         }
1172
1173         /*
1174          * Pass-2 invalidate mismatches
1175          */
1176         focus = cluster->focus;
1177         if (focus == NULL)
1178                 goto done;
1179
1180         for (i = 0; i < cparent->nchains; ++i) {
1181                 int ddflag;
1182
1183                 nchain = cluster->array[i].chain;
1184
1185                 if (nchain == NULL)
1186                         continue;
1187                 if (nchain == focus)
1188                         continue;
1189                 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1190                         continue;
1191
1192                 ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1193                 if (nchain->bref.type != focus->bref.type ||
1194                     nchain->bref.key != focus->bref.key ||
1195                     nchain->bref.keybits != focus->bref.keybits ||
1196                     nchain->bref.modify_tid != focus->bref.modify_tid ||
1197                     nchain->bytes != focus->bytes ||
1198                     ddflag != cluster->ddflag) {
1199                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1200                 }
1201         }
1202
1203 done:
1204         *key_nextp = key_accum;
1205         cluster->nchains = i;
1206         hammer2_cluster_resolve(cluster);
1207
1208         if (null_count == i) {
1209                 hammer2_cluster_drop(cluster);
1210                 cluster = NULL;
1211         }
1212         return(cluster);
1213 }
1214
1215 /*
1216  * Advance just one chain in the cluster and recalculate the invalid bit.
1217  * (used during synchronization to advance past a chain being deleted).
1218  *
1219  * The chain being advanced must not be the focus and the clusters in
1220  * question must have already passed normal cluster_lookup/cluster_next
1221  * checks.
1222  *
1223  * The cluster always remains intact on return, so void function.
1224  */
1225 void
1226 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1227                                   hammer2_cluster_t *cluster,
1228                                   hammer2_key_t *key_nextp,
1229                                   hammer2_key_t key_beg,
1230                                   hammer2_key_t key_end,
1231                                   int i, int flags)
1232 {
1233         hammer2_chain_t *ochain;
1234         hammer2_chain_t *nchain;
1235         hammer2_chain_t *focus;
1236         hammer2_key_t key_accum;
1237         hammer2_key_t key_next;
1238         int ddflag;
1239
1240         key_accum = *key_nextp;
1241         key_next = *key_nextp;
1242         ochain = cluster->array[i].chain;
1243         if (ochain == NULL)
1244                 goto done;
1245         KKASSERT(ochain != cluster->focus);
1246
1247         nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1248                                     &key_next, key_beg, key_end,
1249                                     &cparent->array[i].cache_index,
1250                                     flags);
1251         /* ochain now invalid */
1252
1253         /*
1254          * Install nchain.  Note that nchain can be NULL, and can also
1255          * be in an unlocked state depending on flags.
1256          */
1257         cluster->array[i].chain = nchain;
1258         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1259
1260         if (key_accum > key_next)
1261                 key_accum = key_next;
1262
1263         focus = cluster->focus;
1264         if (focus == NULL)
1265                 goto done;
1266         if (nchain == NULL)
1267                 goto done;
1268 #if 0
1269         if (nchain == focus)    /* ASSERTED NOT TRUE */
1270                 ...
1271 #endif
1272         ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1273         if (nchain->bref.type != focus->bref.type ||
1274             nchain->bref.key != focus->bref.key ||
1275             nchain->bref.keybits != focus->bref.keybits ||
1276             nchain->bref.modify_tid != focus->bref.modify_tid ||
1277             nchain->bytes != focus->bytes ||
1278             ddflag != cluster->ddflag) {
1279                 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1280         }
1281
1282 done:
1283         *key_nextp = key_accum;
1284 #if 0
1285         /*
1286          * For now don't re-resolve cluster->flags.
1287          */
1288         hammer2_cluster_resolve(cluster);
1289 #endif
1290 }
1291
1292 /*
1293  * Create a new cluster using the specified key
1294  */
1295 int
1296 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1297                      hammer2_cluster_t **clusterp,
1298                      hammer2_key_t key, int keybits,
1299                      int type, size_t bytes, int flags)
1300 {
1301         hammer2_cluster_t *cluster;
1302         hammer2_pfs_t *pmp;
1303         int error;
1304         int i;
1305
1306         pmp = trans->pmp;                               /* can be NULL */
1307
1308         if ((cluster = *clusterp) == NULL) {
1309                 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1310                                   M_WAITOK | M_ZERO);
1311                 cluster->pmp = pmp;                     /* can be NULL */
1312                 cluster->refs = 1;
1313                 cluster->flags = HAMMER2_CLUSTER_LOCKED;
1314         }
1315         cluster->focus_index = 0;
1316         cluster->focus = NULL;
1317
1318         /*
1319          * NOTE: cluster->array[] entries can initially be NULL.  If
1320          *       *clusterp is supplied, skip NULL entries, otherwise
1321          *       create new chains.
1322          */
1323         for (i = 0; i < cparent->nchains; ++i) {
1324                 if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1325                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1326                         continue;
1327                 }
1328                 if (*clusterp) {
1329                         if ((cluster->array[i].flags &
1330                              HAMMER2_CITEM_FEMOD) == 0) {
1331                                 cluster->array[i].flags |=
1332                                                 HAMMER2_CITEM_INVALID;
1333                                 continue;
1334                         }
1335                         if (cluster->array[i].chain == NULL)
1336                                 continue;
1337                 }
1338                 error = hammer2_chain_create(trans, &cparent->array[i].chain,
1339                                              &cluster->array[i].chain, pmp,
1340                                              key, keybits,
1341                                              type, bytes, flags);
1342                 KKASSERT(error == 0);
1343                 if (cluster->focus == NULL) {
1344                         cluster->focus_index = i;
1345                         cluster->focus = cluster->array[i].chain;
1346                 }
1347                 if (cparent->focus == cparent->array[i].chain) {
1348                         cluster->focus_index = i;
1349                         cluster->focus = cluster->array[i].chain;
1350                 }
1351         }
1352         cluster->nchains = i;
1353         *clusterp = cluster;
1354         hammer2_cluster_resolve(cluster);
1355
1356         return error;
1357 }
1358
1359 /*
1360  * Rename a cluster to a new parent.
1361  *
1362  * WARNING! Any passed-in bref is probaly from hammer2_cluster_bref(),
1363  *          So the data_off field is not relevant.  Only the key and
1364  *          keybits are used.
1365  */
1366 void
1367 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
1368                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1369                        int flags)
1370 {
1371         hammer2_chain_t *chain;
1372         hammer2_blockref_t xbref;
1373         int i;
1374
1375         cluster->focus = NULL;
1376         cparent->focus = NULL;
1377         cluster->focus_index = 0;
1378         cparent->focus_index = 0;
1379
1380         for (i = 0; i < cluster->nchains; ++i) {
1381                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1382                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1383                         continue;
1384                 }
1385                 chain = cluster->array[i].chain;
1386                 if (chain) {
1387                         if (bref) {
1388                                 xbref = chain->bref;
1389                                 xbref.key = bref->key;
1390                                 xbref.keybits = bref->keybits;
1391                                 hammer2_chain_rename(trans, &xbref,
1392                                                      &cparent->array[i].chain,
1393                                                      chain, flags);
1394                         } else {
1395                                 hammer2_chain_rename(trans, NULL,
1396                                                      &cparent->array[i].chain,
1397                                                      chain, flags);
1398                         }
1399                         KKASSERT(cluster->array[i].chain == chain); /*remove*/
1400                 }
1401         }
1402 }
1403
1404 /*
1405  * Mark a cluster deleted
1406  */
1407 void
1408 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1409                        hammer2_cluster_t *cluster, int flags)
1410 {
1411         hammer2_chain_t *chain;
1412         hammer2_chain_t *parent;
1413         int i;
1414
1415         if (cparent == NULL) {
1416                 kprintf("cparent is NULL\n");
1417                 return;
1418         }
1419
1420         for (i = 0; i < cluster->nchains; ++i) {
1421                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1422                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1423                         continue;
1424                 }
1425                 parent = cparent->array[i].chain;
1426                 chain = cluster->array[i].chain;
1427                 if (chain == NULL)
1428                         continue;
1429                 if (chain->parent != parent) {
1430                         kprintf("hammer2_cluster_delete: parent "
1431                                 "mismatch chain=%p parent=%p against=%p\n",
1432                                 chain, chain->parent, parent);
1433                 } else {
1434                         hammer2_chain_delete(trans, parent, chain, flags);
1435                 }
1436         }
1437 }
1438
1439 /*
1440  * Create a snapshot of the specified {parent, ochain} with the specified
1441  * label.  The originating hammer2_inode must be exclusively locked for
1442  * safety.
1443  *
1444  * The ioctl code has already synced the filesystem.
1445  */
1446 int
1447 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1448                        hammer2_ioc_pfs_t *pfs)
1449 {
1450         hammer2_dev_t *hmp;
1451         hammer2_cluster_t *ncluster;
1452         const hammer2_inode_data_t *ripdata;
1453         hammer2_inode_data_t *wipdata;
1454         hammer2_chain_t *nchain;
1455         hammer2_inode_t *nip;
1456         size_t name_len;
1457         hammer2_key_t lhc;
1458         struct vattr vat;
1459 #if 0
1460         uuid_t opfs_clid;
1461 #endif
1462         int error;
1463         int i;
1464
1465         kprintf("snapshot %s\n", pfs->name);
1466
1467         name_len = strlen(pfs->name);
1468         lhc = hammer2_dirhash(pfs->name, name_len);
1469
1470         /*
1471          * Get the clid
1472          */
1473         ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1474 #if 0
1475         opfs_clid = ripdata->pfs_clid;
1476 #endif
1477         hmp = ocluster->focus->hmp;     /* XXX find synchronized local disk */
1478
1479         /*
1480          * Create the snapshot directory under the super-root
1481          *
1482          * Set PFS type, generate a unique filesystem id, and generate
1483          * a cluster id.  Use the same clid when snapshotting a PFS root,
1484          * which theoretically allows the snapshot to be used as part of
1485          * the same cluster (perhaps as a cache).
1486          *
1487          * Copy the (flushed) blockref array.  Theoretically we could use
1488          * chain_duplicate() but it becomes difficult to disentangle
1489          * the shared core so for now just brute-force it.
1490          */
1491         VATTR_NULL(&vat);
1492         vat.va_type = VDIR;
1493         vat.va_mode = 0755;
1494         ncluster = NULL;
1495         nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1496                                    proc0.p_ucred, pfs->name, name_len,
1497                                    &ncluster,
1498                                    HAMMER2_INSERT_PFSROOT, &error);
1499
1500         if (nip) {
1501                 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1502                 wipdata->pfs_type = HAMMER2_PFSTYPE_MASTER;
1503                 wipdata->pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1504                 wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT;
1505                 kern_uuidgen(&wipdata->pfs_fsid, 1);
1506
1507                 /*
1508                  * Give the snapshot its own private cluster.  As a snapshot
1509                  * no further synchronization with the original cluster will
1510                  * be done.
1511                  */
1512 #if 0
1513                 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1514                         wipdata->pfs_clid = opfs_clid;
1515                 else
1516                         kern_uuidgen(&wipdata->pfs_clid, 1);
1517 #endif
1518                 kern_uuidgen(&wipdata->pfs_clid, 1);
1519
1520                 for (i = 0; i < ncluster->nchains; ++i) {
1521                         if ((ncluster->array[i].flags &
1522                              HAMMER2_CITEM_FEMOD) == 0) {
1523                                 ncluster->array[i].flags |=
1524                                         HAMMER2_CITEM_INVALID;
1525                                 continue;
1526                         }
1527                         nchain = ncluster->array[i].chain;
1528                         if (nchain)
1529                                 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1530                 }
1531 #if 0
1532                 /* XXX can't set this unless we do an explicit flush, which
1533                    we also need a pmp assigned to do, else the flush code
1534                    won't flush ncluster because it thinks it is crossing a
1535                    flush boundary */
1536                 hammer2_cluster_set_chainflags(ncluster,
1537                                                HAMMER2_CHAIN_PFSBOUNDARY);
1538 #endif
1539
1540                 /* XXX hack blockset copy */
1541                 /* XXX doesn't work with real cluster */
1542                 KKASSERT(ocluster->nchains == 1);
1543                 wipdata->u.blockset = ripdata->u.blockset;
1544                 hammer2_cluster_modsync(ncluster);
1545                 for (i = 0; i < ncluster->nchains; ++i) {
1546                         nchain = ncluster->array[i].chain;
1547                         if (nchain)
1548                                 hammer2_flush(trans, nchain);
1549                 }
1550                 hammer2_inode_unlock(nip, ncluster);
1551         }
1552         return (error);
1553 }
1554
1555 /*
1556  * Return locked parent cluster given a locked child.  The child remains
1557  * locked on return.  The new parent's focus follows the child's focus
1558  * and the parent is always resolved.
1559  */
1560 hammer2_cluster_t *
1561 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1562 {
1563         hammer2_cluster_t *cparent;
1564         int i;
1565
1566         cparent = hammer2_cluster_copy(cluster);
1567
1568         for (i = 0; i < cparent->nchains; ++i) {
1569                 hammer2_chain_t *chain;
1570                 hammer2_chain_t *rchain;
1571
1572                 /*
1573                  * Calculate parent for each element.  Old chain has an extra
1574                  * ref for cparent but the lock remains with cluster.
1575                  */
1576                 chain = cparent->array[i].chain;
1577                 if (chain == NULL)
1578                         continue;
1579                 while ((rchain = chain->parent) != NULL) {
1580                         hammer2_chain_ref(rchain);
1581                         hammer2_chain_unlock(chain);
1582                         hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1583                         hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1584                         if (chain->parent == rchain)
1585                                 break;
1586                         hammer2_chain_unlock(rchain);
1587                         hammer2_chain_drop(rchain);
1588                 }
1589                 if (cluster->focus == chain) {
1590                         cparent->focus_index = i;
1591                         cparent->focus = rchain;
1592                 }
1593                 cparent->array[i].chain = rchain;
1594                 hammer2_chain_drop(chain);
1595         }
1596         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1597         hammer2_cluster_resolve(cparent);
1598
1599         return cparent;
1600 }
1601
1602 /************************************************************************
1603  *                              CLUSTER I/O                             *
1604  ************************************************************************
1605  *
1606  *
1607  * WARNING! blockref[] array data is not universal.  These functions should
1608  *          only be used to access universal data.
1609  *
1610  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1611  *          complete if necessary.  The I/O's should have already been
1612  *          initiated by the cluster_lock/chain_lock operation.
1613  *
1614  *          The cluster must already be in a modified state before wdata
1615  *          is called.  The data will already be available for this case.
1616  */
1617 const hammer2_media_data_t *
1618 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1619 {
1620         return(cluster->focus->data);
1621 }
1622
1623 hammer2_media_data_t *
1624 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1625 {
1626         KKASSERT(hammer2_cluster_modified(cluster));
1627         return(cluster->focus->data);
1628 }
1629
1630 /*
1631  * Load cluster data asynchronously with callback.
1632  *
1633  * The callback is made for the first validated data found, or NULL
1634  * if no valid data is available.
1635  *
1636  * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1637  *       in the inode with an exclusive lock held), the chain structure may be
1638  *       shared.
1639  */
1640 void
1641 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1642                            void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1643 {
1644         hammer2_chain_t *chain;
1645         hammer2_iocb_t *iocb;
1646         hammer2_dev_t *hmp;
1647         hammer2_blockref_t *bref;
1648         int i;
1649
1650         /*
1651          * Try to find a chain whos data is already resolved.  If none can
1652          * be found, start with the first chain.
1653          */
1654         chain = NULL;
1655         for (i = 0; i < cluster->nchains; ++i) {
1656                 chain = cluster->array[i].chain;
1657                 if (chain && chain->data)
1658                         break;
1659         }
1660         if (i == cluster->nchains) {
1661                 chain = cluster->array[0].chain;
1662                 i = 0;
1663         }
1664
1665         iocb = &cluster->iocb;
1666         iocb->callback = callback;
1667         iocb->dio = NULL;               /* for already-validated case */
1668         iocb->cluster = cluster;
1669         iocb->chain = chain;
1670         iocb->ptr = ptr;
1671         iocb->lbase = (off_t)i;
1672         iocb->flags = 0;
1673         iocb->error = 0;
1674
1675         /*
1676          * Data already validated
1677          */
1678         if (chain->data) {
1679                 callback(iocb);
1680                 return;
1681         }
1682
1683         /*
1684          * We must resolve to a device buffer, either by issuing I/O or
1685          * by creating a zero-fill element.  We do not mark the buffer
1686          * dirty when creating a zero-fill element (the hammer2_chain_modify()
1687          * API must still be used to do that).
1688          *
1689          * The device buffer is variable-sized in powers of 2 down
1690          * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
1691          * chunk always contains buffers of the same size. (XXX)
1692          *
1693          * The minimum physical IO size may be larger than the variable
1694          * block size.
1695          *
1696          * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
1697          *            matches hammer2_devblksize()?  Or does the freemap's
1698          *            pre-zeroing handle the case for us?
1699          */
1700         bref = &chain->bref;
1701         hmp = chain->hmp;
1702
1703 #if 0
1704         /* handled by callback? <- TODO XXX even needed for loads? */
1705         /*
1706          * The getblk() optimization for a 100% overwrite can only be used
1707          * if the physical block size matches the request.
1708          */
1709         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1710             chain->bytes == hammer2_devblksize(chain->bytes)) {
1711                 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1712                 KKASSERT(error == 0);
1713                 iocb->dio = dio;
1714                 callback(iocb);
1715                 return;
1716         }
1717 #endif
1718
1719         /*
1720          * Otherwise issue a read
1721          */
1722         hammer2_adjreadcounter(&chain->bref, chain->bytes);
1723         hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
1724 }