hammer2 - Per-PFS transaction management
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *          locks and I/O, do quorum and/or master-slave processing, and
50  *          it must operate properly even if some nodes are broken (which
51  *          can also mean indefinite locks).
52  *
53  *                              CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *              hammer2_cluster_lock()
59  *              hammer2_cluster_parent()
60  *
61  *      - Most complex functions, quorum management on transaction ids.
62  *
63  *      - Locking and data accesses must be internally asynchronous.
64  *
65  *      - Validate and manage cache coherency primitives (cache state
66  *        is stored in chain topologies but must be validated by these
67  *        functions).
68  *
69  * (2) Lookups and Scans
70  *              hammer2_cluster_lookup()
71  *              hammer2_cluster_next()
72  *
73  *      - Depend on locking & data retrieval functions, but still complex.
74  *
75  *      - Must do quorum management on transaction ids.
76  *
77  *      - Lookup and Iteration ops Must be internally asynchronous.
78  *
79  * (3) Modifying Operations
80  *              hammer2_cluster_create()
81  *              hammer2_cluster_rename()
82  *              hammer2_cluster_delete()
83  *              hammer2_cluster_modify()
84  *              hammer2_cluster_modsync()
85  *
86  *      - Can usually punt on failures, operation continues unless quorum
87  *        is lost.  If quorum is lost, must wait for resynchronization
88  *        (depending on the management mode).
89  *
90  *      - Must disconnect node on failures (also not flush), remount, and
91  *        resynchronize.
92  *
93  *      - Network links (via kdmsg) are relatively easy to issue as the
94  *        complex underworkings of hammer2_chain.c don't have to messed
95  *        with (the protocol is at a higher level than block-level).
96  *
97  *      - Multiple local disk nodes (i.e. block devices) are another matter.
98  *        Chain operations have to be dispatched to per-node threads (xN)
99  *        because we can't asynchronize potentially very complex chain
100  *        operations in hammer2_chain.c (it would be a huge mess).
101  *
102  *        (these threads are also used to terminate incoming kdmsg ops from
103  *        other machines).
104  *
105  *      - Single-node filesystems do not use threads and will simply call
106  *        hammer2_chain.c functions directly.  This short-cut is handled
107  *        at the base of each cluster function.
108  */
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
115
116 #include "hammer2.h"
117
118 /*
119  * Returns non-zero if any chain in the cluster needs to be resized.
120  * Errored elements are not used in the calculation.
121  */
122 int
123 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
124 {
125         hammer2_chain_t *chain;
126         int i;
127
128         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
129         for (i = 0; i < cluster->nchains; ++i) {
130                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
131                         continue;
132                 chain = cluster->array[i].chain;
133                 if (chain == NULL)
134                         continue;
135                 if (chain->error)
136                         continue;
137                 if (chain->bytes != bytes)
138                         return 1;
139         }
140         return 0;
141 }
142
143 /*
144  * Returns the bref type of the cluster's foucs.
145  *
146  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
147  * The cluster must be locked.
148  */
149 uint8_t
150 hammer2_cluster_type(hammer2_cluster_t *cluster)
151 {
152         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
153         if (cluster->error == 0)
154                 return(cluster->focus->bref.type);
155         return 0;
156 }
157
158 /*
159  * Returns non-zero if the cluster's focus is flagged as being modified.
160  *
161  * If the cluster is errored, returns 0.
162  */
163 int
164 hammer2_cluster_modified(hammer2_cluster_t *cluster)
165 {
166         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
167         if (cluster->error == 0)
168                 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
169         return 0;
170 }
171
172 /*
173  * Returns the bref of the cluster's focus, sans any data-offset information
174  * (since offset information is per-node and wouldn't be useful).
175  *
176  * Callers use this function to access modify_tid, mirror_tid, type,
177  * key, and keybits.
178  *
179  * If the cluster is errored, returns an empty bref.
180  * The cluster must be locked.
181  */
182 void
183 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
184 {
185         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
186         if (cluster->error == 0) {
187                 *bref = cluster->focus->bref;
188                 bref->data_off = 0;
189         } else {
190                 bzero(bref, sizeof(*bref));
191         }
192 }
193
194 /*
195  * Return non-zero if the chain representing an inode has been flagged
196  * as having been unlinked.  Allows the vnode reclaim to avoid loading
197  * the inode data from disk e.g. when unmount or recycling old, clean
198  * vnodes.
199  *
200  * The cluster does not need to be locked.
201  * The focus cannot be used since the cluster might not be locked.
202  */
203 int
204 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster)
205 {
206         hammer2_chain_t *chain;
207         int flags;
208         int i;
209
210         flags = 0;
211         for (i = 0; i < cluster->nchains; ++i) {
212                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
213                         continue;
214                 chain = cluster->array[i].chain;
215                 if (chain)
216                         flags |= chain->flags;
217         }
218         return (flags & HAMMER2_CHAIN_UNLINKED);
219 }
220
221 /*
222  * Set a bitmask of flags in all chains related to a cluster.
223  * The cluster should probably be locked.
224  *
225  * XXX Only operate on FEMOD elements?
226  */
227 void
228 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
229 {
230         hammer2_chain_t *chain;
231         int i;
232
233         for (i = 0; i < cluster->nchains; ++i) {
234                 chain = cluster->array[i].chain;
235                 if (chain)
236                         atomic_set_int(&chain->flags, flags);
237         }
238 }
239
240 /*
241  * Set a bitmask of flags in all chains related to a cluster.
242  * The cluster should probably be locked.
243  *
244  * XXX Only operate on FEMOD elements?
245  */
246 void
247 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
248 {
249         hammer2_chain_t *chain;
250         int i;
251
252         for (i = 0; i < cluster->nchains; ++i) {
253                 chain = cluster->array[i].chain;
254                 if (chain)
255                         atomic_clear_int(&chain->flags, flags);
256         }
257 }
258
259 /*
260  * Flag the cluster for flushing recursively up to the root.  Despite the
261  * work it does, this is relatively benign.  It just makes sure that the
262  * flusher has top-down visibility to this cluster.
263  *
264  * Errored chains are not flagged for flushing.
265  *
266  * The cluster should probably be locked.
267  */
268 void
269 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
270 {
271         hammer2_chain_t *chain;
272         int i;
273
274         for (i = 0; i < cluster->nchains; ++i) {
275                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
276                         continue;
277                 chain = cluster->array[i].chain;
278                 if (chain == NULL)
279                         continue;
280                 if (chain->error)
281                         continue;
282                 hammer2_chain_setflush(trans, chain);
283         }
284 }
285
286 /*
287  * Set the check mode for the cluster.
288  * Errored elements of the cluster are ignored.
289  *
290  * The cluster must be locked and modified.
291  */
292 void
293 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
294                                 hammer2_cluster_t *cluster,
295                                 int check_algo)
296 {
297         hammer2_chain_t *chain;
298         int i;
299
300         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
301         for (i = 0; i < cluster->nchains; ++i) {
302                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
303                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
304                         continue;
305                 }
306                 chain = cluster->array[i].chain;
307                 if (chain == NULL)
308                         continue;
309                 if (chain->error)
310                         continue;
311                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
312                 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
313                 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
314         }
315 }
316
317 /*
318  * Create a degenerate cluster with one ref from a single locked chain.
319  * The returned cluster will be focused on the chain and inherit its
320  * error state.
321  *
322  * The chain's lock and reference are transfered to the new cluster, so
323  * the caller should not try to unlock the chain separately.
324  *
325  * We fake the flags.
326  */
327 hammer2_cluster_t *
328 hammer2_cluster_from_chain(hammer2_chain_t *chain)
329 {
330         hammer2_cluster_t *cluster;
331
332         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
333         cluster->array[0].chain = chain;
334         cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
335         cluster->nchains = 1;
336         cluster->focus = chain;
337         cluster->focus_index = 0;
338         cluster->pmp = chain->pmp;
339         cluster->refs = 1;
340         cluster->error = chain->error;
341         cluster->flags = HAMMER2_CLUSTER_LOCKED |
342                          HAMMER2_CLUSTER_WRHARD |
343                          HAMMER2_CLUSTER_RDHARD |
344                          HAMMER2_CLUSTER_MSYNCED |
345                          HAMMER2_CLUSTER_SSYNCED;
346
347         return cluster;
348 }
349
350 /*
351  * Add a reference to a cluster and its underlying chains.
352  *
353  * We must also ref the underlying chains in order to allow ref/unlock
354  * sequences to later re-lock.
355  */
356 void
357 hammer2_cluster_ref(hammer2_cluster_t *cluster)
358 {
359         atomic_add_int(&cluster->refs, 1);
360 #if 0
361         hammer2_chain_t *chain;
362         int i;
363
364         for (i = 0; i < cluster->nchains; ++i) {
365                 chain = cluster->array[i].chain;
366                 if (chain)
367                         hammer2_chain_ref(chain);
368         }
369 #endif
370 }
371
372 /*
373  * Drop the caller's reference to the cluster.  When the ref count drops to
374  * zero this function frees the cluster and drops all underlying chains.
375  *
376  * In-progress read I/Os are typically detached from the cluster once the
377  * first one returns (the remaining stay attached to the DIOs but are then
378  * ignored and drop naturally).
379  */
380 void
381 hammer2_cluster_drop(hammer2_cluster_t *cluster)
382 {
383         hammer2_chain_t *chain;
384         int i;
385
386         KKASSERT(cluster->refs > 0);
387         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
388                 cluster->focus = NULL;          /* safety XXX chg to assert */
389                 cluster->focus_index = 0;
390
391                 for (i = 0; i < cluster->nchains; ++i) {
392                         chain = cluster->array[i].chain;
393                         if (chain) {
394                                 hammer2_chain_drop(chain);
395                                 cluster->array[i].chain = NULL; /* safety */
396                         }
397                 }
398                 cluster->nchains = 0;                           /* safety */
399
400                 kfree(cluster, M_HAMMER2);
401                 /* cluster is invalid */
402         }
403 }
404
405 void
406 hammer2_cluster_wait(hammer2_cluster_t *cluster)
407 {
408         tsleep(cluster->focus, 0, "h2clcw", 1);
409 }
410
411 /*
412  * Lock and ref a cluster.  This adds a ref to the cluster and its chains
413  * and then locks them, modified by various RESOLVE flags.
414  *
415  * The act of locking a cluster sets its focus.  Note that cluster elements
416  * flagged with HAMMER2_CITEM_INVALID cannot be set as a focus.  Locking a
417  * cluster does not adjust this flag since exact matches only matter for leafs
418  * (parents can depend on minor differences in topology).
419  *
420  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
421  * operations.  Typically this is only set on a quorum of MASTERs or
422  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
423  * is present, this bit is *not* set on a quorum of MASTERs.  The
424  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
425  * that create/modify/delete elements use it.
426  *
427  * The chains making up the cluster may be narrowed down based on quorum
428  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
429  * narrowed down to a single chain as long as the entire subtopology is known
430  * to be intact.  So, for example, we can narrow a read-only op to a single
431  * fast SLAVE but if we focus a CACHE chain we must still retain at least
432  * a SLAVE to ensure that the subtopology can be accessed.
433  *
434  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
435  * to be maintained once the topology is validated as-of the top level of
436  * the operation.
437  *
438  * If a failure occurs the operation must be aborted by higher-level code and
439  * retried. XXX
440  */
441 void
442 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
443 {
444         hammer2_chain_t *chain;
445         int i;
446
447         /* cannot be on inode-embedded cluster template, must be on copy */
448         KKASSERT(cluster->refs > 0);
449         KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
450         if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
451                 panic("hammer2_cluster_lock: cluster %p already locked!\n",
452                         cluster);
453         } else {
454                 KKASSERT(cluster->focus == NULL);
455         }
456         atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
457
458         /*
459          * Lock chains and resolve state.
460          */
461         for (i = 0; i < cluster->nchains; ++i) {
462                 chain = cluster->array[i].chain;
463                 if (chain == NULL)
464                         continue;
465                 hammer2_chain_lock(chain, how);
466         }
467
468         hammer2_cluster_resolve(cluster);
469 }
470
471 void
472 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
473 {
474         hammer2_chain_t *chain;
475         hammer2_pfs_t *pmp;
476         hammer2_tid_t quorum_tid;
477         hammer2_tid_t last_best_quorum_tid;
478         int focus_pfs_type;
479         uint32_t nflags;
480         int ttlmasters;
481         int ttlslaves;
482         int nmasters;
483         int nslaves;
484         int nquorum;
485         int smpresent;
486         int i;
487
488         cluster->error = 0;
489         cluster->focus = NULL;
490
491         focus_pfs_type = 0;
492         nflags = 0;
493         ttlmasters = 0;
494         ttlslaves = 0;
495         nmasters = 0;
496         nslaves = 0;
497
498         /*
499          * Calculate quorum
500          */
501         pmp = cluster->pmp;
502         KKASSERT(pmp != NULL || cluster->nchains == 0);
503         nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
504         smpresent = 0;
505
506         /*
507          * Pass 1
508          *
509          * NOTE: A NULL chain is not necessarily an error, it could be
510          *       e.g. a lookup failure or the end of an iteration.
511          *       Process normally.
512          */
513         for (i = 0; i < cluster->nchains; ++i) {
514                 chain = cluster->array[i].chain;
515                 if (chain && chain->error) {
516                         if (cluster->focus == NULL || cluster->focus == chain) {
517                                 /* error will be overridden by valid focus */
518                                 cluster->error = chain->error;
519                         }
520
521                         /*
522                          * Must count total masters and slaves whether the
523                          * chain is errored or not.
524                          */
525                         switch (cluster->pmp->pfs_types[i]) {
526                         case HAMMER2_PFSTYPE_MASTER:
527                                 ++ttlmasters;
528                                 break;
529                         case HAMMER2_PFSTYPE_SLAVE:
530                                 ++ttlslaves;
531                                 break;
532                         }
533                         continue;
534                 }
535                 switch (cluster->pmp->pfs_types[i]) {
536                 case HAMMER2_PFSTYPE_MASTER:
537                         ++ttlmasters;
538                         break;
539                 case HAMMER2_PFSTYPE_SLAVE:
540                         ++ttlslaves;
541                         break;
542                 case HAMMER2_PFSTYPE_SOFT_MASTER:
543                         nflags |= HAMMER2_CLUSTER_WRSOFT;
544                         nflags |= HAMMER2_CLUSTER_RDSOFT;
545                         smpresent = 1;
546                         break;
547                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
548                         nflags |= HAMMER2_CLUSTER_RDSOFT;
549                         break;
550                 case HAMMER2_PFSTYPE_SUPROOT:
551                         /*
552                          * Degenerate cluster representing the super-root
553                          * topology on a single device.  Fake stuff so
554                          * cluster ops work as expected.
555                          */
556                         nflags |= HAMMER2_CLUSTER_WRHARD;
557                         nflags |= HAMMER2_CLUSTER_RDHARD;
558                         cluster->focus_index = i;
559                         cluster->focus = chain;
560                         cluster->error = chain ? chain->error : 0;
561                         break;
562                 default:
563                         break;
564                 }
565         }
566
567         /*
568          * Pass 2
569          *
570          * Resolve masters.  Calculate nmasters for the highest matching
571          * TID, if a quorum cannot be attained try the next lower matching
572          * TID until we exhaust TIDs.
573          *
574          * NOTE: A NULL chain is not necessarily an error, it could be
575          *       e.g. a lookup failure or the end of an iteration.
576          *       Process normally.
577          */
578         last_best_quorum_tid = HAMMER2_TID_MAX;
579         quorum_tid = 0;         /* fix gcc warning */
580
581         while (nmasters < nquorum && last_best_quorum_tid != 0) {
582                 nmasters = 0;
583                 quorum_tid = 0;
584
585                 for (i = 0; i < cluster->nchains; ++i) {
586                         if (cluster->pmp->pfs_types[i] !=
587                             HAMMER2_PFSTYPE_MASTER) {
588                                 continue;
589                         }
590                         chain = cluster->array[i].chain;
591
592                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
593                                 /*
594                                  * Invalid as in unsynchronized, cannot be
595                                  * used to calculate the quorum.
596                                  */
597                         } else if (chain == NULL && quorum_tid == 0) {
598                                 /*
599                                  * NULL chain on master matches NULL chains
600                                  * on other masters.
601                                  */
602                                 ++nmasters;
603                         } else if (quorum_tid < last_best_quorum_tid &&
604                                    chain != NULL &&
605                                    (quorum_tid < chain->bref.modify_tid ||
606                                     nmasters == 0)) {
607                                 /*
608                                  * Better TID located, reset nmasters count.
609                                  */
610                                 nmasters = 1;
611                                 quorum_tid = chain->bref.modify_tid;
612                         } else if (chain &&
613                                    quorum_tid == chain->bref.modify_tid) {
614                                 /*
615                                  * TID matches current collection.
616                                  */
617                                 ++nmasters;
618                         }
619                 }
620                 if (nmasters >= nquorum)
621                         break;
622                 last_best_quorum_tid = quorum_tid;
623         }
624
625         /*
626          * Pass 3
627          *
628          * NOTE: A NULL chain is not necessarily an error, it could be
629          *       e.g. a lookup failure or the end of an iteration.
630          *       Process normally.
631          */
632         for (i = 0; i < cluster->nchains; ++i) {
633                 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
634                 chain = cluster->array[i].chain;
635                 if (chain && chain->error) {
636                         if (cluster->focus == NULL || cluster->focus == chain) {
637                                 /* error will be overridden by valid focus */
638                                 cluster->error = chain->error;
639                         }
640                         continue;
641                 }
642
643                 switch (cluster->pmp->pfs_types[i]) {
644                 case HAMMER2_PFSTYPE_MASTER:
645                         /*
646                          * We must have enough up-to-date masters to reach
647                          * a quorum and the master modify_tid must match
648                          * the quorum's modify_tid.
649                          *
650                          * Do not select an errored or out-of-sync master.
651                          */
652                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
653                                 nflags |= HAMMER2_CLUSTER_UNHARD;
654                         } else if (nmasters >= nquorum &&
655                                    (chain == NULL || chain->error == 0) &&
656                                    ((chain == NULL && quorum_tid == 0) ||
657                                     (chain != NULL && quorum_tid ==
658                                                   chain->bref.modify_tid))) {
659                                 nflags |= HAMMER2_CLUSTER_WRHARD;
660                                 nflags |= HAMMER2_CLUSTER_RDHARD;
661                                 if (!smpresent) {
662                                         cluster->array[i].flags |=
663                                                         HAMMER2_CITEM_FEMOD;
664                                 }
665                                 if (cluster->focus == NULL ||
666                                     focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
667                                         focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
668                                         cluster->focus_index = i;
669                                         cluster->focus = chain; /* NULL ok */
670                                         cluster->error = chain ? chain->error :
671                                                                  0;
672                                 }
673                         } else if (chain == NULL || chain->error == 0) {
674                                 nflags |= HAMMER2_CLUSTER_UNHARD;
675                         }
676                         break;
677                 case HAMMER2_PFSTYPE_SLAVE:
678                         /*
679                          * We must have enough up-to-date masters to reach
680                          * a quorum and the slave modify_tid must match the
681                          * quorum's modify_tid.
682                          *
683                          * Do not select an errored slave.
684                          */
685                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
686                                 nflags |= HAMMER2_CLUSTER_UNHARD;
687                         } else if (nmasters >= nquorum &&
688                                    (chain == NULL || chain->error == 0) &&
689                                    ((chain == NULL && quorum_tid == 0) ||
690                                     (chain && quorum_tid ==
691                                               chain->bref.modify_tid))) {
692                                 ++nslaves;
693                                 nflags |= HAMMER2_CLUSTER_RDHARD;
694 #if 0
695                                 /* XXX optimize for RESOLVE_RDONLY */
696                                 if (cluster->focus == NULL) {
697                                         focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
698                                         cluster->focus_index = i;
699                                         cluster->focus = chain; /* NULL ok */
700                                         cluster->error = chain ? chain->error :
701                                                                  0;
702                                 }
703 #endif
704                         } else if (chain == NULL || chain->error == 0) {
705                                 nflags |= HAMMER2_CLUSTER_UNSOFT;
706                         }
707                         break;
708                 case HAMMER2_PFSTYPE_SOFT_MASTER:
709                         /*
710                          * Directly mounted soft master always wins.  There
711                          * should be only one.
712                          */
713                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
714                         cluster->focus_index = i;
715                         cluster->focus = chain;
716                         cluster->error = chain ? chain->error : 0;
717                         focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
718                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
719                         break;
720                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
721                         /*
722                          * Directly mounted soft slave always wins.  There
723                          * should be only one.
724                          */
725                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
726                         if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
727                                 cluster->focus_index = i;
728                                 cluster->focus = chain;
729                                 cluster->error = chain ? chain->error : 0;
730                                 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
731                         }
732                         break;
733                 case HAMMER2_PFSTYPE_SUPROOT:
734                         /*
735                          * spmp (degenerate case)
736                          */
737                         KKASSERT(i == 0);
738                         cluster->focus_index = i;
739                         cluster->focus = chain;
740                         cluster->error = chain ? chain->error : 0;
741                         focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
742                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
743                         break;
744                 default:
745                         break;
746                 }
747         }
748
749         if (ttlslaves == 0)
750                 nflags |= HAMMER2_CLUSTER_NOSOFT;
751         if (ttlmasters == 0)
752                 nflags |= HAMMER2_CLUSTER_NOHARD;
753
754         /*
755          * Set SSYNCED or MSYNCED for slaves and masters respectively if
756          * all available nodes (even if 0 are available) are fully
757          * synchronized.  This is used by the synchronization thread to
758          * determine if there is work it could potentially accomplish.
759          */
760         if (nslaves == ttlslaves)
761                 nflags |= HAMMER2_CLUSTER_SSYNCED;
762         if (nmasters == ttlmasters)
763                 nflags |= HAMMER2_CLUSTER_MSYNCED;
764
765         /*
766          * Determine if the cluster was successfully locked for the
767          * requested operation and generate an error code.  The cluster
768          * will not be locked (or ref'd) if an error is returned.
769          *
770          * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
771          * to determine if reading or writing is possible.  If writing, the
772          * cluster still requires a call to hammer2_cluster_modify() first.
773          */
774         atomic_set_int(&cluster->flags, nflags);
775         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
776 }
777
778 /*
779  * Copy a cluster, returned a ref'd cluster.  All underlying chains
780  * are also ref'd, but not locked.
781  *
782  * The cluster focus is not set because the cluster is not yet locked
783  * (and the originating cluster does not have to be locked either).
784  */
785 hammer2_cluster_t *
786 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
787 {
788         hammer2_pfs_t *pmp = ocluster->pmp;
789         hammer2_cluster_t *ncluster;
790         hammer2_chain_t *chain;
791         int i;
792
793         ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
794         ncluster->pmp = pmp;
795         ncluster->nchains = ocluster->nchains;
796         ncluster->refs = 1;
797         ncluster->flags = 0;    /* cluster not locked */
798
799         for (i = 0; i < ocluster->nchains; ++i) {
800                 chain = ocluster->array[i].chain;
801                 ncluster->array[i].chain = chain;
802                 if (chain)
803                         hammer2_chain_ref(chain);
804         }
805         return (ncluster);
806 }
807
808 /*
809  * Unlock and deref a cluster.  The cluster is destroyed if this is the
810  * last ref.
811  */
812 void
813 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
814 {
815         hammer2_chain_t *chain;
816         int i;
817
818         if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
819                 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
820                         cluster);
821         }
822         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
823         KKASSERT(cluster->refs > 0);
824         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
825
826         for (i = 0; i < cluster->nchains; ++i) {
827                 chain = cluster->array[i].chain;
828                 if (chain)
829                         hammer2_chain_unlock(chain);
830         }
831         cluster->focus_index = 0;
832         cluster->focus = NULL;
833 }
834
835 /*
836  * Resize the cluster's physical storage allocation in-place.  This may
837  * replace the cluster's chains.
838  */
839 void
840 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
841                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
842                        int nradix, int flags)
843 {
844         hammer2_chain_t *chain;
845         int i;
846
847         KKASSERT(cparent->pmp == cluster->pmp);         /* can be NULL */
848         KKASSERT(cparent->nchains == cluster->nchains);
849
850         for (i = 0; i < cluster->nchains; ++i) {
851                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
852                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
853                         continue;
854                 }
855                 chain = cluster->array[i].chain;
856                 if (chain) {
857                         KKASSERT(cparent->array[i].chain);
858                         hammer2_chain_resize(trans, ip,
859                                              cparent->array[i].chain, chain,
860                                              nradix, flags);
861                 }
862         }
863 }
864
865 /*
866  * Set an inode's cluster modified, marking the related chains RW and
867  * duplicating them if necessary.
868  *
869  * The passed-in chain is a localized copy of the chain previously acquired
870  * when the inode was locked (and possilby replaced in the mean time), and
871  * must also be updated.  In fact, we update it first and then synchronize
872  * the inode's cluster cache.
873  */
874 hammer2_inode_data_t *
875 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
876                           hammer2_cluster_t *cluster, int flags)
877 {
878         atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
879         hammer2_cluster_modify(trans, cluster, flags);
880
881         hammer2_inode_repoint(ip, NULL, cluster);
882         if (ip->vp)
883                 vsetisdirty(ip->vp);
884         return (&hammer2_cluster_wdata(cluster)->ipdata);
885 }
886
887 /*
888  * Adjust the cluster's chains to allow modification and adjust the
889  * focus.  Data will be accessible on return.
890  *
891  * If our focused master errors on modify, re-resolve the cluster to
892  * try to select a different master.
893  */
894 void
895 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
896                        int flags)
897 {
898         hammer2_chain_t *chain;
899         int resolve_again;
900         int i;
901
902         resolve_again = 0;
903         for (i = 0; i < cluster->nchains; ++i) {
904                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
905                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
906                         continue;
907                 }
908                 chain = cluster->array[i].chain;
909                 if (chain == NULL)
910                         continue;
911                 if (chain->error)
912                         continue;
913                 hammer2_chain_modify(trans, chain, flags);
914                 if (cluster->focus == chain && chain->error) {
915                         cluster->error = chain->error;
916                         resolve_again = 1;
917                 }
918         }
919         if (resolve_again)
920                 hammer2_cluster_resolve(cluster);
921 }
922
923 /*
924  * Synchronize modifications from the focus to other chains in a cluster.
925  * Convenient because nominal API users can just modify the contents of the
926  * focus (at least for non-blockref data).
927  *
928  * Nominal front-end operations only edit non-block-table data in a single
929  * chain.  This code copies such modifications to the other chains in the
930  * cluster.  Blocktable modifications are handled on a chain-by-chain basis
931  * by both the frontend and the backend and will explode in fireworks if
932  * blindly copied.
933  */
934 void
935 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
936 {
937         hammer2_chain_t *focus;
938         hammer2_chain_t *scan;
939         const hammer2_inode_data_t *ripdata;
940         hammer2_inode_data_t *wipdata;
941         int i;
942
943         focus = cluster->focus;
944         KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
945
946         for (i = 0; i < cluster->nchains; ++i) {
947                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
948                         continue;
949                 scan = cluster->array[i].chain;
950                 if (scan == NULL || scan == focus)
951                         continue;
952                 if (scan->error)
953                         continue;
954                 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
955                 KKASSERT(focus->bytes == scan->bytes &&
956                          focus->bref.type == scan->bref.type);
957                 switch(focus->bref.type) {
958                 case HAMMER2_BREF_TYPE_INODE:
959                         ripdata = &focus->data->ipdata;
960                         wipdata = &scan->data->ipdata;
961                         if ((ripdata->op_flags &
962                             HAMMER2_OPFLAG_DIRECTDATA) == 0) {
963                                 bcopy(ripdata, wipdata,
964                                       offsetof(hammer2_inode_data_t, u));
965                                 break;
966                         }
967                         /* fall through to full copy */
968                 case HAMMER2_BREF_TYPE_DATA:
969                         bcopy(focus->data, scan->data, focus->bytes);
970                         break;
971                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
972                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
973                 case HAMMER2_BREF_TYPE_FREEMAP:
974                 case HAMMER2_BREF_TYPE_VOLUME:
975                         panic("hammer2_cluster_modsync: illegal node type");
976                         /* NOT REACHED */
977                         break;
978                 default:
979                         panic("hammer2_cluster_modsync: unknown node type");
980                         break;
981                 }
982         }
983 }
984
985 /*
986  * Lookup initialization/completion API.  Returns a locked cluster with 1 ref.
987  */
988 hammer2_cluster_t *
989 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
990 {
991         hammer2_cluster_t *cluster;
992
993         cluster = hammer2_cluster_copy(cparent);
994         if (flags & HAMMER2_LOOKUP_SHARED) {
995                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
996                                               HAMMER2_RESOLVE_SHARED);
997         } else {
998                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
999         }
1000         return (cluster);
1001 }
1002
1003 void
1004 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
1005 {
1006         if (cparent) {
1007                 hammer2_cluster_unlock(cparent);
1008                 hammer2_cluster_drop(cparent);
1009         }
1010 }
1011
1012 /*
1013  * Locate first match or overlap under parent, return a new cluster
1014  */
1015 hammer2_cluster_t *
1016 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
1017                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1018 {
1019         hammer2_pfs_t *pmp;
1020         hammer2_cluster_t *cluster;
1021         hammer2_chain_t *chain;
1022         hammer2_chain_t *focus;
1023         hammer2_key_t key_accum;
1024         hammer2_key_t key_next;
1025         int null_count;
1026         int i;
1027
1028         pmp = cparent->pmp;                             /* can be NULL */
1029         key_accum = *key_nextp;
1030         null_count = 0;
1031
1032         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
1033         cluster->pmp = pmp;                             /* can be NULL */
1034         cluster->refs = 1;
1035         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1036                 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1037
1038         /*
1039          * Pass-1, issue lookup and find focus.
1040          */
1041         for (i = 0; i < cparent->nchains; ++i) {
1042                 cluster->array[i].flags = cparent->array[i].flags;
1043                 key_next = *key_nextp;
1044
1045                 /*
1046                  * Nothing to base the lookup, or parent was not synchronized.
1047                  */
1048                 if (cparent->array[i].chain == NULL ||
1049                     (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
1050                         ++null_count;
1051                         continue;
1052                 }
1053
1054                 chain = hammer2_chain_lookup(&cparent->array[i].chain,
1055                                              &key_next,
1056                                              key_beg, key_end,
1057                                              &cparent->array[i].cache_index,
1058                                              flags);
1059                 if (cparent->focus_index == i)
1060                         cparent->focus = cparent->array[i].chain;
1061                 cluster->array[i].chain = chain;
1062                 if (chain == NULL) {
1063                         ++null_count;
1064                 } else if (chain->error) {
1065                         /*
1066                          * Leave errored chain in cluster, but it cannot be
1067                          * the cluster's focus.  It is still possible for an
1068                          * error'd chain to be synchronized (since we have
1069                          * the bref), synchronization state will be handled
1070                          * in pass-2.
1071                          */
1072                         ;
1073                 } else if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
1074                         /*
1075                          * Leave unsynchronized chain in cluster, but it cannot
1076                          * be the cluster's focus.
1077                          */
1078                         ;
1079                 } else {
1080                         int ddflag = (chain->bref.type ==
1081                                       HAMMER2_BREF_TYPE_INODE);
1082
1083                         if (cluster->focus == NULL) {
1084                                 cluster->focus_index = i;
1085                                 cluster->focus = chain;
1086                                 cluster->ddflag = ddflag;
1087                         }
1088                         if (cparent->focus == cparent->array[i].chain) {
1089                                 cluster->focus_index = i;
1090                                 cluster->focus = chain;
1091                                 cluster->ddflag = ddflag;
1092                         }
1093                 }
1094                 if (key_accum > key_next)
1095                         key_accum = key_next;
1096         }
1097
1098         /*
1099          * Pass-2 invalidate mismatches
1100          */
1101         focus = cluster->focus;
1102         if (focus == NULL)
1103                 goto done;
1104
1105         for (i = 0; i < cparent->nchains; ++i) {
1106                 int ddflag;
1107
1108                 chain = cluster->array[i].chain;
1109
1110                 if (chain == NULL)
1111                         continue;
1112                 if (chain == focus)
1113                         continue;
1114                 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1115                         continue;
1116
1117                 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1118                 if (chain->bref.type != focus->bref.type ||
1119                     chain->bref.key != focus->bref.key ||
1120                     chain->bref.keybits != focus->bref.keybits ||
1121                     chain->bref.modify_tid != focus->bref.modify_tid ||
1122                     chain->bytes != focus->bytes ||
1123                     ddflag != cluster->ddflag) {
1124                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1125                 }
1126         }
1127
1128         /*
1129          * Resolve cluster flags.  A lookup or locking failure could wind
1130          * up changing the cluster.
1131          */
1132 done:
1133         *key_nextp = key_accum;
1134         cluster->nchains = i;
1135         hammer2_cluster_resolve(cluster);
1136
1137         if (null_count == i) {
1138                 hammer2_cluster_drop(cluster);
1139                 cluster = NULL;
1140         }
1141
1142         return (cluster);
1143 }
1144
1145 /*
1146  * Locate next match or overlap under parent, replace cluster
1147  */
1148 hammer2_cluster_t *
1149 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1150                      hammer2_key_t *key_nextp,
1151                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1152 {
1153         hammer2_chain_t *ochain;
1154         hammer2_chain_t *nchain;
1155         hammer2_chain_t *focus;
1156         hammer2_key_t key_accum;
1157         hammer2_key_t key_next;
1158         int parent_index;
1159         int cluster_index;
1160         int null_count;
1161         int i;
1162
1163         key_accum = *key_nextp;
1164         null_count = 0;
1165         parent_index = cparent->focus_index;    /* save prior focus */
1166         cluster_index = cluster->focus_index;
1167
1168         cluster->focus = NULL;          /* XXX needed any more? */
1169         /*cparent->focus = NULL;*/
1170         cluster->focus_index = 0;       /* XXX needed any more? */
1171         /*cparent->focus_index = 0;*/
1172
1173         cluster->ddflag = 0;
1174
1175         /*
1176          * The parent is always locked on entry, the iterator may be locked
1177          * depending on flags.
1178          *
1179          * We must temporarily unlock the passed-in clusters to avoid a
1180          * deadlock between elements of the cluster with other threads.
1181          * We will fixup the lock in the loop.
1182          *
1183          * Note that this will clear the focus.
1184          *
1185          * Reflag the clusters as locked, because we will relock them
1186          * as we go.
1187          */
1188         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1189                 hammer2_cluster_unlock(cluster);
1190                 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1191         }
1192         hammer2_cluster_unlock(cparent);
1193         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1194
1195         for (i = 0; i < cparent->nchains; ++i) {
1196                 key_next = *key_nextp;
1197                 ochain = cluster->array[i].chain;
1198
1199                 /*
1200                  * Always relock the parent as we go.
1201                  */
1202                 if (cparent->array[i].chain) {
1203                         hammer2_chain_lock(cparent->array[i].chain,
1204                                            flags & ~HAMMER2_LOOKUP_NOLOCK);
1205                 }
1206
1207                 /*
1208                  * Nothing to iterate from.  These cases can occur under
1209                  * normal operations.  For example, during synchronization
1210                  * a slave might reach the end of its scan while records
1211                  * are still left on the master(s).
1212                  */
1213                 if (ochain == NULL) {
1214                         ++null_count;
1215                         continue;
1216                 }
1217                 if (cparent->array[i].chain == NULL ||
1218                     (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1219                     (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
1220                         /* ochain has not yet been relocked */
1221                         hammer2_chain_drop(ochain);
1222                         cluster->array[i].chain = NULL;
1223                         ++null_count;
1224                         continue;
1225                 }
1226
1227                 /*
1228                  * Relock the child if necessary.  Parent and child will then
1229                  * be locked as expected by hammer2_chain_next() and flags.
1230                  */
1231                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1232                         hammer2_chain_lock(ochain, flags);
1233                 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1234                                             &key_next, key_beg, key_end,
1235                                             &cparent->array[i].cache_index,
1236                                             flags);
1237                 if (parent_index == i) {
1238                         cparent->focus_index = i;
1239                         cparent->focus = cparent->array[i].chain;
1240                 }
1241                 /* ochain now invalid but can still be used for focus check */
1242
1243                 cluster->array[i].chain = nchain;
1244                 if (nchain == NULL) {
1245                         ++null_count;
1246                 } else if (nchain->error) {
1247                         /*
1248                          * Leave errored chain in cluster, but it cannot be
1249                          * the cluster's focus.
1250                          */
1251                         ;
1252                 } else {
1253                         int ddflag = (nchain->bref.type ==
1254                                       HAMMER2_BREF_TYPE_INODE);
1255
1256                         /*
1257                          * Possible new focus.
1258                          */
1259                         if (cluster->focus == NULL) {
1260                                 cluster->ddflag = ddflag;
1261                                 cluster->focus_index = i;
1262                                 cluster->focus = nchain;
1263                         }
1264
1265                         /*
1266                          * Fixup pre-existing focus.
1267                          */
1268                         if (cluster_index == i) {
1269                                 cluster->focus_index = i;
1270                                 cluster->focus = nchain;
1271                         }
1272                 }
1273                 if (key_accum > key_next)
1274                         key_accum = key_next;
1275         }
1276
1277         /*
1278          * Pass-2 invalidate mismatches
1279          */
1280         focus = cluster->focus;
1281         if (focus == NULL)
1282                 goto done;
1283
1284         for (i = 0; i < cparent->nchains; ++i) {
1285                 int ddflag;
1286
1287                 nchain = cluster->array[i].chain;
1288
1289                 if (nchain == NULL)
1290                         continue;
1291                 if (nchain == focus)
1292                         continue;
1293                 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1294                         continue;
1295
1296                 ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1297                 if (nchain->bref.type != focus->bref.type ||
1298                     nchain->bref.key != focus->bref.key ||
1299                     nchain->bref.keybits != focus->bref.keybits ||
1300                     nchain->bref.modify_tid != focus->bref.modify_tid ||
1301                     nchain->bytes != focus->bytes ||
1302                     ddflag != cluster->ddflag) {
1303                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1304                 }
1305         }
1306
1307 done:
1308         *key_nextp = key_accum;
1309         cluster->nchains = i;
1310         hammer2_cluster_resolve(cluster);
1311
1312         if (null_count == i) {
1313                 hammer2_cluster_drop(cluster);
1314                 cluster = NULL;
1315         }
1316         return(cluster);
1317 }
1318
1319 /*
1320  * Advance just one chain in the cluster and recalculate the invalid bit.
1321  * The cluster index is allowed to be flagged invalid on input and is
1322  * recalculated on return.
1323  *
1324  * (used during synchronization to advance past a chain being deleted).
1325  *
1326  * The chain being advanced must not be the focus and the clusters in
1327  * question must have already passed normal cluster_lookup/cluster_next
1328  * checks.
1329  *
1330  * The cluster always remains intact on return, so void function.
1331  */
1332 void
1333 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1334                                   hammer2_cluster_t *cluster,
1335                                   hammer2_key_t *key_nextp,
1336                                   hammer2_key_t key_beg,
1337                                   hammer2_key_t key_end,
1338                                   int i, int flags)
1339 {
1340         hammer2_chain_t *ochain;
1341         hammer2_chain_t *nchain;
1342         hammer2_chain_t *focus;
1343         hammer2_key_t key_accum;
1344         hammer2_key_t key_next;
1345         int ddflag;
1346
1347         key_accum = *key_nextp;
1348         key_next = *key_nextp;
1349         ochain = cluster->array[i].chain;
1350         if (ochain == NULL)
1351                 goto done;
1352         KKASSERT(ochain != cluster->focus);
1353
1354         nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1355                                     &key_next, key_beg, key_end,
1356                                     &cparent->array[i].cache_index,
1357                                     flags);
1358         if (cparent->focus_index == i)
1359                 cparent->focus = cparent->array[i].chain;
1360         /* ochain now invalid */
1361
1362         /*
1363          * Install nchain.  Note that nchain can be NULL, and can also
1364          * be in an unlocked state depending on flags.
1365          */
1366         cluster->array[i].chain = nchain;
1367         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1368
1369         if (key_accum > key_next)
1370                 key_accum = key_next;
1371
1372         focus = cluster->focus;
1373         if (focus == NULL)
1374                 goto done;
1375         if (nchain == NULL)
1376                 goto done;
1377 #if 0
1378         if (nchain == focus)    /* ASSERTED NOT TRUE */
1379                 ...
1380 #endif
1381         ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1382         if (nchain->bref.type != focus->bref.type ||
1383             nchain->bref.key != focus->bref.key ||
1384             nchain->bref.keybits != focus->bref.keybits ||
1385             nchain->bref.modify_tid != focus->bref.modify_tid ||
1386             nchain->bytes != focus->bytes ||
1387             ddflag != cluster->ddflag) {
1388                 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1389         }
1390
1391 done:
1392         *key_nextp = key_accum;
1393 #if 0
1394         /*
1395          * For now don't re-resolve cluster->flags.
1396          */
1397         hammer2_cluster_resolve(cluster);
1398 #endif
1399 }
1400
1401 /*
1402  * Create a new cluster using the specified key
1403  */
1404 int
1405 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1406                      hammer2_cluster_t **clusterp,
1407                      hammer2_key_t key, int keybits,
1408                      int type, size_t bytes, int flags)
1409 {
1410         hammer2_cluster_t *cluster;
1411         hammer2_pfs_t *pmp;
1412         int error;
1413         int i;
1414
1415         pmp = trans->pmp;                               /* can be NULL */
1416
1417         if ((cluster = *clusterp) == NULL) {
1418                 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1419                                   M_WAITOK | M_ZERO);
1420                 cluster->pmp = pmp;                     /* can be NULL */
1421                 cluster->refs = 1;
1422                 cluster->flags = HAMMER2_CLUSTER_LOCKED;
1423         }
1424         cluster->focus_index = 0;
1425         cluster->focus = NULL;
1426
1427         /*
1428          * NOTE: cluster->array[] entries can initially be NULL.  If
1429          *       *clusterp is supplied, skip NULL entries, otherwise
1430          *       create new chains.
1431          */
1432         for (i = 0; i < cparent->nchains; ++i) {
1433                 if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1434                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1435                         continue;
1436                 }
1437                 if (*clusterp) {
1438                         if ((cluster->array[i].flags &
1439                              HAMMER2_CITEM_FEMOD) == 0) {
1440                                 cluster->array[i].flags |=
1441                                                 HAMMER2_CITEM_INVALID;
1442                                 continue;
1443                         }
1444                         if (cluster->array[i].chain == NULL)
1445                                 continue;
1446                 }
1447                 error = hammer2_chain_create(trans, &cparent->array[i].chain,
1448                                              &cluster->array[i].chain, pmp,
1449                                              key, keybits,
1450                                              type, bytes, flags);
1451                 if (cparent->focus_index == i)
1452                         cparent->focus = cparent->array[i].chain;
1453                 KKASSERT(error == 0);
1454                 if (cluster->focus == NULL) {
1455                         cluster->focus_index = i;
1456                         cluster->focus = cluster->array[i].chain;
1457                 }
1458                 if (cparent->focus == cparent->array[i].chain) {
1459                         cluster->focus_index = i;
1460                         cluster->focus = cluster->array[i].chain;
1461                 }
1462         }
1463         cluster->nchains = i;
1464         *clusterp = cluster;
1465         hammer2_cluster_resolve(cluster);
1466
1467         return error;
1468 }
1469
1470 /*
1471  * Rename a cluster to a new parent.
1472  *
1473  * WARNING! Any passed-in bref is probaly from hammer2_cluster_bref(),
1474  *          So the data_off field is not relevant.  Only the key and
1475  *          keybits are used.
1476  */
1477 void
1478 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
1479                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1480                        int flags)
1481 {
1482         hammer2_chain_t *chain;
1483         hammer2_blockref_t xbref;
1484         int i;
1485
1486 #if 0
1487         cluster->focus = NULL;
1488         cparent->focus = NULL;
1489         cluster->focus_index = 0;
1490         cparent->focus_index = 0;
1491 #endif
1492
1493         for (i = 0; i < cluster->nchains; ++i) {
1494                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1495                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1496                         continue;
1497                 }
1498                 chain = cluster->array[i].chain;
1499                 if (chain) {
1500                         if (bref) {
1501                                 xbref = chain->bref;
1502                                 xbref.key = bref->key;
1503                                 xbref.keybits = bref->keybits;
1504                                 hammer2_chain_rename(trans, &xbref,
1505                                                      &cparent->array[i].chain,
1506                                                      chain, flags);
1507                         } else {
1508                                 hammer2_chain_rename(trans, NULL,
1509                                                      &cparent->array[i].chain,
1510                                                      chain, flags);
1511                         }
1512                         if (cparent->focus_index == i)
1513                                 cparent->focus = cparent->array[i].chain;
1514                         KKASSERT(cluster->array[i].chain == chain); /*remove*/
1515                 }
1516         }
1517 }
1518
1519 /*
1520  * Mark a cluster deleted
1521  */
1522 void
1523 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1524                        hammer2_cluster_t *cluster, int flags)
1525 {
1526         hammer2_chain_t *chain;
1527         hammer2_chain_t *parent;
1528         int i;
1529
1530         if (cparent == NULL) {
1531                 kprintf("cparent is NULL\n");
1532                 return;
1533         }
1534
1535         for (i = 0; i < cluster->nchains; ++i) {
1536                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1537                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1538                         continue;
1539                 }
1540                 parent = cparent->array[i].chain;
1541                 chain = cluster->array[i].chain;
1542                 if (chain == NULL)
1543                         continue;
1544                 if (chain->parent != parent) {
1545                         kprintf("hammer2_cluster_delete: parent "
1546                                 "mismatch chain=%p parent=%p against=%p\n",
1547                                 chain, chain->parent, parent);
1548                 } else {
1549                         hammer2_chain_delete(trans, parent, chain, flags);
1550                 }
1551         }
1552 }
1553
1554 /*
1555  * Create a snapshot of the specified {parent, ochain} with the specified
1556  * label.  The originating hammer2_inode must be exclusively locked for
1557  * safety.
1558  *
1559  * The ioctl code has already synced the filesystem.
1560  */
1561 int
1562 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1563                        hammer2_ioc_pfs_t *pfs)
1564 {
1565         hammer2_dev_t *hmp;
1566         hammer2_cluster_t *ncluster;
1567         const hammer2_inode_data_t *ripdata;
1568         hammer2_inode_data_t *wipdata;
1569         hammer2_chain_t *nchain;
1570         hammer2_inode_t *nip;
1571         size_t name_len;
1572         hammer2_key_t lhc;
1573         struct vattr vat;
1574 #if 0
1575         uuid_t opfs_clid;
1576 #endif
1577         int error;
1578         int i;
1579
1580         kprintf("snapshot %s\n", pfs->name);
1581
1582         name_len = strlen(pfs->name);
1583         lhc = hammer2_dirhash(pfs->name, name_len);
1584
1585         /*
1586          * Get the clid
1587          */
1588         ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1589 #if 0
1590         opfs_clid = ripdata->pfs_clid;
1591 #endif
1592         hmp = ocluster->focus->hmp;     /* XXX find synchronized local disk */
1593
1594         /*
1595          * Create the snapshot directory under the super-root
1596          *
1597          * Set PFS type, generate a unique filesystem id, and generate
1598          * a cluster id.  Use the same clid when snapshotting a PFS root,
1599          * which theoretically allows the snapshot to be used as part of
1600          * the same cluster (perhaps as a cache).
1601          *
1602          * Copy the (flushed) blockref array.  Theoretically we could use
1603          * chain_duplicate() but it becomes difficult to disentangle
1604          * the shared core so for now just brute-force it.
1605          */
1606         VATTR_NULL(&vat);
1607         vat.va_type = VDIR;
1608         vat.va_mode = 0755;
1609         ncluster = NULL;
1610         nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1611                                    proc0.p_ucred, pfs->name, name_len,
1612                                    &ncluster,
1613                                    HAMMER2_INSERT_PFSROOT, &error);
1614
1615         if (nip) {
1616                 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1617                 wipdata->pfs_type = HAMMER2_PFSTYPE_MASTER;
1618                 wipdata->pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1619                 wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT;
1620                 kern_uuidgen(&wipdata->pfs_fsid, 1);
1621
1622                 /*
1623                  * Give the snapshot its own private cluster.  As a snapshot
1624                  * no further synchronization with the original cluster will
1625                  * be done.
1626                  */
1627 #if 0
1628                 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1629                         wipdata->pfs_clid = opfs_clid;
1630                 else
1631                         kern_uuidgen(&wipdata->pfs_clid, 1);
1632 #endif
1633                 kern_uuidgen(&wipdata->pfs_clid, 1);
1634
1635                 for (i = 0; i < ncluster->nchains; ++i) {
1636                         if ((ncluster->array[i].flags &
1637                              HAMMER2_CITEM_FEMOD) == 0) {
1638                                 ncluster->array[i].flags |=
1639                                         HAMMER2_CITEM_INVALID;
1640                                 continue;
1641                         }
1642                         nchain = ncluster->array[i].chain;
1643                         if (nchain)
1644                                 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1645                 }
1646 #if 0
1647                 /* XXX can't set this unless we do an explicit flush, which
1648                    we also need a pmp assigned to do, else the flush code
1649                    won't flush ncluster because it thinks it is crossing a
1650                    flush boundary */
1651                 hammer2_cluster_set_chainflags(ncluster,
1652                                                HAMMER2_CHAIN_PFSBOUNDARY);
1653 #endif
1654
1655                 /* XXX hack blockset copy */
1656                 /* XXX doesn't work with real cluster */
1657                 KKASSERT(ocluster->nchains == 1);
1658                 wipdata->u.blockset = ripdata->u.blockset;
1659                 hammer2_cluster_modsync(ncluster);
1660                 for (i = 0; i < ncluster->nchains; ++i) {
1661                         nchain = ncluster->array[i].chain;
1662                         if (nchain)
1663                                 hammer2_flush(trans, nchain, 1);
1664                 }
1665                 hammer2_inode_unlock(nip, ncluster);
1666         }
1667         return (error);
1668 }
1669
1670 /*
1671  * Return locked parent cluster given a locked child.  The child remains
1672  * locked on return.  The new parent's focus follows the child's focus
1673  * and the parent is always resolved.
1674  *
1675  * We must temporarily unlock the passed-in cluster to avoid a deadlock
1676  * between elements of the cluster.
1677  */
1678 hammer2_cluster_t *
1679 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1680 {
1681         hammer2_cluster_t *cparent;
1682         int i;
1683
1684         cparent = hammer2_cluster_copy(cluster);
1685         hammer2_cluster_unlock(cluster);
1686
1687         for (i = 0; i < cparent->nchains; ++i) {
1688                 hammer2_chain_t *chain;
1689                 hammer2_chain_t *rchain;
1690
1691                 /*
1692                  * Calculate parent for each element.  Old chain has an extra
1693                  * ref for cparent but the lock remains with cluster.
1694                  */
1695                 chain = cparent->array[i].chain;
1696                 if (chain == NULL)
1697                         continue;
1698                 while ((rchain = chain->parent) != NULL) {
1699                         hammer2_chain_ref(rchain);
1700                         hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1701                         if (chain->parent == rchain)
1702                                 break;
1703                         hammer2_chain_unlock(rchain);
1704                         hammer2_chain_drop(rchain);
1705                 }
1706                 if (cluster->focus == chain) {
1707                         cparent->focus_index = i;
1708                         cparent->focus = rchain;
1709                 }
1710                 cparent->array[i].chain = rchain;
1711                 hammer2_chain_drop(chain);
1712         }
1713         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1714         hammer2_cluster_resolve(cparent);
1715         hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1716
1717         return cparent;
1718 }
1719
1720 /************************************************************************
1721  *                              CLUSTER I/O                             *
1722  ************************************************************************
1723  *
1724  *
1725  * WARNING! blockref[] array data is not universal.  These functions should
1726  *          only be used to access universal data.
1727  *
1728  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1729  *          complete if necessary.  The I/O's should have already been
1730  *          initiated by the cluster_lock/chain_lock operation.
1731  *
1732  *          The cluster must already be in a modified state before wdata
1733  *          is called.  The data will already be available for this case.
1734  */
1735 const hammer2_media_data_t *
1736 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1737 {
1738         return(cluster->focus->data);
1739 }
1740
1741 hammer2_media_data_t *
1742 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1743 {
1744         KKASSERT(hammer2_cluster_modified(cluster));
1745         return(cluster->focus->data);
1746 }
1747
1748 /*
1749  * Load cluster data asynchronously with callback.
1750  *
1751  * The callback is made for the first validated data found, or NULL
1752  * if no valid data is available.
1753  *
1754  * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1755  *       in the inode with an exclusive lock held), the chain structure may be
1756  *       shared.
1757  */
1758 void
1759 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1760                            void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1761 {
1762         hammer2_chain_t *chain;
1763         hammer2_iocb_t *iocb;
1764         hammer2_dev_t *hmp;
1765         hammer2_blockref_t *bref;
1766         int i;
1767
1768         /*
1769          * Try to find a chain whos data is already resolved.  If none can
1770          * be found, start with the first chain.
1771          */
1772         chain = NULL;
1773         for (i = 0; i < cluster->nchains; ++i) {
1774                 chain = cluster->array[i].chain;
1775                 if (chain && chain->data)
1776                         break;
1777         }
1778         if (i == cluster->nchains) {
1779                 chain = cluster->array[0].chain;
1780                 i = 0;
1781         }
1782
1783         iocb = &cluster->iocb;
1784         iocb->callback = callback;
1785         iocb->dio = NULL;               /* for already-validated case */
1786         iocb->cluster = cluster;
1787         iocb->chain = chain;
1788         iocb->ptr = ptr;
1789         iocb->lbase = (off_t)i;
1790         iocb->flags = 0;
1791         iocb->error = 0;
1792
1793         /*
1794          * Data already validated
1795          */
1796         if (chain->data) {
1797                 callback(iocb);
1798                 return;
1799         }
1800
1801         /*
1802          * We must resolve to a device buffer, either by issuing I/O or
1803          * by creating a zero-fill element.  We do not mark the buffer
1804          * dirty when creating a zero-fill element (the hammer2_chain_modify()
1805          * API must still be used to do that).
1806          *
1807          * The device buffer is variable-sized in powers of 2 down
1808          * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
1809          * chunk always contains buffers of the same size. (XXX)
1810          *
1811          * The minimum physical IO size may be larger than the variable
1812          * block size.
1813          *
1814          * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
1815          *            matches hammer2_devblksize()?  Or does the freemap's
1816          *            pre-zeroing handle the case for us?
1817          */
1818         bref = &chain->bref;
1819         hmp = chain->hmp;
1820
1821 #if 0
1822         /* handled by callback? <- TODO XXX even needed for loads? */
1823         /*
1824          * The getblk() optimization for a 100% overwrite can only be used
1825          * if the physical block size matches the request.
1826          */
1827         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1828             chain->bytes == hammer2_devblksize(chain->bytes)) {
1829                 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1830                 KKASSERT(error == 0);
1831                 iocb->dio = dio;
1832                 callback(iocb);
1833                 return;
1834         }
1835 #endif
1836
1837         /*
1838          * Otherwise issue a read
1839          */
1840         hammer2_adjreadcounter(&chain->bref, chain->bytes);
1841         hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
1842 }