3ba39c57413b02b78dd8dbe03d90ee0e47b84f89
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
1 /*
2  * Copyright (c) 2013-2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *          locks and I/O, do quorum and/or master-slave processing, and
50  *          it must operate properly even if some nodes are broken (which
51  *          can also mean indefinite locks).
52  *
53  *                              CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *
59  *      - Most complex functions, quorum management on transaction ids.
60  *
61  *      - Locking and data accesses must be internally asynchronous.
62  *
63  *      - Validate and manage cache coherency primitives (cache state
64  *        is stored in chain topologies but must be validated by these
65  *        functions).
66  *
67  * (2) Lookups and Scans
68  *              hammer2_cluster_lookup()
69  *              hammer2_cluster_next()
70  *
71  *      - Depend on locking & data retrieval functions, but still complex.
72  *
73  *      - Must do quorum management on transaction ids.
74  *
 75  *      - Lookup and Iteration ops must be internally asynchronous.
76  *
77  * (3) Modifying Operations
78  *              hammer2_cluster_create()
79  *
80  *      - Can usually punt on failures, operation continues unless quorum
81  *        is lost.  If quorum is lost, must wait for resynchronization
82  *        (depending on the management mode).
83  *
84  *      - Must disconnect node on failures (also not flush), remount, and
85  *        resynchronize.
86  *
87  *      - Network links (via kdmsg) are relatively easy to issue as the
 88  *        complex underworkings of hammer2_chain.c don't have to be messed
89  *        with (the protocol is at a higher level than block-level).
90  *
91  *      - Multiple local disk nodes (i.e. block devices) are another matter.
92  *        Chain operations have to be dispatched to per-node threads (xN)
93  *        because we can't asynchronize potentially very complex chain
94  *        operations in hammer2_chain.c (it would be a huge mess).
95  *
96  *        (these threads are also used to terminate incoming kdmsg ops from
97  *        other machines).
98  *
99  *      - Single-node filesystems do not use threads and will simply call
100  *        hammer2_chain.c functions directly.  This short-cut is handled
101  *        at the base of each cluster function.
102  */
103 #include <sys/cdefs.h>
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/types.h>
107 #include <sys/lock.h>
108 #include <sys/uuid.h>
109
110 #include "hammer2.h"
111
112 /*
 113  * Returns the bref type of the cluster's focus.
114  *
115  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
116  * The cluster must be locked.
117  */
118 uint8_t
119 hammer2_cluster_type(hammer2_cluster_t *cluster)
120 {
121         if (cluster->error == 0) {
122                 KKASSERT(cluster->focus != NULL);
123                 return(cluster->focus->bref.type);
124         }
125         return 0;
126 }
127
128 /*
129  * Returns the bref of the cluster's focus, sans any data-offset information
130  * (since offset information is per-node and wouldn't be useful).
131  *
132  * Callers use this function to access modify_tid, mirror_tid, type,
133  * key, and keybits.
134  *
135  * If the cluster is errored, returns an empty bref.
136  * The cluster must be locked.
137  */
138 void
139 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
140 {
141         if (cluster->error == 0) {
142                 KKASSERT(cluster->focus != NULL);
143                 *bref = cluster->focus->bref;
144                 bref->data_off = 0;
145         } else {
146                 bzero(bref, sizeof(*bref));
147         }
148 }
149
150 /*
151  * Create a degenerate cluster with one ref from a single locked chain.
152  * The returned cluster will be focused on the chain and inherit its
153  * error state.
154  *
 155  * The chain's lock and reference are transferred to the new cluster, so
156  * the caller should not try to unlock the chain separately.
157  *
158  * We fake the flags.
159  */
160 void
161 hammer2_dummy_xop_from_chain(hammer2_xop_head_t *xop, hammer2_chain_t *chain)
162 {
163         hammer2_cluster_t *cluster;
164
165         bzero(xop, sizeof(*xop));
166
167         cluster = &xop->cluster;
168         cluster->array[0].chain = chain;
169         cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
170         cluster->nchains = 1;
171         cluster->focus = chain;
172         cluster->focus_index = 0;
173         cluster->pmp = chain->pmp;
174         cluster->refs = 1;
175         cluster->error = chain->error;
176         cluster->flags = HAMMER2_CLUSTER_LOCKED |
177                          HAMMER2_CLUSTER_WRHARD |
178                          HAMMER2_CLUSTER_RDHARD |
179                          HAMMER2_CLUSTER_MSYNCED |
180                          HAMMER2_CLUSTER_SSYNCED;
181 }
182
183 /*
184  * Add a reference to a cluster and its underlying chains.
185  *
186  * We must also ref the underlying chains in order to allow ref/unlock
187  * sequences to later re-lock.
188  */
void
hammer2_cluster_ref(hammer2_cluster_t *cluster)
{
	/*
	 * Bump the cluster's own ref count.  NOTE(review): despite the
	 * header comment above, no per-chain refs are taken here; the
	 * element chains presumably already hold refs for the cluster's
	 * lifetime -- confirm against hammer2_cluster_drop(), which does
	 * drop the chains on final release.
	 */
	atomic_add_int(&cluster->refs, 1);
}
194
195 /*
196  * Drop the caller's reference to the cluster.  When the ref count drops to
197  * zero this function frees the cluster and drops all underlying chains.
198  *
199  * In-progress read I/Os are typically detached from the cluster once the
200  * first one returns (the remaining stay attached to the DIOs but are then
201  * ignored and drop naturally).
202  */
203 void
204 hammer2_cluster_drop(hammer2_cluster_t *cluster)
205 {
206         hammer2_chain_t *chain;
207         int i;
208
209         KKASSERT(cluster->refs > 0);
210         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
211                 cluster->focus = NULL;          /* safety XXX chg to assert */
212                 cluster->focus_index = 0;
213
214                 for (i = 0; i < cluster->nchains; ++i) {
215                         chain = cluster->array[i].chain;
216                         if (chain) {
217                                 hammer2_chain_drop(chain);
218                                 cluster->array[i].chain = NULL; /* safety */
219                         }
220                 }
221                 cluster->nchains = 0;                           /* safety */
222
223                 kfree(cluster, M_HAMMER2);
224                 /* cluster is invalid */
225         }
226 }
227
228 /*
229  * Lock a cluster.  Cluster must already be referenced.  Focus is maintained. 
230  *
231  * WARNING! This function expects the caller to handle resolution of the
232  *          cluster.  We never re-resolve the cluster in this function,
233  *          because it might be used to temporarily unlock/relock a cparent
 234  *          in an iteration or recursion, and the cparent's elements do not
235  *          necessarily match.
236  */
237 void
238 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
239 {
240         hammer2_chain_t *chain;
241         int i;
242
243         /* cannot be on inode-embedded cluster template, must be on copy */
244         KKASSERT(cluster->refs > 0);
245         KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
246         if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
247                 panic("hammer2_cluster_lock: cluster %p already locked!\n",
248                         cluster);
249         }
250         atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
251
252         /*
253          * Lock chains and resolve state.
254          */
255         for (i = 0; i < cluster->nchains; ++i) {
256                 chain = cluster->array[i].chain;
257                 if (chain == NULL)
258                         continue;
259                 hammer2_chain_lock(chain, how);
260         }
261 }
262
263 /*
264  * Calculate the clustering state for the cluster and set its focus.
265  * This routine must be called with care.  For example, it should not
266  * normally be called after relocking a non-leaf cluster because parent
267  * clusters help iterations and each element might be at a slightly different
268  * indirect node (each node's topology is independently indexed).
269  *
270  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
271  * operations.  Typically this is only set on a quorum of MASTERs or
272  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
273  * is present, this bit is *not* set on a quorum of MASTERs.  The
274  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
275  * that create/modify/delete elements use it.
276  *
277  * The chains making up the cluster may be narrowed down based on quorum
278  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
279  * narrowed down to a single chain as long as the entire subtopology is known
280  * to be intact.  So, for example, we can narrow a read-only op to a single
281  * fast SLAVE but if we focus a CACHE chain we must still retain at least
282  * a SLAVE to ensure that the subtopology can be accessed.
283  *
284  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
285  * to be maintained once the topology is validated as-of the top level of
286  * the operation.
287  *
288  * If a failure occurs the operation must be aborted by higher-level code and
289  * retried. XXX
290  */
void
hammer2_cluster_resolve(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *focus;
	hammer2_pfs_t *pmp;
	hammer2_tid_t quorum_tid;
	hammer2_tid_t last_best_quorum_tid;
	int focus_pfs_type;
	uint32_t nflags;
	int ttlmasters;		/* total masters (incl. errored/SUPROOT) */
	int ttlslaves;		/* total slaves (incl. errored) */
	int nmasters;		/* masters agreeing on quorum_tid */
	int nslaves;		/* slaves synchronized to quorum_tid */
	int nquorum;		/* number of masters needed for quorum */
	int smpresent;		/* a SOFT_MASTER element is present */
	int i;

	cluster->error = 0;
	cluster->focus = NULL;

	focus_pfs_type = 0;
	nflags = 0;
	ttlmasters = 0;
	ttlslaves = 0;
	nmasters = 0;
	nslaves = 0;

	/*
	 * Calculate quorum: a strict majority of the PFS's configured
	 * masters.  A cluster with no pmp has no chains and no quorum.
	 */
	pmp = cluster->pmp;
	KKASSERT(pmp != NULL || cluster->nchains == 0);
	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
	smpresent = 0;

	/*
	 * Pass 1
	 *
	 * Tally node types, record soft-mount flags, and handle the
	 * degenerate SUPROOT case up-front.
	 *
	 * NOTE: A NULL chain is not necessarily an error, it could be
	 *	 e.g. a lookup failure or the end of an iteration.
	 *	 Process normally.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain && chain->error) {
			if (cluster->focus == NULL || cluster->focus == chain) {
				/* error will be overridden by valid focus */
				cluster->error = chain->error;
			}

			/*
			 * Must count total masters and slaves whether the
			 * chain is errored or not.
			 */
			switch (cluster->pmp->pfs_types[i]) {
			case HAMMER2_PFSTYPE_SUPROOT:
			case HAMMER2_PFSTYPE_MASTER:
				++ttlmasters;
				break;
			case HAMMER2_PFSTYPE_SLAVE:
				++ttlslaves;
				break;
			}
			continue;
		}
		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			++ttlmasters;
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			++ttlslaves;
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			nflags |= HAMMER2_CLUSTER_WRSOFT;
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			smpresent = 1;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * Degenerate cluster representing the super-root
			 * topology on a single device.  Fake stuff so
			 * cluster ops work as expected.
			 */
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = chain ? chain->error : 0;
			++ttlmasters;
			break;
		default:
			break;
		}
	}

	/*
	 * Pass 2
	 *
	 * Resolve masters.  Calculate nmasters for the highest matching
	 * TID, if a quorum cannot be attained try the next lower matching
	 * TID until we exhaust TIDs.
	 *
	 * Each outer iteration only considers TIDs strictly below the
	 * previous round's best (last_best_quorum_tid), so the loop walks
	 * down through distinct modify_tid values until a quorum is found
	 * or TID 0 is reached.
	 *
	 * NOTE: A NULL chain is not necessarily an error, it could be
	 *	 e.g. a lookup failure or the end of an iteration.
	 *	 Process normally.
	 */
	last_best_quorum_tid = HAMMER2_TID_MAX;
	quorum_tid = 0;		/* fix gcc warning */

	while (nmasters < nquorum && last_best_quorum_tid != 0) {
		nmasters = 0;
		quorum_tid = 0;

		for (i = 0; i < cluster->nchains; ++i) {
			/* only masters (and SUPROOT) vote */
			switch (cluster->pmp->pfs_types[i]) {
			case HAMMER2_PFSTYPE_SUPROOT:
			case HAMMER2_PFSTYPE_MASTER:
				break;
			default:
				continue;
			}
			chain = cluster->array[i].chain;

			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
				/*
				 * Invalid as in unsynchronized, cannot be
				 * used to calculate the quorum.
				 */
			} else if (chain == NULL && quorum_tid == 0) {
				/*
				 * NULL chain on master matches NULL chains
				 * on other masters.
				 */
				++nmasters;
			} else if (quorum_tid < last_best_quorum_tid &&
				   chain != NULL &&
				   (quorum_tid < chain->bref.modify_tid ||
				    nmasters == 0)) {
				/*
				 * Better TID located, reset nmasters count.
				 */
				nmasters = 1;
				quorum_tid = chain->bref.modify_tid;
			} else if (chain &&
				   quorum_tid == chain->bref.modify_tid) {
				/*
				 * TID matches current collection.
				 */
				++nmasters;
			}
		}
		if (nmasters >= nquorum)
			break;
		last_best_quorum_tid = quorum_tid;
	}

	/*
	 * Pass 3
	 *
	 * Set per-element FEMOD/validity flags, pick the focus (soft
	 * mounts win over quorum masters, masters over slaves), and
	 * accumulate cluster-wide state flags in nflags.
	 *
	 * NOTE: A NULL chain is not necessarily an error, it could be
	 *	 e.g. a lookup failure or the end of an iteration.
	 *	 Process normally.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
		chain = cluster->array[i].chain;
		if (chain && chain->error) {
			if (cluster->focus == NULL || cluster->focus == chain) {
				/* error will be overridden by valid focus */
				cluster->error = chain->error;
			}
			continue;
		}

		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			/*
			 * We must have enough up-to-date masters to reach
			 * a quorum and the master modify_tid must match
			 * the quorum's modify_tid.
			 *
			 * Do not select an errored or out-of-sync master.
			 */
			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
				nflags |= HAMMER2_CLUSTER_UNHARD;
			} else if (nmasters >= nquorum &&
				   (chain == NULL || chain->error == 0) &&
				   ((chain == NULL && quorum_tid == 0) ||
				    (chain != NULL && quorum_tid ==
						  chain->bref.modify_tid))) {
				nflags |= HAMMER2_CLUSTER_WRHARD;
				nflags |= HAMMER2_CLUSTER_RDHARD;
				if (!smpresent) {
					cluster->array[i].flags |=
							HAMMER2_CITEM_FEMOD;
				}
				if (cluster->focus == NULL ||
				    focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
					focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
					cluster->focus_index = i;
					cluster->focus = chain; /* NULL ok */
					cluster->error = chain ? chain->error :
								 0;
				}
			} else if (chain == NULL || chain->error == 0) {
				nflags |= HAMMER2_CLUSTER_UNHARD;
			}
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			/*
			 * We must have enough up-to-date masters to reach
			 * a quorum and the slave modify_tid must match the
			 * quorum's modify_tid.
			 *
			 * Do not select an errored slave.
			 */
			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
				nflags |= HAMMER2_CLUSTER_UNHARD;
			} else if (nmasters >= nquorum &&
				   (chain == NULL || chain->error == 0) &&
				   ((chain == NULL && quorum_tid == 0) ||
				    (chain && quorum_tid ==
					      chain->bref.modify_tid))) {
				++nslaves;
				nflags |= HAMMER2_CLUSTER_RDHARD;
#if 0
				/* XXX optimize for RESOLVE_RDONLY */
				if (cluster->focus == NULL) {
					focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
					cluster->focus_index = i;
					cluster->focus = chain; /* NULL ok */
					cluster->error = chain ? chain->error :
								 0;
				}
#endif
			} else if (chain == NULL || chain->error == 0) {
				nflags |= HAMMER2_CLUSTER_UNSOFT;
			}
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			/*
			 * Directly mounted soft master always wins.  There
			 * should be only one.
			 */
			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = chain ? chain->error : 0;
			focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			/*
			 * Directly mounted soft slave always wins.  There
			 * should be only one.
			 */
			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
			if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
				cluster->focus_index = i;
				cluster->focus = chain;
				cluster->error = chain ? chain->error : 0;
				focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
			}
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * spmp (degenerate case)
			 */
			KKASSERT(i == 0);
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = chain ? chain->error : 0;
			focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			break;
		default:
			break;
		}
	}

	/*
	 * Focus now set, adjust ddflag.  Skip this pass if the focus
	 * is bad or if we are at the PFS root (the bref won't match at
	 * the PFS root, obviously).
	 */
	focus = cluster->focus;
	if (focus) {
		cluster->ddflag =
			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
	} else {
		cluster->ddflag = 0;
		goto skip4;
	}
	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
		goto skip4;

	/*
	 * Pass 4
	 *
	 * Validate the elements that were not marked invalid.  They should
	 * match the focus's bref identity; any mismatch marks the element
	 * INVALID so it will not be used by modifying operations.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		int ddflag;

		chain = cluster->array[i].chain;

		if (chain == NULL)
			continue;
		if (chain == focus)
			continue;
		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
			continue;

		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
		if (chain->bref.type != focus->bref.type ||
		    chain->bref.key != focus->bref.key ||
		    chain->bref.keybits != focus->bref.keybits ||
		    chain->bref.modify_tid != focus->bref.modify_tid ||
		    chain->bytes != focus->bytes ||
		    ddflag != cluster->ddflag) {
			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
			if (hammer2_debug & 1)
			kprintf("cluster_resolve: matching modify_tid failed "
				"bref test: idx=%d type=%02x/%02x "
				"key=%016jx/%d-%016jx/%d "
				"mod=%016jx/%016jx bytes=%u/%u\n",
				i,
				chain->bref.type, focus->bref.type,
				chain->bref.key, chain->bref.keybits,
				focus->bref.key, focus->bref.keybits,
				chain->bref.modify_tid, focus->bref.modify_tid,
				chain->bytes, focus->bytes);
			if (hammer2_debug & 0x4000)
				panic("cluster_resolve");
			/* flag issue and force resync? */
		}
	}
skip4:

	/* no elements of the given type at all */
	if (ttlslaves == 0)
		nflags |= HAMMER2_CLUSTER_NOSOFT;
	if (ttlmasters == 0)
		nflags |= HAMMER2_CLUSTER_NOHARD;

	/*
	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
	 * all available nodes (even if 0 are available) are fully
	 * synchronized.  This is used by the synchronization thread to
	 * determine if there is work it could potentially accomplish.
	 */
	if (nslaves == ttlslaves)
		nflags |= HAMMER2_CLUSTER_SSYNCED;
	if (nmasters == ttlmasters)
		nflags |= HAMMER2_CLUSTER_MSYNCED;

	/*
	 * Determine if the cluster was successfully locked for the
	 * requested operation and generate an error code.  The cluster
	 * will not be locked (or ref'd) if an error is returned.
	 *
	 * (The set/clear pair atomically installs nflags over the
	 * HAMMER2_CLUSTER_ZFLAGS portion of cluster->flags.)
	 */
	atomic_set_int(&cluster->flags, nflags);
	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
}
659
660 /*
661  * This is used by the XOPS subsystem to calculate the state of
662  * the collection and tell hammer2_xop_collect() what to do with it.
663  * The collection can be in various states of desynchronization, the
664  * caller specifically wants to resolve the passed-in key.
665  *
666  * Return values:
667  *      0               - Quorum agreement, key is valid
668  *
669  *      ENOENT          - Quorum agreement, end of scan
670  *
671  *      ESRCH           - Quorum agreement, key is INVALID (caller should
672  *                        skip key).
673  *
674  *      EIO             - Quorum agreement but all elements had errors.
675  *
676  *      EDEADLK         - No quorum agreement possible for key, a repair
677  *                        may be needed.  Caller has to decide what to do,
678  *                        possibly iterating the key or generating an EIO.
679  *
680  *      EINPROGRESS     - No quorum agreement yet, but agreement is still
681  *                        possible if caller waits for more responses.  Caller
682  *                        should not iterate key.
683  *
684  * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
685  *
686  * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
687  */
int
hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *focus;
	hammer2_pfs_t *pmp;
	hammer2_tid_t quorum_tid;
	hammer2_tid_t last_best_quorum_tid;
	uint32_t nflags;
	int ttlmasters;		/* total masters (incl. errored chains) */
	int ttlslaves;		/* total slaves (incl. errored chains) */
	int nmasters;		/* masters agreeing on the quorum TID */
	int nmasters_keymatch;	/* masters matching the key, any TID */
	int nslaves;		/* slaves synchronized to the quorum TID */
	int nquorum;		/* number of masters required for quorum */
	int umasters;	/* unknown masters (still in progress) */
	int smpresent;		/* a directly-mounted soft master exists */
	int error;
	int i;

	/* Reset per-check cluster state; recomputed below. */
	cluster->error = 0;
	cluster->focus = NULL;

	pmp = cluster->pmp;
	KKASSERT(pmp != NULL || cluster->nchains == 0);

	/*
	 * Calculate quorum
	 *
	 * A strict majority of the configured masters is required.
	 */
	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
	smpresent = 0;
	nflags = 0;
	ttlmasters = 0;
	ttlslaves = 0;

	/*
	 * Pass 1
	 *
	 * Count total masters and slaves and mark every element invalid
	 * (later passes re-validate the elements that agree).
	 *
	 * NOTE: A NULL chain is not necessarily an error, it could be
	 *	 e.g. a lookup failure or the end of an iteration.
	 *	 Process normally.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
		cluster->array[i].flags |= HAMMER2_CITEM_INVALID;

		chain = cluster->array[i].chain;
		error = cluster->array[i].error;
		if (chain && error) {
			if (cluster->focus == NULL || cluster->focus == chain) {
				/* error will be overridden by valid focus */
				/* XXX */
			}

			/*
			 * Must count total masters and slaves whether the
			 * chain is errored or not.
			 */
			switch (cluster->pmp->pfs_types[i]) {
			case HAMMER2_PFSTYPE_SUPROOT:
			case HAMMER2_PFSTYPE_MASTER:
				++ttlmasters;
				break;
			case HAMMER2_PFSTYPE_SLAVE:
				++ttlslaves;
				break;
			}
			/* errored chains take no further part in pass 1 */
			continue;
		}
		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			++ttlmasters;
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			++ttlslaves;
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			nflags |= HAMMER2_CLUSTER_WRSOFT;
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			smpresent = 1;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * Degenerate cluster representing the super-root
			 * topology on a single device.  Fake stuff so
			 * cluster ops work as expected.
			 */
			++ttlmasters;
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = error;
			break;
		default:
			break;
		}
	}

	/*
	 * Pass 2
	 *
	 * Resolve nmasters		- master nodes fully match
	 *
	 * Resolve umasters		- master nodes operation still
	 *				  in progress
	 *
	 * Resolve nmasters_keymatch	- master nodes match the passed-in
	 *				  key and may or may not match
	 *				  the quorum-agreed tid.
	 *
	 * The quorum-agreed TID is the highest matching TID.
	 */
	last_best_quorum_tid = HAMMER2_TID_MAX;
	umasters = 0;
	nmasters = 0;
	nmasters_keymatch = 0;
	quorum_tid = 0;		/* fix gcc warning */

	/*
	 * Each iteration considers only TIDs strictly below the best TID
	 * found by the previous iteration (last_best_quorum_tid), walking
	 * downward through candidate TIDs until a quorum is reached or no
	 * lower candidate remains (last_best_quorum_tid reaches 0).
	 */
	while (nmasters < nquorum && last_best_quorum_tid != 0) {
		umasters = 0;
		nmasters = 0;
		nmasters_keymatch = 0;
		quorum_tid = 0;

		for (i = 0; i < cluster->nchains; ++i) {
			/* XXX SOFT smpresent handling */
			switch(cluster->pmp->pfs_types[i]) {
			case HAMMER2_PFSTYPE_MASTER:
			case HAMMER2_PFSTYPE_SUPROOT:
				break;
			default:
				continue;
			}

			chain = cluster->array[i].chain;
			error = cluster->array[i].error;

			/*
			 * Skip elements still in progress.  umasters keeps
			 * track of masters that might still be in-progress.
			 */
			if (chain == NULL && (cluster->array[i].flags &
					      HAMMER2_CITEM_NULL) == 0) {
				++umasters;
				continue;
			}

			/*
			 * Key match?
			 */
			if (flags & HAMMER2_CHECK_NULL) {
				/*
				 * Caller is validating end-of-scan: a NULL
				 * chain is the expected (matching) result.
				 */
				if (chain == NULL) {
					++nmasters;
					++nmasters_keymatch;
					if (cluster->error == 0)
						cluster->error = error;
				}
			} else if (chain &&
				   (key == (hammer2_key_t)-1 ||
				    chain->bref.key == key)) {
				++nmasters_keymatch;

				if (chain->bref.modify_tid <
				     last_best_quorum_tid &&
				    quorum_tid < chain->bref.modify_tid) {
					/*
					 * Select new TID as master if better
					 * than any found so far in this loop,
					 * as long as it does not reach the
					 * best tid found in the previous loop.
					 */
					nmasters = 0;
					quorum_tid = chain->bref.modify_tid;
				}
				if (quorum_tid == chain->bref.modify_tid) {
					/*
					 * TID matches current collection.
					 *
					 * (error handled in next pass)
					 */
					++nmasters;
					if (chain->error == 0) {
						cluster->focus = chain;
						cluster->focus_index = i;
					}
				}
			}
		}
		if (nmasters >= nquorum)
			break;
		last_best_quorum_tid = quorum_tid;
	}

	/*
	kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
		nmasters, nquorum, nmasters_keymatch, umasters);
	*/

	/*
	 * Early return if we do not have enough masters.
	 */
	if (nmasters < nquorum) {
		if (nmasters + umasters >= nquorum)
			return HAMMER2_ERROR_EINPROGRESS;
		if (nmasters_keymatch < nquorum)
			return HAMMER2_ERROR_ESRCH;
		return HAMMER2_ERROR_EDEADLK;
	}

	/*
	 * Validated end of scan.
	 */
	if (flags & HAMMER2_CHECK_NULL) {
		if (cluster->error == 0)
			cluster->error = HAMMER2_ERROR_ENOENT;
		return cluster->error;
	}

	/*
	 * If we have a NULL focus at this point the agreeing quorum all
	 * had chain errors.
	 */
	if (cluster->focus == NULL)
		return HAMMER2_ERROR_EIO;

	/*
	 * Pass 3
	 *
	 * We have quorum agreement, validate elements, not end of scan.
	 * Only elements matching both the key and the quorum TID are
	 * re-validated (INVALID cleared); everything else stays invalid.
	 */
	nslaves = 0;
	cluster->error = 0;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		error = cluster->array[i].error;
		if (chain == NULL ||
		    chain->bref.key != key ||
		    chain->bref.modify_tid != quorum_tid) {
			continue;
		}

		/*
		 * Quorum Match
		 *
		 * XXX for now, cumulative error.
		 */
		if (cluster->error == 0)
			cluster->error = error;

		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			/*
			 * We must have enough up-to-date masters to reach
			 * a quorum and the slave modify_tid must match the
			 * quorum's modify_tid.
			 *
			 * Do not select an errored slave.
			 */
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			++nslaves;
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			/*
			 * Directly mounted soft master always wins.  There
			 * should be only one.
			 */
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			/*
			 * Directly mounted soft slave always wins.  There
			 * should be only one.
			 *
			 * XXX
			 */
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * spmp (degenerate case)
			 */
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			break;
		default:
			break;
		}
	}

	/*
	 * Focus now set, adjust ddflag.  Skip this pass if the focus
	 * is bad or if we are at the PFS root (the bref won't match at
	 * the PFS root, obviously).
	 *
	 * focus is probably not locked and it isn't safe to test its
	 * content (e.g. focus->data, focus->dio, other content).  We
	 * do not synchronize the dio to the cpu here.  In fact, in numerous
	 * situations the frontend doesn't even need to access its dio/data,
	 * so synchronizing it here would be wasteful.
	 */
	focus = cluster->focus;
	if (focus) {
		cluster->ddflag =
			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
	} else {
		cluster->ddflag = 0;
		goto skip4;
	}
	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
		goto skip4;

	/*
	 * Pass 4
	 *
	 * Validate the elements that were not marked invalid.  They should
	 * match.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		int ddflag;

		chain = cluster->array[i].chain;

		if (chain == NULL)
			continue;
		if (chain == focus)
			continue;
		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
			continue;

		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
		if (chain->bref.type != focus->bref.type ||
		    chain->bref.key != focus->bref.key ||
		    chain->bref.keybits != focus->bref.keybits ||
		    chain->bref.modify_tid != focus->bref.modify_tid ||
		    chain->bytes != focus->bytes ||
		    ddflag != cluster->ddflag) {
			/* element disagrees with focus: invalidate it */
			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
			if (hammer2_debug & 1)
			kprintf("cluster_resolve: matching modify_tid failed "
				"bref test: idx=%d type=%02x/%02x "
				"key=%016jx/%d-%016jx/%d "
				"mod=%016jx/%016jx bytes=%u/%u\n",
				i,
				chain->bref.type, focus->bref.type,
				chain->bref.key, chain->bref.keybits,
				focus->bref.key, focus->bref.keybits,
				chain->bref.modify_tid, focus->bref.modify_tid,
				chain->bytes, focus->bytes);
			if (hammer2_debug & 0x4000)
				panic("cluster_resolve");
			/* flag issue and force resync? */
		}
	}
skip4:

	if (ttlslaves == 0)
		nflags |= HAMMER2_CLUSTER_NOSOFT;
	if (ttlmasters == 0)
		nflags |= HAMMER2_CLUSTER_NOHARD;

	/*
	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
	 * all available nodes (even if 0 are available) are fully
	 * synchronized.  This is used by the synchronization thread to
	 * determine if there is work it could potentially accomplish.
	 */
	if (nslaves == ttlslaves)
		nflags |= HAMMER2_CLUSTER_SSYNCED;
	if (nmasters == ttlmasters)
		nflags |= HAMMER2_CLUSTER_MSYNCED;

	/*
	 * Determine if the cluster was successfully locked for the
	 * requested operation and generate an error code.  The cluster
	 * will not be locked (or ref'd) if an error is returned.
	 */
	atomic_set_int(&cluster->flags, nflags);
	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);

	return cluster->error;
}
1084
1085 /*
1086  * This is used by the sync thread to force non-NULL elements of a copy
1087  * of the pmp->iroot cluster to be good which is required to prime the
1088  * sync.
1089  */
1090 void
1091 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
1092 {
1093         int i;
1094
1095         for (i = 0; i < cluster->nchains; ++i) {
1096                 if (cluster->array[i].chain)
1097                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1098         }
1099 }
1100
1101 /*
1102  * Unlock a cluster.  Refcount and focus is maintained.
1103  */
1104 void
1105 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
1106 {
1107         hammer2_chain_t *chain;
1108         int i;
1109
1110         if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
1111                 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1112                         cluster);
1113         }
1114         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
1115         KKASSERT(cluster->refs > 0);
1116         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
1117
1118         for (i = 0; i < cluster->nchains; ++i) {
1119                 chain = cluster->array[i].chain;
1120                 if (chain)
1121                         hammer2_chain_unlock(chain);
1122         }
1123 }