sys/vfs/hammer2: Test bref type HAMMER2_BREF_TYPE_EMPTY instead 0
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
1 /*
2  * Copyright (c) 2013-2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *          locks and I/O, do quorum and/or master-slave processing, and
50  *          it must operate properly even if some nodes are broken (which
51  *          can also mean indefinite locks).
52  *
53  *                              CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *
59  *      - Most complex functions, quorum management on transaction ids.
60  *
61  *      - Locking and data accesses must be internally asynchronous.
62  *
63  *      - Validate and manage cache coherency primitives (cache state
64  *        is stored in chain topologies but must be validated by these
65  *        functions).
66  *
67  * (2) Lookups and Scans
68  *              hammer2_cluster_lookup()
69  *              hammer2_cluster_next()
70  *
71  *      - Depend on locking & data retrieval functions, but still complex.
72  *
73  *      - Must do quorum management on transaction ids.
74  *
75  *      - Lookup and Iteration ops Must be internally asynchronous.
76  *
77  * (3) Modifying Operations
78  *              hammer2_cluster_create()
79  *
80  *      - Can usually punt on failures, operation continues unless quorum
81  *        is lost.  If quorum is lost, must wait for resynchronization
82  *        (depending on the management mode).
83  *
84  *      - Must disconnect node on failures (also not flush), remount, and
85  *        resynchronize.
86  *
87  *      - Network links (via kdmsg) are relatively easy to issue as the
88  *        complex underworkings of hammer2_chain.c don't have to messed
89  *        with (the protocol is at a higher level than block-level).
90  *
91  *      - Multiple local disk nodes (i.e. block devices) are another matter.
92  *        Chain operations have to be dispatched to per-node threads (xN)
93  *        because we can't asynchronize potentially very complex chain
94  *        operations in hammer2_chain.c (it would be a huge mess).
95  *
96  *        (these threads are also used to terminate incoming kdmsg ops from
97  *        other machines).
98  *
99  *      - Single-node filesystems do not use threads and will simply call
100  *        hammer2_chain.c functions directly.  This short-cut is handled
101  *        at the base of each cluster function.
102  */
103 #include <sys/cdefs.h>
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/types.h>
107 #include <sys/lock.h>
108 #include <sys/uuid.h>
109
110 #include "hammer2.h"
111
112 /*
113  * Returns the bref type of the cluster's foucs.
114  *
115  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
116  * The cluster must be locked.
117  */
118 uint8_t
119 hammer2_cluster_type(hammer2_cluster_t *cluster)
120 {
121         if (cluster->error == 0) {
122                 KKASSERT(cluster->focus != NULL);
123                 return(cluster->focus->bref.type);
124         }
125         return 0;
126 }
127
128 /*
129  * Returns the bref of the cluster's focus, sans any data-offset information
130  * (since offset information is per-node and wouldn't be useful).
131  *
132  * Callers use this function to access modify_tid, mirror_tid, type,
133  * key, and keybits.
134  *
135  * If the cluster is errored, returns an empty bref.
136  * The cluster must be locked.
137  */
138 void
139 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
140 {
141         if (cluster->error == 0) {
142                 KKASSERT(cluster->focus != NULL);
143                 *bref = cluster->focus->bref;
144                 bref->data_off = 0;
145         } else {
146                 bzero(bref, sizeof(*bref));
147         }
148 }
149
150 /*
151  * Create a degenerate cluster with one ref from a single locked chain.
152  * The returned cluster will be focused on the chain and inherit its
153  * error state.
154  *
155  * The chain's lock and reference are transfered to the new cluster, so
156  * the caller should not try to unlock the chain separately.
157  *
158  * We fake the flags.
159  */
160 void
161 hammer2_dummy_xop_from_chain(hammer2_xop_head_t *xop, hammer2_chain_t *chain)
162 {
163         hammer2_cluster_t *cluster;
164
165         bzero(xop, sizeof(*xop));
166
167         cluster = &xop->cluster;
168         cluster->array[0].chain = chain;
169         cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
170         cluster->nchains = 1;
171         cluster->focus = chain;
172         cluster->focus_index = 0;
173         cluster->pmp = chain->pmp;
174         cluster->refs = 1;
175         cluster->error = chain->error;
176         cluster->flags = HAMMER2_CLUSTER_LOCKED |
177                          HAMMER2_CLUSTER_WRHARD |
178                          HAMMER2_CLUSTER_RDHARD |
179                          HAMMER2_CLUSTER_MSYNCED |
180                          HAMMER2_CLUSTER_SSYNCED;
181 }
182
183 /*
184  * Add a reference to a cluster and its underlying chains.
185  *
186  * We must also ref the underlying chains in order to allow ref/unlock
187  * sequences to later re-lock.
188  */
189 void
190 hammer2_cluster_ref(hammer2_cluster_t *cluster)
191 {
192         atomic_add_int(&cluster->refs, 1);
193 }
194
195 /*
196  * Drop the caller's reference to the cluster.  When the ref count drops to
197  * zero this function frees the cluster and drops all underlying chains.
198  *
199  * In-progress read I/Os are typically detached from the cluster once the
200  * first one returns (the remaining stay attached to the DIOs but are then
201  * ignored and drop naturally).
202  */
203 void
204 hammer2_cluster_drop(hammer2_cluster_t *cluster)
205 {
206         hammer2_chain_t *chain;
207         int i;
208
209         KKASSERT(cluster->refs > 0);
210         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
211                 cluster->focus = NULL;          /* safety XXX chg to assert */
212                 cluster->focus_index = 0;
213
214                 for (i = 0; i < cluster->nchains; ++i) {
215                         chain = cluster->array[i].chain;
216                         if (chain) {
217                                 hammer2_chain_drop(chain);
218                                 cluster->array[i].chain = NULL; /* safety */
219                         }
220                 }
221                 cluster->nchains = 0;                           /* safety */
222
223                 kfree(cluster, M_HAMMER2);
224                 /* cluster is invalid */
225         }
226 }
227
228 /*
229  * Lock a cluster.  Cluster must already be referenced.  Focus is maintained. 
230  *
231  * WARNING! This function expects the caller to handle resolution of the
232  *          cluster.  We never re-resolve the cluster in this function,
233  *          because it might be used to temporarily unlock/relock a cparent
234  *          in an iteration or recursrion, and the cparents elements do not
235  *          necessarily match.
236  */
237 void
238 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
239 {
240         hammer2_chain_t *chain;
241         int i;
242
243         /* cannot be on inode-embedded cluster template, must be on copy */
244         KKASSERT(cluster->refs > 0);
245         KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
246         if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
247                 panic("hammer2_cluster_lock: cluster %p already locked!\n",
248                         cluster);
249         }
250         atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
251
252         /*
253          * Lock chains and resolve state.
254          */
255         for (i = 0; i < cluster->nchains; ++i) {
256                 chain = cluster->array[i].chain;
257                 if (chain == NULL)
258                         continue;
259                 hammer2_chain_lock(chain, how);
260         }
261 }
262
263 void
264 hammer2_cluster_unhold(hammer2_cluster_t *cluster)
265 {
266         hammer2_chain_t *chain;
267         int i;
268
269         for (i = 0; i < cluster->nchains; ++i) {
270                 chain = cluster->array[i].chain;
271                 if (chain == NULL)
272                         continue;
273                 hammer2_chain_unhold(chain);
274         }
275 }
276
277 void
278 hammer2_cluster_rehold(hammer2_cluster_t *cluster)
279 {
280         hammer2_chain_t *chain;
281         int i;
282
283         for (i = 0; i < cluster->nchains; ++i) {
284                 chain = cluster->array[i].chain;
285                 if (chain == NULL)
286                         continue;
287                 hammer2_chain_rehold(chain);
288         }
289 }
290
291 /*
292  * Calculate the clustering state for the cluster and set its focus.
293  * This routine must be called with care.  For example, it should not
294  * normally be called after relocking a non-leaf cluster because parent
295  * clusters help iterations and each element might be at a slightly different
296  * indirect node (each node's topology is independently indexed).
297  *
298  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
299  * operations.  Typically this is only set on a quorum of MASTERs or
300  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
301  * is present, this bit is *not* set on a quorum of MASTERs.  The
302  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
303  * that create/modify/delete elements use it.
304  *
305  * The chains making up the cluster may be narrowed down based on quorum
306  * acceptability.
307  *
308  * If a failure occurs the operation must be aborted by higher-level code and
309  * retried. XXX
310  */
311 void
312 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
313 {
314         hammer2_chain_t *chain;
315         hammer2_chain_t *focus;
316         hammer2_pfs_t *pmp;
317         hammer2_tid_t quorum_tid;
318         hammer2_tid_t last_best_quorum_tid;
319         int focus_pfs_type;
320         uint32_t nflags;
321         int ttlmasters;
322         int ttlslaves;
323         int nmasters;
324         int nslaves;
325         int nquorum;
326         int smpresent;
327         int i;
328
329         cluster->error = 0;
330         cluster->focus = NULL;
331
332         focus_pfs_type = 0;
333         nflags = 0;
334         ttlmasters = 0;
335         ttlslaves = 0;
336         nmasters = 0;
337         nslaves = 0;
338
339         /*
340          * Calculate quorum
341          */
342         pmp = cluster->pmp;
343         KKASSERT(pmp != NULL || cluster->nchains == 0);
344         nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
345         smpresent = 0;
346
347         /*
348          * Pass 1
349          *
350          * NOTE: A NULL chain is not necessarily an error, it could be
351          *       e.g. a lookup failure or the end of an iteration.
352          *       Process normally.
353          */
354         for (i = 0; i < cluster->nchains; ++i) {
355                 chain = cluster->array[i].chain;
356                 if (chain && chain->error) {
357                         if (cluster->focus == NULL || cluster->focus == chain) {
358                                 /* error will be overridden by valid focus */
359                                 cluster->error = chain->error;
360                         }
361
362                         /*
363                          * Must count total masters and slaves whether the
364                          * chain is errored or not.
365                          */
366                         switch (cluster->pmp->pfs_types[i]) {
367                         case HAMMER2_PFSTYPE_SUPROOT:
368                         case HAMMER2_PFSTYPE_MASTER:
369                                 ++ttlmasters;
370                                 break;
371                         case HAMMER2_PFSTYPE_SLAVE:
372                                 ++ttlslaves;
373                                 break;
374                         }
375                         continue;
376                 }
377                 switch (cluster->pmp->pfs_types[i]) {
378                 case HAMMER2_PFSTYPE_MASTER:
379                         ++ttlmasters;
380                         break;
381                 case HAMMER2_PFSTYPE_SLAVE:
382                         ++ttlslaves;
383                         break;
384                 case HAMMER2_PFSTYPE_SOFT_MASTER:
385                         nflags |= HAMMER2_CLUSTER_WRSOFT;
386                         nflags |= HAMMER2_CLUSTER_RDSOFT;
387                         smpresent = 1;
388                         break;
389                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
390                         nflags |= HAMMER2_CLUSTER_RDSOFT;
391                         break;
392                 case HAMMER2_PFSTYPE_SUPROOT:
393                         /*
394                          * Degenerate cluster representing the super-root
395                          * topology on a single device.  Fake stuff so
396                          * cluster ops work as expected.
397                          */
398                         nflags |= HAMMER2_CLUSTER_WRHARD;
399                         nflags |= HAMMER2_CLUSTER_RDHARD;
400                         cluster->focus_index = i;
401                         cluster->focus = chain;
402                         cluster->error = chain ? chain->error : 0;
403                         ++ttlmasters;
404                         break;
405                 default:
406                         break;
407                 }
408         }
409
410         /*
411          * Pass 2
412          *
413          * Resolve masters.  Calculate nmasters for the highest matching
414          * TID, if a quorum cannot be attained try the next lower matching
415          * TID until we exhaust TIDs.
416          *
417          * NOTE: A NULL chain is not necessarily an error, it could be
418          *       e.g. a lookup failure or the end of an iteration.
419          *       Process normally.
420          */
421         last_best_quorum_tid = HAMMER2_TID_MAX;
422         quorum_tid = 0;         /* fix gcc warning */
423
424         while (nmasters < nquorum && last_best_quorum_tid != 0) {
425                 nmasters = 0;
426                 quorum_tid = 0;
427
428                 for (i = 0; i < cluster->nchains; ++i) {
429                         switch (cluster->pmp->pfs_types[i]) {
430                         case HAMMER2_PFSTYPE_SUPROOT:
431                         case HAMMER2_PFSTYPE_MASTER:
432                                 break;
433                         default:
434                                 continue;
435                         }
436                         chain = cluster->array[i].chain;
437
438                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
439                                 /*
440                                  * Invalid as in unsynchronized, cannot be
441                                  * used to calculate the quorum.
442                                  */
443                         } else if (chain == NULL && quorum_tid == 0) {
444                                 /*
445                                  * NULL chain on master matches NULL chains
446                                  * on other masters.
447                                  */
448                                 ++nmasters;
449                         } else if (quorum_tid < last_best_quorum_tid &&
450                                    chain != NULL &&
451                                    (quorum_tid < chain->bref.modify_tid ||
452                                     nmasters == 0)) {
453                                 /*
454                                  * Better TID located, reset nmasters count.
455                                  */
456                                 nmasters = 1;
457                                 quorum_tid = chain->bref.modify_tid;
458                         } else if (chain &&
459                                    quorum_tid == chain->bref.modify_tid) {
460                                 /*
461                                  * TID matches current collection.
462                                  */
463                                 ++nmasters;
464                         }
465                 }
466                 if (nmasters >= nquorum)
467                         break;
468                 last_best_quorum_tid = quorum_tid;
469         }
470
471         /*
472          * Pass 3
473          *
474          * NOTE: A NULL chain is not necessarily an error, it could be
475          *       e.g. a lookup failure or the end of an iteration.
476          *       Process normally.
477          */
478         for (i = 0; i < cluster->nchains; ++i) {
479                 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
480                 chain = cluster->array[i].chain;
481                 if (chain && chain->error) {
482                         if (cluster->focus == NULL || cluster->focus == chain) {
483                                 /* error will be overridden by valid focus */
484                                 cluster->error = chain->error;
485                         }
486                         continue;
487                 }
488
489                 switch (cluster->pmp->pfs_types[i]) {
490                 case HAMMER2_PFSTYPE_MASTER:
491                         /*
492                          * We must have enough up-to-date masters to reach
493                          * a quorum and the master modify_tid must match
494                          * the quorum's modify_tid.
495                          *
496                          * Do not select an errored or out-of-sync master.
497                          */
498                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
499                                 nflags |= HAMMER2_CLUSTER_UNHARD;
500                         } else if (nmasters >= nquorum &&
501                                    (chain == NULL || chain->error == 0) &&
502                                    ((chain == NULL && quorum_tid == 0) ||
503                                     (chain != NULL && quorum_tid ==
504                                                   chain->bref.modify_tid))) {
505                                 nflags |= HAMMER2_CLUSTER_WRHARD;
506                                 nflags |= HAMMER2_CLUSTER_RDHARD;
507                                 if (!smpresent) {
508                                         cluster->array[i].flags |=
509                                                         HAMMER2_CITEM_FEMOD;
510                                 }
511                                 if (cluster->focus == NULL ||
512                                     focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
513                                         focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
514                                         cluster->focus_index = i;
515                                         cluster->focus = chain; /* NULL ok */
516                                         cluster->error = chain ? chain->error :
517                                                                  0;
518                                 }
519                         } else if (chain == NULL || chain->error == 0) {
520                                 nflags |= HAMMER2_CLUSTER_UNHARD;
521                         }
522                         break;
523                 case HAMMER2_PFSTYPE_SLAVE:
524                         /*
525                          * We must have enough up-to-date masters to reach
526                          * a quorum and the slave modify_tid must match the
527                          * quorum's modify_tid.
528                          *
529                          * Do not select an errored slave.
530                          */
531                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
532                                 nflags |= HAMMER2_CLUSTER_UNHARD;
533                         } else if (nmasters >= nquorum &&
534                                    (chain == NULL || chain->error == 0) &&
535                                    ((chain == NULL && quorum_tid == 0) ||
536                                     (chain && quorum_tid ==
537                                               chain->bref.modify_tid))) {
538                                 ++nslaves;
539                                 nflags |= HAMMER2_CLUSTER_RDHARD;
540                         } else if (chain == NULL || chain->error == 0) {
541                                 nflags |= HAMMER2_CLUSTER_UNSOFT;
542                         }
543                         break;
544                 case HAMMER2_PFSTYPE_SOFT_MASTER:
545                         /*
546                          * Directly mounted soft master always wins.  There
547                          * should be only one.
548                          */
549                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
550                         cluster->focus_index = i;
551                         cluster->focus = chain;
552                         cluster->error = chain ? chain->error : 0;
553                         focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
554                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
555                         break;
556                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
557                         /*
558                          * Directly mounted soft slave always wins.  There
559                          * should be only one.
560                          */
561                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
562                         if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
563                                 cluster->focus_index = i;
564                                 cluster->focus = chain;
565                                 cluster->error = chain ? chain->error : 0;
566                                 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
567                         }
568                         break;
569                 case HAMMER2_PFSTYPE_SUPROOT:
570                         /*
571                          * spmp (degenerate case)
572                          */
573                         KKASSERT(i == 0);
574                         cluster->focus_index = i;
575                         cluster->focus = chain;
576                         cluster->error = chain ? chain->error : 0;
577                         focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
578                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
579                         break;
580                 default:
581                         break;
582                 }
583         }
584
585         /*
586          * Focus now set, adjust ddflag.  Skip this pass if the focus
587          * is bad or if we are at the PFS root (the bref won't match at
588          * the PFS root, obviously).
589          */
590         focus = cluster->focus;
591         if (focus) {
592                 cluster->ddflag =
593                         (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
594         } else {
595                 cluster->ddflag = 0;
596                 goto skip4;
597         }
598         if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
599                 goto skip4;
600
601         /*
602          * Pass 4
603          *
604          * Validate the elements that were not marked invalid.  They should
605          * match.
606          */
607         for (i = 0; i < cluster->nchains; ++i) {
608                 int ddflag;
609
610                 chain = cluster->array[i].chain;
611
612                 if (chain == NULL)
613                         continue;
614                 if (chain == focus)
615                         continue;
616                 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
617                         continue;
618
619                 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
620                 if (chain->bref.type != focus->bref.type ||
621                     chain->bref.key != focus->bref.key ||
622                     chain->bref.keybits != focus->bref.keybits ||
623                     chain->bref.modify_tid != focus->bref.modify_tid ||
624                     chain->bytes != focus->bytes ||
625                     ddflag != cluster->ddflag) {
626                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
627                         if (hammer2_debug & 1)
628                         kprintf("cluster_resolve: matching modify_tid failed "
629                                 "bref test: idx=%d type=%02x/%02x "
630                                 "key=%016jx/%d-%016jx/%d "
631                                 "mod=%016jx/%016jx bytes=%u/%u\n",
632                                 i,
633                                 chain->bref.type, focus->bref.type,
634                                 chain->bref.key, chain->bref.keybits,
635                                 focus->bref.key, focus->bref.keybits,
636                                 chain->bref.modify_tid, focus->bref.modify_tid,
637                                 chain->bytes, focus->bytes);
638                         if (hammer2_debug & 0x4000)
639                                 panic("cluster_resolve");
640                         /* flag issue and force resync? */
641                 }
642         }
643 skip4:
644
645         if (ttlslaves == 0)
646                 nflags |= HAMMER2_CLUSTER_NOSOFT;
647         if (ttlmasters == 0)
648                 nflags |= HAMMER2_CLUSTER_NOHARD;
649
650         /*
651          * Set SSYNCED or MSYNCED for slaves and masters respectively if
652          * all available nodes (even if 0 are available) are fully
653          * synchronized.  This is used by the synchronization thread to
654          * determine if there is work it could potentially accomplish.
655          */
656         if (nslaves == ttlslaves)
657                 nflags |= HAMMER2_CLUSTER_SSYNCED;
658         if (nmasters == ttlmasters)
659                 nflags |= HAMMER2_CLUSTER_MSYNCED;
660
661         /*
662          * Determine if the cluster was successfully locked for the
663          * requested operation and generate an error code.  The cluster
664          * will not be locked (or ref'd) if an error is returned.
665          */
666         atomic_set_int(&cluster->flags, nflags);
667         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
668 }
669
670 /*
671  * This is used by the XOPS subsystem to calculate the state of
672  * the collection and tell hammer2_xop_collect() what to do with it.
673  * The collection can be in various states of desynchronization, the
674  * caller specifically wants to resolve the passed-in key.
675  *
676  * Return values:
677  *      0               - Quorum agreement, key is valid
678  *
679  *      ENOENT          - Quorum agreement, end of scan
680  *
681  *      ESRCH           - Quorum agreement, key is INVALID (caller should
682  *                        skip key).
683  *
684  *      EIO             - Quorum agreement but all elements had errors.
685  *
686  *      EDEADLK         - No quorum agreement possible for key, a repair
687  *                        may be needed.  Caller has to decide what to do,
688  *                        possibly iterating the key or generating an EIO.
689  *
690  *      EINPROGRESS     - No quorum agreement yet, but agreement is still
691  *                        possible if caller waits for more responses.  Caller
692  *                        should not iterate key.
693  *
694  * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
695  *
696  * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
697  */
698 int
699 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
700 {
701         hammer2_chain_t *chain;
702         hammer2_chain_t *focus;
703         hammer2_pfs_t *pmp;
704         hammer2_tid_t quorum_tid;
705         hammer2_tid_t last_best_quorum_tid;
706         uint32_t nflags;
707         int ttlmasters;
708         int ttlslaves;
709         int nmasters;
710         int nmasters_keymatch;
711         int nslaves;
712         int nquorum;
713         int umasters;   /* unknown masters (still in progress) */
714         int smpresent;
715         int error;
716         int i;
717
718         cluster->error = 0;
719         cluster->focus = NULL;
720
721         pmp = cluster->pmp;
722         KKASSERT(pmp != NULL || cluster->nchains == 0);
723
724         /*
725          * Calculate quorum
726          */
727         nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
728         smpresent = 0;
729         nflags = 0;
730         ttlmasters = 0;
731         ttlslaves = 0;
732
733         /*
734          * Pass 1
735          *
736          * NOTE: A NULL chain is not necessarily an error, it could be
737          *       e.g. a lookup failure or the end of an iteration.
738          *       Process normally.
739          */
740         for (i = 0; i < cluster->nchains; ++i) {
741                 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
742                 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
743
744                 chain = cluster->array[i].chain;
745                 error = cluster->array[i].error;
746                 if (chain && error) {
747                         if (cluster->focus == NULL || cluster->focus == chain) {
748                                 /* error will be overridden by valid focus */
749                                 /* XXX */
750                         }
751
752                         /*
753                          * Must count total masters and slaves whether the
754                          * chain is errored or not.
755                          */
756                         switch (cluster->pmp->pfs_types[i]) {
757                         case HAMMER2_PFSTYPE_SUPROOT:
758                         case HAMMER2_PFSTYPE_MASTER:
759                                 ++ttlmasters;
760                                 break;
761                         case HAMMER2_PFSTYPE_SLAVE:
762                                 ++ttlslaves;
763                                 break;
764                         }
765                         continue;
766                 }
767                 switch (cluster->pmp->pfs_types[i]) {
768                 case HAMMER2_PFSTYPE_MASTER:
769                         ++ttlmasters;
770                         break;
771                 case HAMMER2_PFSTYPE_SLAVE:
772                         ++ttlslaves;
773                         break;
774                 case HAMMER2_PFSTYPE_SOFT_MASTER:
775                         nflags |= HAMMER2_CLUSTER_WRSOFT;
776                         nflags |= HAMMER2_CLUSTER_RDSOFT;
777                         smpresent = 1;
778                         break;
779                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
780                         nflags |= HAMMER2_CLUSTER_RDSOFT;
781                         break;
782                 case HAMMER2_PFSTYPE_SUPROOT:
783                         /*
784                          * Degenerate cluster representing the super-root
785                          * topology on a single device.  Fake stuff so
786                          * cluster ops work as expected.
787                          */
788                         ++ttlmasters;
789                         nflags |= HAMMER2_CLUSTER_WRHARD;
790                         nflags |= HAMMER2_CLUSTER_RDHARD;
791                         cluster->focus_index = i;
792                         cluster->focus = chain;
793                         cluster->error = error;
794                         break;
795                 default:
796                         break;
797                 }
798         }
799
800         /*
801          * Pass 2
802          *
803          * Resolve nmasters             - master nodes fully match
804          *
805          * Resolve umasters             - master nodes operation still
806          *                                in progress
807          *
808          * Resolve nmasters_keymatch    - master nodes match the passed-in
809          *                                key and may or may not match
810          *                                the quorum-agreed tid.
811          * 
812          * The quorum-agreed TID is the highest matching TID.
813          */
814         last_best_quorum_tid = HAMMER2_TID_MAX;
815         umasters = 0;
816         nmasters = 0;
817         nmasters_keymatch = 0;
818         quorum_tid = 0;         /* fix gcc warning */
819
820         while (nmasters < nquorum && last_best_quorum_tid != 0) {
821                 umasters = 0;
822                 nmasters = 0;
823                 nmasters_keymatch = 0;
824                 quorum_tid = 0;
825
826                 for (i = 0; i < cluster->nchains; ++i) {
827                         /* XXX SOFT smpresent handling */
828                         switch(cluster->pmp->pfs_types[i]) {
829                         case HAMMER2_PFSTYPE_MASTER:
830                         case HAMMER2_PFSTYPE_SUPROOT:
831                                 break;
832                         default:
833                                 continue;
834                         }
835
836                         chain = cluster->array[i].chain;
837                         error = cluster->array[i].error;
838
839                         /*
840                          * Skip elements still in progress.  umasters keeps
841                          * track of masters that might still be in-progress.
842                          */
843                         if (chain == NULL && (cluster->array[i].flags &
844                                               HAMMER2_CITEM_NULL) == 0) {
845                                 ++umasters;
846                                 continue;
847                         }
848
849                         /*
850                          * Key match?
851                          */
852                         if (flags & HAMMER2_CHECK_NULL) {
853                                 if (chain == NULL) {
854                                         ++nmasters;
855                                         ++nmasters_keymatch;
856                                         if (cluster->error == 0)
857                                                 cluster->error = error;
858                                 }
859                         } else if (chain &&
860                                    (key == (hammer2_key_t)-1 ||
861                                     chain->bref.key == key)) {
862                                 ++nmasters_keymatch;
863
864                                 if (chain->bref.modify_tid <
865                                      last_best_quorum_tid &&
866                                     quorum_tid < chain->bref.modify_tid) {
867                                         /*
868                                          * Select new TID as master if better
869                                          * than any found so far in this loop,
870                                          * as long as it does not reach the
871                                          * best tid found in the previous loop.
872                                          */
873                                         nmasters = 0;
874                                         quorum_tid = chain->bref.modify_tid;
875                                 }
876                                 if (quorum_tid == chain->bref.modify_tid) {
877                                         /*
878                                          * TID matches current collection.
879                                          *
880                                          * (error handled in next pass)
881                                          */
882                                         ++nmasters;
883                                         if (chain->error == 0) {
884                                                 cluster->focus = chain;
885                                                 cluster->focus_index = i;
886                                         }
887                                 }
888                         }
889                 }
890                 if (nmasters >= nquorum)
891                         break;
892                 last_best_quorum_tid = quorum_tid;
893         }
894
895         /*
896         kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
897                 nmasters, nquorum, nmasters_keymatch, umasters);
898         */
899
900         /*
901          * Early return if we do not have enough masters.
902          */
903         if (nmasters < nquorum) {
904                 if (nmasters + umasters >= nquorum)
905                         return HAMMER2_ERROR_EINPROGRESS;
906                 if (nmasters_keymatch < nquorum) 
907                         return HAMMER2_ERROR_ESRCH;
908                 return HAMMER2_ERROR_EDEADLK;
909         }
910
911         /*
912          * Validated end of scan.
913          */
914         if (flags & HAMMER2_CHECK_NULL) {
915                 if (cluster->error == 0)
916                         cluster->error = HAMMER2_ERROR_ENOENT;
917                 return cluster->error;
918         }
919
920         /*
921          * If we have a NULL focus at this point the agreeing quorum all
922          * had chain errors.
923          */
924         if (cluster->focus == NULL)
925                 return HAMMER2_ERROR_EIO;
926
927         /*
928          * Pass 3
929          *
930          * We have quorum agreement, validate elements, not end of scan.
931          */
932         nslaves = 0;
933         cluster->error = 0;
934
935         for (i = 0; i < cluster->nchains; ++i) {
936                 chain = cluster->array[i].chain;
937                 error = cluster->array[i].error;
938                 if (chain == NULL ||
939                     chain->bref.key != key ||
940                     chain->bref.modify_tid != quorum_tid) {
941                         continue;
942                 }
943
944                 /*
945                  * Quorum Match
946                  *
947                  * XXX for now, cumulative error.
948                  */
949                 if (cluster->error == 0)
950                         cluster->error = error;
951
952                 switch (cluster->pmp->pfs_types[i]) {
953                 case HAMMER2_PFSTYPE_MASTER:
954                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
955                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
956                         nflags |= HAMMER2_CLUSTER_WRHARD;
957                         nflags |= HAMMER2_CLUSTER_RDHARD;
958                         break;
959                 case HAMMER2_PFSTYPE_SLAVE:
960                         /*
961                          * We must have enough up-to-date masters to reach
962                          * a quorum and the slave modify_tid must match the
963                          * quorum's modify_tid.
964                          *
965                          * Do not select an errored slave.
966                          */
967                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
968                         nflags |= HAMMER2_CLUSTER_RDHARD;
969                         ++nslaves;
970                         break;
971                 case HAMMER2_PFSTYPE_SOFT_MASTER:
972                         /*
973                          * Directly mounted soft master always wins.  There
974                          * should be only one.
975                          */
976                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
977                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
978                         break;
979                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
980                         /*
981                          * Directly mounted soft slave always wins.  There
982                          * should be only one.
983                          *
984                          * XXX
985                          */
986                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
987                         break;
988                 case HAMMER2_PFSTYPE_SUPROOT:
989                         /*
990                          * spmp (degenerate case)
991                          */
992                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
993                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
994                         nflags |= HAMMER2_CLUSTER_WRHARD;
995                         nflags |= HAMMER2_CLUSTER_RDHARD;
996                         break;
997                 default:
998                         break;
999                 }
1000         }
1001
1002         /*
1003          * Focus now set, adjust ddflag.  Skip this pass if the focus
1004          * is bad or if we are at the PFS root (the bref won't match at
1005          * the PFS root, obviously).
1006          *
1007          * focus is probably not locked and it isn't safe to test its
1008          * content (e.g. focus->data, focus->dio, other content).  We
1009          * do not synchronize the dio to the cpu here.  In fact, in numerous
1010          * situations the frontend doesn't even need to access its dio/data,
1011          * so synchronizing it here would be wasteful.
1012          */
1013         focus = cluster->focus;
1014         if (focus) {
1015                 cluster->ddflag =
1016                         (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
1017         } else {
1018                 cluster->ddflag = 0;
1019                 goto skip4;
1020         }
1021         if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1022                 goto skip4;
1023
1024         /*
1025          * Pass 4
1026          *
1027          * Validate the elements that were not marked invalid.  They should
1028          * match.
1029          */
1030         for (i = 0; i < cluster->nchains; ++i) {
1031                 int ddflag;
1032
1033                 chain = cluster->array[i].chain;
1034
1035                 if (chain == NULL)
1036                         continue;
1037                 if (chain == focus)
1038                         continue;
1039                 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1040                         continue;
1041
1042                 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1043                 if (chain->bref.type != focus->bref.type ||
1044                     chain->bref.key != focus->bref.key ||
1045                     chain->bref.keybits != focus->bref.keybits ||
1046                     chain->bref.modify_tid != focus->bref.modify_tid ||
1047                     chain->bytes != focus->bytes ||
1048                     ddflag != cluster->ddflag) {
1049                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1050                         if (hammer2_debug & 1)
1051                         kprintf("cluster_resolve: matching modify_tid failed "
1052                                 "bref test: idx=%d type=%02x/%02x "
1053                                 "key=%016jx/%d-%016jx/%d "
1054                                 "mod=%016jx/%016jx bytes=%u/%u\n",
1055                                 i,
1056                                 chain->bref.type, focus->bref.type,
1057                                 chain->bref.key, chain->bref.keybits,
1058                                 focus->bref.key, focus->bref.keybits,
1059                                 chain->bref.modify_tid, focus->bref.modify_tid,
1060                                 chain->bytes, focus->bytes);
1061                         if (hammer2_debug & 0x4000)
1062                                 panic("cluster_resolve");
1063                         /* flag issue and force resync? */
1064                 }
1065         }
1066 skip4:
1067
1068         if (ttlslaves == 0)
1069                 nflags |= HAMMER2_CLUSTER_NOSOFT;
1070         if (ttlmasters == 0)
1071                 nflags |= HAMMER2_CLUSTER_NOHARD;
1072
1073         /*
1074          * Set SSYNCED or MSYNCED for slaves and masters respectively if
1075          * all available nodes (even if 0 are available) are fully
1076          * synchronized.  This is used by the synchronization thread to
1077          * determine if there is work it could potentially accomplish.
1078          */
1079         if (nslaves == ttlslaves)
1080                 nflags |= HAMMER2_CLUSTER_SSYNCED;
1081         if (nmasters == ttlmasters)
1082                 nflags |= HAMMER2_CLUSTER_MSYNCED;
1083
1084         /*
1085          * Determine if the cluster was successfully locked for the
1086          * requested operation and generate an error code.  The cluster
1087          * will not be locked (or ref'd) if an error is returned.
1088          */
1089         atomic_set_int(&cluster->flags, nflags);
1090         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
1091
1092         return cluster->error;
1093 }
1094
1095 /*
1096  * This is used by the sync thread to force non-NULL elements of a copy
1097  * of the pmp->iroot cluster to be good which is required to prime the
1098  * sync.
1099  */
1100 void
1101 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
1102 {
1103         int i;
1104
1105         for (i = 0; i < cluster->nchains; ++i) {
1106                 if (cluster->array[i].chain)
1107                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1108         }
1109 }
1110
1111 /*
1112  * Unlock a cluster.  Refcount and focus is maintained.
1113  */
1114 void
1115 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
1116 {
1117         hammer2_chain_t *chain;
1118         int i;
1119
1120         if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
1121                 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1122                         cluster);
1123         }
1124         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
1125         KKASSERT(cluster->refs > 0);
1126         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
1127
1128         for (i = 0; i < cluster->nchains; ++i) {
1129                 chain = cluster->array[i].chain;
1130                 if (chain)
1131                         hammer2_chain_unlock(chain);
1132         }
1133 }