Merge branch 'vendor/GCC50'
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *          locks and I/O, do quorum and/or master-slave processing, and
50  *          it must operate properly even if some nodes are broken (which
51  *          can also mean indefinite locks).
52  *
53  *                              CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *
59  *      - Most complex functions, quorum management on transaction ids.
60  *
61  *      - Locking and data accesses must be internally asynchronous.
62  *
63  *      - Validate and manage cache coherency primitives (cache state
64  *        is stored in chain topologies but must be validated by these
65  *        functions).
66  *
67  * (2) Lookups and Scans
68  *              hammer2_cluster_lookup()
69  *              hammer2_cluster_next()
70  *
71  *      - Depend on locking & data retrieval functions, but still complex.
72  *
73  *      - Must do quorum management on transaction ids.
74  *
75  *      - Lookup and Iteration ops Must be internally asynchronous.
76  *
77  * (3) Modifying Operations
78  *              hammer2_cluster_create()
79  *
80  *      - Can usually punt on failures, operation continues unless quorum
81  *        is lost.  If quorum is lost, must wait for resynchronization
82  *        (depending on the management mode).
83  *
84  *      - Must disconnect node on failures (also not flush), remount, and
85  *        resynchronize.
86  *
87  *      - Network links (via kdmsg) are relatively easy to issue as the
88  *        complex underworkings of hammer2_chain.c don't have to messed
89  *        with (the protocol is at a higher level than block-level).
90  *
91  *      - Multiple local disk nodes (i.e. block devices) are another matter.
92  *        Chain operations have to be dispatched to per-node threads (xN)
93  *        because we can't asynchronize potentially very complex chain
94  *        operations in hammer2_chain.c (it would be a huge mess).
95  *
96  *        (these threads are also used to terminate incoming kdmsg ops from
97  *        other machines).
98  *
99  *      - Single-node filesystems do not use threads and will simply call
100  *        hammer2_chain.c functions directly.  This short-cut is handled
101  *        at the base of each cluster function.
102  */
103 #include <sys/cdefs.h>
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/types.h>
107 #include <sys/lock.h>
108 #include <sys/uuid.h>
109
110 #include "hammer2.h"
111
112 /*
113  * Returns the bref type of the cluster's foucs.
114  *
115  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
116  * The cluster must be locked.
117  */
118 uint8_t
119 hammer2_cluster_type(hammer2_cluster_t *cluster)
120 {
121         if (cluster->error == 0) {
122                 KKASSERT(cluster->focus != NULL);
123                 return(cluster->focus->bref.type);
124         }
125         return 0;
126 }
127
128 /*
129  * Returns non-zero if the cluster's focus is flagged as being modified.
130  *
131  * If the cluster is errored, returns 0.
132  */
133 static
134 int
135 hammer2_cluster_modified(hammer2_cluster_t *cluster)
136 {
137         if (cluster->error == 0) {
138                 KKASSERT(cluster->focus != NULL);
139                 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
140         }
141         return 0;
142 }
143
144 /*
145  * Returns the bref of the cluster's focus, sans any data-offset information
146  * (since offset information is per-node and wouldn't be useful).
147  *
148  * Callers use this function to access modify_tid, mirror_tid, type,
149  * key, and keybits.
150  *
151  * If the cluster is errored, returns an empty bref.
152  * The cluster must be locked.
153  */
154 void
155 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
156 {
157         if (cluster->error == 0) {
158                 KKASSERT(cluster->focus != NULL);
159                 *bref = cluster->focus->bref;
160                 bref->data_off = 0;
161         } else {
162                 bzero(bref, sizeof(*bref));
163         }
164 }
165
166 /*
167  * Create a degenerate cluster with one ref from a single locked chain.
168  * The returned cluster will be focused on the chain and inherit its
169  * error state.
170  *
171  * The chain's lock and reference are transfered to the new cluster, so
172  * the caller should not try to unlock the chain separately.
173  *
174  * We fake the flags.
175  */
176 hammer2_cluster_t *
177 hammer2_cluster_from_chain(hammer2_chain_t *chain)
178 {
179         hammer2_cluster_t *cluster;
180
181         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
182         cluster->array[0].chain = chain;
183         cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
184         cluster->nchains = 1;
185         cluster->focus = chain;
186         cluster->focus_index = 0;
187         cluster->pmp = chain->pmp;
188         cluster->refs = 1;
189         cluster->error = chain->error;
190         cluster->flags = HAMMER2_CLUSTER_LOCKED |
191                          HAMMER2_CLUSTER_WRHARD |
192                          HAMMER2_CLUSTER_RDHARD |
193                          HAMMER2_CLUSTER_MSYNCED |
194                          HAMMER2_CLUSTER_SSYNCED;
195
196         return cluster;
197 }
198
199 /*
200  * Add a reference to a cluster and its underlying chains.
201  *
202  * We must also ref the underlying chains in order to allow ref/unlock
203  * sequences to later re-lock.
204  */
205 void
206 hammer2_cluster_ref(hammer2_cluster_t *cluster)
207 {
208         atomic_add_int(&cluster->refs, 1);
209 }
210
211 /*
212  * Drop the caller's reference to the cluster.  When the ref count drops to
213  * zero this function frees the cluster and drops all underlying chains.
214  *
215  * In-progress read I/Os are typically detached from the cluster once the
216  * first one returns (the remaining stay attached to the DIOs but are then
217  * ignored and drop naturally).
218  */
219 void
220 hammer2_cluster_drop(hammer2_cluster_t *cluster)
221 {
222         hammer2_chain_t *chain;
223         int i;
224
225         KKASSERT(cluster->refs > 0);
226         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
227                 cluster->focus = NULL;          /* safety XXX chg to assert */
228                 cluster->focus_index = 0;
229
230                 for (i = 0; i < cluster->nchains; ++i) {
231                         chain = cluster->array[i].chain;
232                         if (chain) {
233                                 hammer2_chain_drop(chain);
234                                 cluster->array[i].chain = NULL; /* safety */
235                         }
236                 }
237                 cluster->nchains = 0;                           /* safety */
238
239                 kfree(cluster, M_HAMMER2);
240                 /* cluster is invalid */
241         }
242 }
243
244 /*
245  * Lock a cluster.  Cluster must already be referenced.  Focus is maintained. 
246  *
247  * WARNING! This function expects the caller to handle resolution of the
248  *          cluster.  We never re-resolve the cluster in this function,
249  *          because it might be used to temporarily unlock/relock a cparent
250  *          in an iteration or recursrion, and the cparents elements do not
251  *          necessarily match.
252  */
253 void
254 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
255 {
256         hammer2_chain_t *chain;
257         int i;
258
259         /* cannot be on inode-embedded cluster template, must be on copy */
260         KKASSERT(cluster->refs > 0);
261         KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
262         if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
263                 panic("hammer2_cluster_lock: cluster %p already locked!\n",
264                         cluster);
265         }
266         atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
267
268         /*
269          * Lock chains and resolve state.
270          */
271         for (i = 0; i < cluster->nchains; ++i) {
272                 chain = cluster->array[i].chain;
273                 if (chain == NULL)
274                         continue;
275                 hammer2_chain_lock(chain, how);
276         }
277 }
278
279 /*
280  * Calculate the clustering state for the cluster and set its focus.
281  * This routine must be called with care.  For example, it should not
282  * normally be called after relocking a non-leaf cluster because parent
283  * clusters help iterations and each element might be at a slightly different
284  * indirect node (each node's topology is independently indexed).
285  *
286  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
287  * operations.  Typically this is only set on a quorum of MASTERs or
288  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
289  * is present, this bit is *not* set on a quorum of MASTERs.  The
290  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
291  * that create/modify/delete elements use it.
292  *
293  * The chains making up the cluster may be narrowed down based on quorum
294  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
295  * narrowed down to a single chain as long as the entire subtopology is known
296  * to be intact.  So, for example, we can narrow a read-only op to a single
297  * fast SLAVE but if we focus a CACHE chain we must still retain at least
298  * a SLAVE to ensure that the subtopology can be accessed.
299  *
300  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
301  * to be maintained once the topology is validated as-of the top level of
302  * the operation.
303  *
304  * If a failure occurs the operation must be aborted by higher-level code and
305  * retried. XXX
306  */
307 void
308 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
309 {
310         hammer2_chain_t *chain;
311         hammer2_chain_t *focus;
312         hammer2_pfs_t *pmp;
313         hammer2_tid_t quorum_tid;
314         hammer2_tid_t last_best_quorum_tid;
315         int focus_pfs_type;
316         uint32_t nflags;
317         int ttlmasters;
318         int ttlslaves;
319         int nmasters;
320         int nslaves;
321         int nquorum;
322         int smpresent;
323         int i;
324
325         cluster->error = 0;
326         cluster->focus = NULL;
327
328         focus_pfs_type = 0;
329         nflags = 0;
330         ttlmasters = 0;
331         ttlslaves = 0;
332         nmasters = 0;
333         nslaves = 0;
334
335         /*
336          * Calculate quorum
337          */
338         pmp = cluster->pmp;
339         KKASSERT(pmp != NULL || cluster->nchains == 0);
340         nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
341         smpresent = 0;
342
343         /*
344          * Pass 1
345          *
346          * NOTE: A NULL chain is not necessarily an error, it could be
347          *       e.g. a lookup failure or the end of an iteration.
348          *       Process normally.
349          */
350         for (i = 0; i < cluster->nchains; ++i) {
351                 chain = cluster->array[i].chain;
352                 if (chain && chain->error) {
353                         if (cluster->focus == NULL || cluster->focus == chain) {
354                                 /* error will be overridden by valid focus */
355                                 cluster->error = chain->error;
356                         }
357
358                         /*
359                          * Must count total masters and slaves whether the
360                          * chain is errored or not.
361                          */
362                         switch (cluster->pmp->pfs_types[i]) {
363                         case HAMMER2_PFSTYPE_SUPROOT:
364                         case HAMMER2_PFSTYPE_MASTER:
365                                 ++ttlmasters;
366                                 break;
367                         case HAMMER2_PFSTYPE_SLAVE:
368                                 ++ttlslaves;
369                                 break;
370                         }
371                         continue;
372                 }
373                 switch (cluster->pmp->pfs_types[i]) {
374                 case HAMMER2_PFSTYPE_MASTER:
375                         ++ttlmasters;
376                         break;
377                 case HAMMER2_PFSTYPE_SLAVE:
378                         ++ttlslaves;
379                         break;
380                 case HAMMER2_PFSTYPE_SOFT_MASTER:
381                         nflags |= HAMMER2_CLUSTER_WRSOFT;
382                         nflags |= HAMMER2_CLUSTER_RDSOFT;
383                         smpresent = 1;
384                         break;
385                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
386                         nflags |= HAMMER2_CLUSTER_RDSOFT;
387                         break;
388                 case HAMMER2_PFSTYPE_SUPROOT:
389                         /*
390                          * Degenerate cluster representing the super-root
391                          * topology on a single device.  Fake stuff so
392                          * cluster ops work as expected.
393                          */
394                         nflags |= HAMMER2_CLUSTER_WRHARD;
395                         nflags |= HAMMER2_CLUSTER_RDHARD;
396                         cluster->focus_index = i;
397                         cluster->focus = chain;
398                         cluster->error = chain ? chain->error : 0;
399                         ++ttlmasters;
400                         break;
401                 default:
402                         break;
403                 }
404         }
405
406         /*
407          * Pass 2
408          *
409          * Resolve masters.  Calculate nmasters for the highest matching
410          * TID, if a quorum cannot be attained try the next lower matching
411          * TID until we exhaust TIDs.
412          *
413          * NOTE: A NULL chain is not necessarily an error, it could be
414          *       e.g. a lookup failure or the end of an iteration.
415          *       Process normally.
416          */
417         last_best_quorum_tid = HAMMER2_TID_MAX;
418         quorum_tid = 0;         /* fix gcc warning */
419
420         while (nmasters < nquorum && last_best_quorum_tid != 0) {
421                 nmasters = 0;
422                 quorum_tid = 0;
423
424                 for (i = 0; i < cluster->nchains; ++i) {
425                         switch (cluster->pmp->pfs_types[i]) {
426                         case HAMMER2_PFSTYPE_SUPROOT:
427                         case HAMMER2_PFSTYPE_MASTER:
428                                 break;
429                         default:
430                                 continue;
431                         }
432                         chain = cluster->array[i].chain;
433
434                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
435                                 /*
436                                  * Invalid as in unsynchronized, cannot be
437                                  * used to calculate the quorum.
438                                  */
439                         } else if (chain == NULL && quorum_tid == 0) {
440                                 /*
441                                  * NULL chain on master matches NULL chains
442                                  * on other masters.
443                                  */
444                                 ++nmasters;
445                         } else if (quorum_tid < last_best_quorum_tid &&
446                                    chain != NULL &&
447                                    (quorum_tid < chain->bref.modify_tid ||
448                                     nmasters == 0)) {
449                                 /*
450                                  * Better TID located, reset nmasters count.
451                                  */
452                                 nmasters = 1;
453                                 quorum_tid = chain->bref.modify_tid;
454                         } else if (chain &&
455                                    quorum_tid == chain->bref.modify_tid) {
456                                 /*
457                                  * TID matches current collection.
458                                  */
459                                 ++nmasters;
460                         }
461                 }
462                 if (nmasters >= nquorum)
463                         break;
464                 last_best_quorum_tid = quorum_tid;
465         }
466
467         /*
468          * Pass 3
469          *
470          * NOTE: A NULL chain is not necessarily an error, it could be
471          *       e.g. a lookup failure or the end of an iteration.
472          *       Process normally.
473          */
474         for (i = 0; i < cluster->nchains; ++i) {
475                 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
476                 chain = cluster->array[i].chain;
477                 if (chain && chain->error) {
478                         if (cluster->focus == NULL || cluster->focus == chain) {
479                                 /* error will be overridden by valid focus */
480                                 cluster->error = chain->error;
481                         }
482                         continue;
483                 }
484
485                 switch (cluster->pmp->pfs_types[i]) {
486                 case HAMMER2_PFSTYPE_MASTER:
487                         /*
488                          * We must have enough up-to-date masters to reach
489                          * a quorum and the master modify_tid must match
490                          * the quorum's modify_tid.
491                          *
492                          * Do not select an errored or out-of-sync master.
493                          */
494                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
495                                 nflags |= HAMMER2_CLUSTER_UNHARD;
496                         } else if (nmasters >= nquorum &&
497                                    (chain == NULL || chain->error == 0) &&
498                                    ((chain == NULL && quorum_tid == 0) ||
499                                     (chain != NULL && quorum_tid ==
500                                                   chain->bref.modify_tid))) {
501                                 nflags |= HAMMER2_CLUSTER_WRHARD;
502                                 nflags |= HAMMER2_CLUSTER_RDHARD;
503                                 if (!smpresent) {
504                                         cluster->array[i].flags |=
505                                                         HAMMER2_CITEM_FEMOD;
506                                 }
507                                 if (cluster->focus == NULL ||
508                                     focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
509                                         focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
510                                         cluster->focus_index = i;
511                                         cluster->focus = chain; /* NULL ok */
512                                         cluster->error = chain ? chain->error :
513                                                                  0;
514                                 }
515                         } else if (chain == NULL || chain->error == 0) {
516                                 nflags |= HAMMER2_CLUSTER_UNHARD;
517                         }
518                         break;
519                 case HAMMER2_PFSTYPE_SLAVE:
520                         /*
521                          * We must have enough up-to-date masters to reach
522                          * a quorum and the slave modify_tid must match the
523                          * quorum's modify_tid.
524                          *
525                          * Do not select an errored slave.
526                          */
527                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
528                                 nflags |= HAMMER2_CLUSTER_UNHARD;
529                         } else if (nmasters >= nquorum &&
530                                    (chain == NULL || chain->error == 0) &&
531                                    ((chain == NULL && quorum_tid == 0) ||
532                                     (chain && quorum_tid ==
533                                               chain->bref.modify_tid))) {
534                                 ++nslaves;
535                                 nflags |= HAMMER2_CLUSTER_RDHARD;
536 #if 0
537                                 /* XXX optimize for RESOLVE_RDONLY */
538                                 if (cluster->focus == NULL) {
539                                         focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
540                                         cluster->focus_index = i;
541                                         cluster->focus = chain; /* NULL ok */
542                                         cluster->error = chain ? chain->error :
543                                                                  0;
544                                 }
545 #endif
546                         } else if (chain == NULL || chain->error == 0) {
547                                 nflags |= HAMMER2_CLUSTER_UNSOFT;
548                         }
549                         break;
550                 case HAMMER2_PFSTYPE_SOFT_MASTER:
551                         /*
552                          * Directly mounted soft master always wins.  There
553                          * should be only one.
554                          */
555                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
556                         cluster->focus_index = i;
557                         cluster->focus = chain;
558                         cluster->error = chain ? chain->error : 0;
559                         focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
560                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
561                         break;
562                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
563                         /*
564                          * Directly mounted soft slave always wins.  There
565                          * should be only one.
566                          */
567                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
568                         if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
569                                 cluster->focus_index = i;
570                                 cluster->focus = chain;
571                                 cluster->error = chain ? chain->error : 0;
572                                 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
573                         }
574                         break;
575                 case HAMMER2_PFSTYPE_SUPROOT:
576                         /*
577                          * spmp (degenerate case)
578                          */
579                         KKASSERT(i == 0);
580                         cluster->focus_index = i;
581                         cluster->focus = chain;
582                         cluster->error = chain ? chain->error : 0;
583                         focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
584                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
585                         break;
586                 default:
587                         break;
588                 }
589         }
590
591         /*
592          * Focus now set, adjust ddflag.  Skip this pass if the focus
593          * is bad or if we are at the PFS root (the bref won't match at
594          * the PFS root, obviously).
595          */
596         focus = cluster->focus;
597         if (focus) {
598                 cluster->ddflag =
599                         (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
600         } else {
601                 cluster->ddflag = 0;
602                 goto skip4;
603         }
604         if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
605                 goto skip4;
606
607         /*
608          * Pass 4
609          *
610          * Validate the elements that were not marked invalid.  They should
611          * match.
612          */
613         for (i = 0; i < cluster->nchains; ++i) {
614                 int ddflag;
615
616                 chain = cluster->array[i].chain;
617
618                 if (chain == NULL)
619                         continue;
620                 if (chain == focus)
621                         continue;
622                 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
623                         continue;
624
625                 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
626                 if (chain->bref.type != focus->bref.type ||
627                     chain->bref.key != focus->bref.key ||
628                     chain->bref.keybits != focus->bref.keybits ||
629                     chain->bref.modify_tid != focus->bref.modify_tid ||
630                     chain->bytes != focus->bytes ||
631                     ddflag != cluster->ddflag) {
632                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
633                         if (hammer2_debug & 1)
634                         kprintf("cluster_resolve: matching modify_tid failed "
635                                 "bref test: idx=%d type=%02x/%02x "
636                                 "key=%016jx/%d-%016jx/%d "
637                                 "mod=%016jx/%016jx bytes=%u/%u\n",
638                                 i,
639                                 chain->bref.type, focus->bref.type,
640                                 chain->bref.key, chain->bref.keybits,
641                                 focus->bref.key, focus->bref.keybits,
642                                 chain->bref.modify_tid, focus->bref.modify_tid,
643                                 chain->bytes, focus->bytes);
644                         if (hammer2_debug & 0x4000)
645                                 panic("cluster_resolve");
646                         /* flag issue and force resync? */
647                 }
648         }
649 skip4:
650
651         if (ttlslaves == 0)
652                 nflags |= HAMMER2_CLUSTER_NOSOFT;
653         if (ttlmasters == 0)
654                 nflags |= HAMMER2_CLUSTER_NOHARD;
655
656         /*
657          * Set SSYNCED or MSYNCED for slaves and masters respectively if
658          * all available nodes (even if 0 are available) are fully
659          * synchronized.  This is used by the synchronization thread to
660          * determine if there is work it could potentially accomplish.
661          */
662         if (nslaves == ttlslaves)
663                 nflags |= HAMMER2_CLUSTER_SSYNCED;
664         if (nmasters == ttlmasters)
665                 nflags |= HAMMER2_CLUSTER_MSYNCED;
666
667         /*
668          * Determine if the cluster was successfully locked for the
669          * requested operation and generate an error code.  The cluster
670          * will not be locked (or ref'd) if an error is returned.
671          */
672         atomic_set_int(&cluster->flags, nflags);
673         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
674 }
675
676 /*
677  * This is used by the XOPS subsystem to calculate the state of
678  * the collection and tell hammer2_xop_collect() what to do with it.
679  * The collection can be in various states of desynchronization, the
680  * caller specifically wants to resolve the passed-in key.
681  *
682  * Return values:
683  *      0               - Quorum agreement, key is valid
684  *
685  *      ENOENT          - Quorum agreement, end of scan
686  *
687  *      ESRCH           - Quorum agreement, key is INVALID (caller should
688  *                        skip key).
689  *
690  *      EIO             - Quorum agreement but all elements had errors.
691  *
692  *      EDEADLK         - No quorum agreement possible for key, a repair
693  *                        may be needed.  Caller has to decide what to do,
694  *                        possibly iterating the key or generating an EIO.
695  *
696  *      EINPROGRESS     - No quorum agreement yet, but agreement is still
697  *                        possible if caller waits for more responses.  Caller
698  *                        should not iterate key.
699  *
700  * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
701  *
702  * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
703  */
704 int
705 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
706 {
707         hammer2_chain_t *chain;
708         hammer2_chain_t *focus;
709         hammer2_pfs_t *pmp;
710         hammer2_tid_t quorum_tid;
711         hammer2_tid_t last_best_quorum_tid;
712         uint32_t nflags;
713         int ttlmasters;
714         int ttlslaves;
715         int nmasters;
716         int nmasters_keymatch;
717         int nslaves;
718         int nquorum;
719         int umasters;   /* unknown masters (still in progress) */
720         int smpresent;
721         int error;
722         int i;
723
724         cluster->error = 0;
725         cluster->focus = NULL;
726
727         pmp = cluster->pmp;
728         KKASSERT(pmp != NULL || cluster->nchains == 0);
729
730         /*
731          * Calculate quorum
732          */
733         nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
734         smpresent = 0;
735         nflags = 0;
736         ttlmasters = 0;
737         ttlslaves = 0;
738         nmasters = 0;
739         nmasters_keymatch = 0;
740         umasters = 0;
741         nslaves = 0;
742
743
744         /*
745          * Pass 1
746          *
747          * NOTE: A NULL chain is not necessarily an error, it could be
748          *       e.g. a lookup failure or the end of an iteration.
749          *       Process normally.
750          */
751         for (i = 0; i < cluster->nchains; ++i) {
752                 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
753                 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
754
755                 chain = cluster->array[i].chain;
756                 error = cluster->array[i].error;
757                 if (chain && error) {
758                         if (cluster->focus == NULL || cluster->focus == chain) {
759                                 /* error will be overridden by valid focus */
760                                 /* XXX */
761                         }
762
763                         /*
764                          * Must count total masters and slaves whether the
765                          * chain is errored or not.
766                          */
767                         switch (cluster->pmp->pfs_types[i]) {
768                         case HAMMER2_PFSTYPE_SUPROOT:
769                         case HAMMER2_PFSTYPE_MASTER:
770                                 ++ttlmasters;
771                                 break;
772                         case HAMMER2_PFSTYPE_SLAVE:
773                                 ++ttlslaves;
774                                 break;
775                         }
776                         continue;
777                 }
778                 switch (cluster->pmp->pfs_types[i]) {
779                 case HAMMER2_PFSTYPE_MASTER:
780                         ++ttlmasters;
781                         break;
782                 case HAMMER2_PFSTYPE_SLAVE:
783                         ++ttlslaves;
784                         break;
785                 case HAMMER2_PFSTYPE_SOFT_MASTER:
786                         nflags |= HAMMER2_CLUSTER_WRSOFT;
787                         nflags |= HAMMER2_CLUSTER_RDSOFT;
788                         smpresent = 1;
789                         break;
790                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
791                         nflags |= HAMMER2_CLUSTER_RDSOFT;
792                         break;
793                 case HAMMER2_PFSTYPE_SUPROOT:
794                         /*
795                          * Degenerate cluster representing the super-root
796                          * topology on a single device.  Fake stuff so
797                          * cluster ops work as expected.
798                          */
799                         ++ttlmasters;
800                         nflags |= HAMMER2_CLUSTER_WRHARD;
801                         nflags |= HAMMER2_CLUSTER_RDHARD;
802                         cluster->focus_index = i;
803                         cluster->focus = chain;
804                         cluster->error = error;
805                         break;
806                 default:
807                         break;
808                 }
809         }
810
811         /*
812          * Pass 2
813          *
814          * Resolve nmasters             - master nodes fully match
815          *
816          * Resolve umasters             - master nodes operation still
817          *                                in progress
818          *
819          * Resolve nmasters_keymatch    - master nodes match the passed-in
820          *                                key and may or may not match
821          *                                the quorum-agreed tid.
822          * 
823          * The quorum-agreed TID is the highest matching TID.
824          */
825         last_best_quorum_tid = HAMMER2_TID_MAX;
826         quorum_tid = 0;         /* fix gcc warning */
827
828         while (nmasters < nquorum && last_best_quorum_tid != 0) {
829                 nmasters = 0;
830                 quorum_tid = 0;
831
832                 for (i = 0; i < cluster->nchains; ++i) {
833                         /* XXX SOFT smpresent handling */
834                         switch(cluster->pmp->pfs_types[i]) {
835                         case HAMMER2_PFSTYPE_MASTER:
836                         case HAMMER2_PFSTYPE_SUPROOT:
837                                 break;
838                         default:
839                                 continue;
840                         }
841
842                         chain = cluster->array[i].chain;
843                         error = cluster->array[i].error;
844
845                         /*
846                          * Skip elements still in progress.  umasters keeps
847                          * track of masters that might still be in-progress.
848                          */
849                         if (chain == NULL && (cluster->array[i].flags &
850                                               HAMMER2_CITEM_NULL) == 0) {
851                                 ++umasters;
852                                 continue;
853                         }
854
855                         /*
856                          * Key match?
857                          */
858                         if (flags & HAMMER2_CHECK_NULL) {
859                                 if (chain == NULL) {
860                                         ++nmasters;
861                                         ++nmasters_keymatch;
862                                         if (cluster->error == 0)
863                                                 cluster->error = error;
864                                 }
865                         } else if (chain &&
866                                    (key == (hammer2_key_t)-1 ||
867                                     chain->bref.key == key)) {
868                                 ++nmasters_keymatch;
869                                 if (quorum_tid < last_best_quorum_tid &&
870                                     (quorum_tid < chain->bref.modify_tid ||
871                                      nmasters == 0)) {
872                                         /*
873                                          * Better TID located, reset
874                                          * nmasters count.
875                                          */
876                                         nmasters = 0;
877                                         quorum_tid = chain->bref.modify_tid;
878                                 }
879                                 if (quorum_tid == chain->bref.modify_tid) {
880                                         /*
881                                          * TID matches current collection.
882                                          *
883                                          * (error handled in next pass)
884                                          */
885                                         ++nmasters;
886                                         if (chain->error == 0) {
887                                                 cluster->focus = chain;
888                                                 cluster->focus_index = i;
889                                         }
890                                 }
891                         }
892                 }
893                 if (nmasters >= nquorum)
894                         break;
895                 last_best_quorum_tid = quorum_tid;
896         }
897
898         /*
899         kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
900                 nmasters, nquorum, nmasters_keymatch, umasters);
901         */
902
903         /*
904          * Early return if we do not have enough masters.
905          */
906         if (nmasters < nquorum) {
907                 if (nmasters + umasters >= nquorum)
908                         return EINPROGRESS;
909                 if (nmasters_keymatch < nquorum) 
910                         return ESRCH;
911                 return EDEADLK;
912         }
913
914         /*
915          * Validated end of scan.
916          */
917         if (flags & HAMMER2_CHECK_NULL) {
918                 if (cluster->error == 0)
919                         cluster->error = ENOENT;
920                 return cluster->error;
921         }
922
923         /*
924          * If we have a NULL focus at this point the agreeing quorum all
925          * had chain errors.
926          */
927         if (cluster->focus == NULL)
928                 return EIO;
929
930         /*
931          * Pass 3
932          *
933          * We have quorum agreement, validate elements, not end of scan.
934          */
935         cluster->error = 0;
936         for (i = 0; i < cluster->nchains; ++i) {
937                 chain = cluster->array[i].chain;
938                 error = cluster->array[i].error;
939                 if (chain == NULL ||
940                     chain->bref.key != key ||
941                     chain->bref.modify_tid != quorum_tid) {
942                         continue;
943                 }
944
945                 /*
946                  * Quorum Match
947                  *
948                  * XXX for now, cumulative error.
949                  */
950                 if (cluster->error == 0)
951                         cluster->error = error;
952
953                 switch (cluster->pmp->pfs_types[i]) {
954                 case HAMMER2_PFSTYPE_MASTER:
955                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
956                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
957                         nflags |= HAMMER2_CLUSTER_WRHARD;
958                         nflags |= HAMMER2_CLUSTER_RDHARD;
959                         break;
960                 case HAMMER2_PFSTYPE_SLAVE:
961                         /*
962                          * We must have enough up-to-date masters to reach
963                          * a quorum and the slave modify_tid must match the
964                          * quorum's modify_tid.
965                          *
966                          * Do not select an errored slave.
967                          */
968                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
969                         nflags |= HAMMER2_CLUSTER_RDHARD;
970                         ++nslaves;
971                         break;
972                 case HAMMER2_PFSTYPE_SOFT_MASTER:
973                         /*
974                          * Directly mounted soft master always wins.  There
975                          * should be only one.
976                          */
977                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
978                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
979                         break;
980                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
981                         /*
982                          * Directly mounted soft slave always wins.  There
983                          * should be only one.
984                          *
985                          * XXX
986                          */
987                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
988                         break;
989                 case HAMMER2_PFSTYPE_SUPROOT:
990                         /*
991                          * spmp (degenerate case)
992                          */
993                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
994                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
995                         nflags |= HAMMER2_CLUSTER_WRHARD;
996                         nflags |= HAMMER2_CLUSTER_RDHARD;
997                         break;
998                 default:
999                         break;
1000                 }
1001         }
1002
1003         /*
1004          * Focus now set, adjust ddflag.  Skip this pass if the focus
1005          * is bad or if we are at the PFS root (the bref won't match at
1006          * the PFS root, obviously).
1007          */
1008         focus = cluster->focus;
1009         if (focus) {
1010                 cluster->ddflag =
1011                         (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
1012         } else {
1013                 cluster->ddflag = 0;
1014                 goto skip4;
1015         }
1016         if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1017                 goto skip4;
1018
1019         /*
1020          * Pass 4
1021          *
1022          * Validate the elements that were not marked invalid.  They should
1023          * match.
1024          */
1025         for (i = 0; i < cluster->nchains; ++i) {
1026                 int ddflag;
1027
1028                 chain = cluster->array[i].chain;
1029
1030                 if (chain == NULL)
1031                         continue;
1032                 if (chain == focus)
1033                         continue;
1034                 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1035                         continue;
1036
1037                 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1038                 if (chain->bref.type != focus->bref.type ||
1039                     chain->bref.key != focus->bref.key ||
1040                     chain->bref.keybits != focus->bref.keybits ||
1041                     chain->bref.modify_tid != focus->bref.modify_tid ||
1042                     chain->bytes != focus->bytes ||
1043                     ddflag != cluster->ddflag) {
1044                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1045                         if (hammer2_debug & 1)
1046                         kprintf("cluster_resolve: matching modify_tid failed "
1047                                 "bref test: idx=%d type=%02x/%02x "
1048                                 "key=%016jx/%d-%016jx/%d "
1049                                 "mod=%016jx/%016jx bytes=%u/%u\n",
1050                                 i,
1051                                 chain->bref.type, focus->bref.type,
1052                                 chain->bref.key, chain->bref.keybits,
1053                                 focus->bref.key, focus->bref.keybits,
1054                                 chain->bref.modify_tid, focus->bref.modify_tid,
1055                                 chain->bytes, focus->bytes);
1056                         if (hammer2_debug & 0x4000)
1057                                 panic("cluster_resolve");
1058                         /* flag issue and force resync? */
1059                 }
1060         }
1061 skip4:
1062
1063         if (ttlslaves == 0)
1064                 nflags |= HAMMER2_CLUSTER_NOSOFT;
1065         if (ttlmasters == 0)
1066                 nflags |= HAMMER2_CLUSTER_NOHARD;
1067
1068         /*
1069          * Set SSYNCED or MSYNCED for slaves and masters respectively if
1070          * all available nodes (even if 0 are available) are fully
1071          * synchronized.  This is used by the synchronization thread to
1072          * determine if there is work it could potentially accomplish.
1073          */
1074         if (nslaves == ttlslaves)
1075                 nflags |= HAMMER2_CLUSTER_SSYNCED;
1076         if (nmasters == ttlmasters)
1077                 nflags |= HAMMER2_CLUSTER_MSYNCED;
1078
1079         /*
1080          * Determine if the cluster was successfully locked for the
1081          * requested operation and generate an error code.  The cluster
1082          * will not be locked (or ref'd) if an error is returned.
1083          */
1084         atomic_set_int(&cluster->flags, nflags);
1085         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
1086
1087         return cluster->error;
1088 }
1089
1090 /*
1091  * This is used by the sync thread to force non-NULL elements of a copy
1092  * of the pmp->iroot cluster to be good which is required to prime the
1093  * sync.
1094  */
1095 void
1096 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
1097 {
1098         int i;
1099
1100         for (i = 0; i < cluster->nchains; ++i) {
1101                 if (cluster->array[i].chain)
1102                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1103         }
1104 }
1105
1106 /*
1107  * Copy a cluster, returned a ref'd cluster.  All underlying chains
1108  * are also ref'd, but not locked.  Focus state is also copied.
1109  *
1110  * Original cluster does not have to be locked but usually is.
1111  * New cluster will not be flagged as locked.
1112  *
1113  * Callers using this function to initialize a new cluster from an inode
1114  * generally lock and resolve the resulting cluster.
1115  *
1116  * Callers which use this function to save/restore a cluster structure
1117  * generally retain the focus state and do not re-resolve it.  Caller should
1118  * not try to re-resolve internal (cparent) node state during an iteration
1119  * as the individual tracking elements of cparent in an iteration may not
1120  * match even though they are correct.
1121  */
1122 hammer2_cluster_t *
1123 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
1124 {
1125         hammer2_pfs_t *pmp = ocluster->pmp;
1126         hammer2_cluster_t *ncluster;
1127         hammer2_chain_t *chain;
1128         int i;
1129
1130         ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
1131         ncluster->pmp = pmp;
1132         ncluster->nchains = ocluster->nchains;
1133         ncluster->refs = 1;
1134
1135         for (i = 0; i < ocluster->nchains; ++i) {
1136                 chain = ocluster->array[i].chain;
1137                 ncluster->array[i].chain = chain;
1138                 ncluster->array[i].flags = ocluster->array[i].flags;
1139                 if (chain)
1140                         hammer2_chain_ref(chain);
1141         }
1142         ncluster->focus_index = ocluster->focus_index;
1143         ncluster->focus = ocluster->focus;
1144         ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED |
1145                                               HAMMER2_CLUSTER_INODE);
1146
1147         return (ncluster);
1148 }
1149
1150 /*
1151  * Unlock a cluster.  Refcount and focus is maintained.
1152  */
1153 void
1154 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
1155 {
1156         hammer2_chain_t *chain;
1157         int i;
1158
1159         if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
1160                 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1161                         cluster);
1162         }
1163         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
1164         KKASSERT(cluster->refs > 0);
1165         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
1166
1167         for (i = 0; i < cluster->nchains; ++i) {
1168                 chain = cluster->array[i].chain;
1169                 if (chain)
1170                         hammer2_chain_unlock(chain);
1171         }
1172 }
1173
1174 /************************************************************************
1175  *                              CLUSTER I/O                             *
1176  ************************************************************************
1177  *
1178  *
1179  * WARNING! blockref[] array data is not universal.  These functions should
1180  *          only be used to access universal data.
1181  *
1182  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1183  *          complete if necessary.  The I/O's should have already been
1184  *          initiated by the cluster_lock/chain_lock operation.
1185  *
1186  *          The cluster must already be in a modified state before wdata
1187  *          is called.  The data will already be available for this case.
1188  */
1189 const hammer2_media_data_t *
1190 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1191 {
1192         KKASSERT(cluster->focus != NULL);
1193         return(cluster->focus->data);
1194 }
1195
1196 hammer2_media_data_t *
1197 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1198 {
1199         KKASSERT(cluster->focus != NULL);
1200         KKASSERT(hammer2_cluster_modified(cluster));
1201         return(cluster->focus->data);
1202 }