sys/vfs/hammer2/hammer2_cluster.c
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *          locks and I/O, do quorum and/or master-slave processing, and
50  *          it must operate properly even if some nodes are broken (which
51  *          can also mean indefinite locks).
52  *
53  *                              CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *              hammer2_cluster_lock()
59  *              hammer2_cluster_parent()
60  *
61  *      - Most complex functions, quorum management on transaction ids.
62  *
63  *      - Locking and data accesses must be internally asynchronous.
64  *
65  *      - Validate and manage cache coherency primitives (cache state
66  *        is stored in chain topologies but must be validated by these
67  *        functions).
68  *
69  * (2) Lookups and Scans
70  *              hammer2_cluster_lookup()
71  *              hammer2_cluster_next()
72  *
73  *      - Depend on locking & data retrieval functions, but still complex.
74  *
75  *      - Must do quorum management on transaction ids.
76  *
77  *      - Lookup and iteration ops must be internally asynchronous.
78  *
79  * (3) Modifying Operations
80  *              hammer2_cluster_create()
81  *              hammer2_cluster_rename()
82  *              hammer2_cluster_delete()
83  *              hammer2_cluster_modify()
84  *              hammer2_cluster_modsync()
85  *
86  *      - Can usually punt on failures; the operation continues unless quorum
87  *        is lost.  If quorum is lost, must wait for resynchronization
88  *        (depending on the management mode).
89  *
90  *      - Must disconnect the node on failure (and not flush), remount, and
91  *        resynchronize.
92  *
93  *      - Network links (via kdmsg) are relatively easy to issue as the
94  *        complex underworkings of hammer2_chain.c don't have to be messed
95  *        with (the protocol is at a higher level than block-level).
96  *
97  *      - Multiple local disk nodes (i.e. block devices) are another matter.
98  *        Chain operations have to be dispatched to per-node threads (xN)
99  *        because we can't asynchronize potentially very complex chain
100  *        operations in hammer2_chain.c (it would be a huge mess).
101  *
102  *        (these threads are also used to terminate incoming kdmsg ops from
103  *        other machines).
104  *
105  *      - Single-node filesystems do not use threads and will simply call
106  *        hammer2_chain.c functions directly.  This short-cut is handled
107  *        at the base of each cluster function.
108  */
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
115
116 #include "hammer2.h"
117
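/*
 * Illustrative usage sketch (not part of the build): a minimal example of
 * how a front-end might drive the lookup/iteration API defined later in
 * this file.  The parent cluster is assumed to come from a locked inode,
 * the key range and flags are arbitrary, and error handling is omitted.
 * The hammer2_cluster_next() signature is assumed to mirror
 * hammer2_cluster_lookup().
 */
#if 0
static void
example_cluster_scan(hammer2_cluster_t *cparent)
{
        hammer2_cluster_t *cscan;
        hammer2_cluster_t *cluster;
        hammer2_key_t key_next;

        /* lock and resolve a private copy of the parent for the scan */
        cscan = hammer2_cluster_lookup_init(cparent, 0);

        cluster = hammer2_cluster_lookup(cscan, &key_next,
                                         0, HAMMER2_KEY_MAX, 0);
        while (cluster) {
                /* examine cluster->focus via hammer2_cluster_bref(), etc */
                cluster = hammer2_cluster_next(cscan, cluster, &key_next,
                                               key_next, HAMMER2_KEY_MAX, 0);
        }

        /* unlock and drop the private parent copy */
        hammer2_cluster_lookup_done(cscan);
}
#endif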
118 /*
119  * Returns the bref type of the cluster's focus.
120  *
121  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
122  * The cluster must be locked.
123  */
124 uint8_t
125 hammer2_cluster_type(hammer2_cluster_t *cluster)
126 {
127         if (cluster->error == 0) {
128                 KKASSERT(cluster->focus != NULL);
129                 return(cluster->focus->bref.type);
130         }
131         return 0;
132 }
133
134 /*
135  * Returns non-zero if the cluster's focus is flagged as being modified.
136  *
137  * If the cluster is errored, returns 0.
138  */
139 static
140 int
141 hammer2_cluster_modified(hammer2_cluster_t *cluster)
142 {
143         if (cluster->error == 0) {
144                 KKASSERT(cluster->focus != NULL);
145                 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
146         }
147         return 0;
148 }
149
150 /*
151  * Returns the bref of the cluster's focus, sans any data-offset information
152  * (since offset information is per-node and wouldn't be useful).
153  *
154  * Callers use this function to access modify_tid, mirror_tid, type,
155  * key, and keybits.
156  *
157  * If the cluster is errored, returns an empty bref.
158  * The cluster must be locked.
159  */
160 void
161 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
162 {
163         if (cluster->error == 0) {
164                 KKASSERT(cluster->focus != NULL);
165                 *bref = cluster->focus->bref;
166                 bref->data_off = 0;
167         } else {
168                 bzero(bref, sizeof(*bref));
169         }
170 }
171
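/*
 * Illustrative usage sketch (not part of the build): reading the focus
 * type and bref of a locked cluster with the accessors above.  The check
 * performed is arbitrary; note that bref.data_off has been zeroed by
 * hammer2_cluster_bref().
 */
#if 0
static int
example_focus_is_inode(hammer2_cluster_t *cluster)
{
        hammer2_blockref_t bref;

        if (hammer2_cluster_type(cluster) != HAMMER2_BREF_TYPE_INODE)
                return 0;
        hammer2_cluster_bref(cluster, &bref);
        return (bref.modify_tid != 0);
}
#endif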
172 /*
173  * Create a degenerate cluster with one ref from a single locked chain.
174  * The returned cluster will be focused on the chain and inherit its
175  * error state.
176  *
177  * The chain's lock and reference are transferred to the new cluster, so
178  * the caller should not try to unlock the chain separately.
179  *
180  * We fake the flags.
181  */
182 hammer2_cluster_t *
183 hammer2_cluster_from_chain(hammer2_chain_t *chain)
184 {
185         hammer2_cluster_t *cluster;
186
187         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
188         cluster->array[0].chain = chain;
189         cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
190         cluster->nchains = 1;
191         cluster->focus = chain;
192         cluster->focus_index = 0;
193         cluster->pmp = chain->pmp;
194         cluster->refs = 1;
195         cluster->error = chain->error;
196         cluster->flags = HAMMER2_CLUSTER_LOCKED |
197                          HAMMER2_CLUSTER_WRHARD |
198                          HAMMER2_CLUSTER_RDHARD |
199                          HAMMER2_CLUSTER_MSYNCED |
200                          HAMMER2_CLUSTER_SSYNCED;
201
202         return cluster;
203 }
204
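/*
 * Illustrative usage sketch (not part of the build): wrapping a single
 * locked, referenced chain in a degenerate cluster so cluster-aware code
 * can operate on it.  The chain's lock and ref are absorbed by the
 * cluster, so only the cluster is unlocked and dropped afterwards.
 */
#if 0
static void
example_wrap_chain(hammer2_chain_t *chain)
{
        hammer2_cluster_t *cluster;

        cluster = hammer2_cluster_from_chain(chain);
        /* ... pass cluster to cluster-level code ... */
        hammer2_cluster_unlock(cluster);        /* unlocks the chain */
        hammer2_cluster_drop(cluster);          /* drops the chain ref */
}
#endif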
205 /*
206  * Add a reference to a cluster and its underlying chains.
207  *
208  * We must also ref the underlying chains in order to allow ref/unlock
209  * sequences to later re-lock.
210  */
211 void
212 hammer2_cluster_ref(hammer2_cluster_t *cluster)
213 {
214         atomic_add_int(&cluster->refs, 1);
215 }
216
217 /*
218  * Drop the caller's reference to the cluster.  When the ref count drops to
219  * zero this function frees the cluster and drops all underlying chains.
220  *
221  * In-progress read I/Os are typically detached from the cluster once the
222  * first one returns (the remaining stay attached to the DIOs but are then
223  * ignored and drop naturally).
224  */
225 void
226 hammer2_cluster_drop(hammer2_cluster_t *cluster)
227 {
228         hammer2_chain_t *chain;
229         int i;
230
231         KKASSERT(cluster->refs > 0);
232         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
233                 cluster->focus = NULL;          /* safety XXX chg to assert */
234                 cluster->focus_index = 0;
235
236                 for (i = 0; i < cluster->nchains; ++i) {
237                         chain = cluster->array[i].chain;
238                         if (chain) {
239                                 hammer2_chain_drop(chain);
240                                 cluster->array[i].chain = NULL; /* safety */
241                         }
242                 }
243                 cluster->nchains = 0;                           /* safety */
244
245                 kfree(cluster, M_HAMMER2);
246                 /* cluster is invalid */
247         }
248 }
249
250 /*
251  * Lock a cluster.  Cluster must already be referenced.  Focus is maintained. 
252  *
253  * WARNING! This function expects the caller to handle resolution of the
254  *          cluster.  We never re-resolve the cluster in this function,
255  *          because it might be used to temporarily unlock/relock a cparent
256  *          in an iteration or recursion, and the cparent's elements do not
257  *          necessarily match.
258  */
259 void
260 hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how)
261 {
262         hammer2_chain_t *chain;
263         int i;
264
265         /* cannot be on inode-embedded cluster template, must be on copy */
266         KKASSERT(cluster->refs > 0);
267         KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
268         if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
269                 panic("hammer2_cluster_lock: cluster %p already locked!\n",
270                         cluster);
271         }
272         atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
273
274         /*
275          * Lock chains and resolve state.
276          */
277         for (i = 0; i < cluster->nchains; ++i) {
278                 if (i == idx)
279                         continue;
280                 chain = cluster->array[i].chain;
281                 if (chain == NULL)
282                         continue;
283                 hammer2_chain_lock(chain, how);
284         }
285 }
286
287 void
288 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
289 {
290         hammer2_cluster_lock_except(cluster, -1, how);
291 }
292
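/*
 * Illustrative usage sketch (not part of the build): the nominal
 * ref'd-but-unlocked -> lock -> resolve -> unlock cycle.  Note that
 * hammer2_cluster_lock() only acquires the chain locks; the caller
 * re-resolves explicitly when focus/quorum state must be recomputed.
 */
#if 0
static void
example_lock_cycle(hammer2_cluster_t *cluster)
{
        hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
        hammer2_cluster_resolve(cluster);
        /* ... access cluster->focus and its media data ... */
        hammer2_cluster_unlock(cluster);
}
#endif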
293 /*
294  * Calculate the clustering state for the cluster and set its focus.
295  * This routine must be called with care.  For example, it should not
296  * normally be called after relocking a non-leaf cluster because parent
297  * clusters help iterations and each element might be at a slightly different
298  * indirect node (each node's topology is independently indexed).
299  *
300  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
301  * operations.  Typically this is only set on a quorum of MASTERs or
302  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
303  * is present, this bit is *not* set on a quorum of MASTERs.  The
304  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
305  * that create/modify/delete elements use it.
306  *
307  * The chains making up the cluster may be narrowed down based on quorum
308  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
309  * narrowed down to a single chain as long as the entire subtopology is known
310  * to be intact.  So, for example, we can narrow a read-only op to a single
311  * fast SLAVE but if we focus a CACHE chain we must still retain at least
312  * a SLAVE to ensure that the subtopology can be accessed.
313  *
314  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
315  * to be maintained once the topology is validated as-of the top level of
316  * the operation.
317  *
318  * If a failure occurs the operation must be aborted by higher-level code and
319  * retried. XXX
320  */
321 void
322 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
323 {
324         hammer2_chain_t *chain;
325         hammer2_chain_t *focus;
326         hammer2_pfs_t *pmp;
327         hammer2_tid_t quorum_tid;
328         hammer2_tid_t last_best_quorum_tid;
329         int focus_pfs_type;
330         uint32_t nflags;
331         int ttlmasters;
332         int ttlslaves;
333         int nmasters;
334         int nslaves;
335         int nquorum;
336         int smpresent;
337         int i;
338
339         cluster->error = 0;
340         cluster->focus = NULL;
341
342         focus_pfs_type = 0;
343         nflags = 0;
344         ttlmasters = 0;
345         ttlslaves = 0;
346         nmasters = 0;
347         nslaves = 0;
348
349         /*
350          * Calculate quorum
351          */
352         pmp = cluster->pmp;
353         KKASSERT(pmp != NULL || cluster->nchains == 0);
354         nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
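        /*
         * Worked example (illustrative): with pfs_nmasters == 3 the
         * quorum is 2, with 4 it is 3, and a single-master PFS
         * degenerates to a quorum of 1.
         */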
355         smpresent = 0;
356
357         /*
358          * Pass 1
359          *
360          * NOTE: A NULL chain is not necessarily an error, it could be
361          *       e.g. a lookup failure or the end of an iteration.
362          *       Process normally.
363          */
364         for (i = 0; i < cluster->nchains; ++i) {
365                 chain = cluster->array[i].chain;
366                 if (chain && chain->error) {
367                         if (cluster->focus == NULL || cluster->focus == chain) {
368                                 /* error will be overridden by valid focus */
369                                 cluster->error = chain->error;
370                         }
371
372                         /*
373                          * Must count total masters and slaves whether the
374                          * chain is errored or not.
375                          */
376                         switch (cluster->pmp->pfs_types[i]) {
377                         case HAMMER2_PFSTYPE_MASTER:
378                                 ++ttlmasters;
379                                 break;
380                         case HAMMER2_PFSTYPE_SLAVE:
381                                 ++ttlslaves;
382                                 break;
383                         }
384                         continue;
385                 }
386                 switch (cluster->pmp->pfs_types[i]) {
387                 case HAMMER2_PFSTYPE_MASTER:
388                         ++ttlmasters;
389                         break;
390                 case HAMMER2_PFSTYPE_SLAVE:
391                         ++ttlslaves;
392                         break;
393                 case HAMMER2_PFSTYPE_SOFT_MASTER:
394                         nflags |= HAMMER2_CLUSTER_WRSOFT;
395                         nflags |= HAMMER2_CLUSTER_RDSOFT;
396                         smpresent = 1;
397                         break;
398                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
399                         nflags |= HAMMER2_CLUSTER_RDSOFT;
400                         break;
401                 case HAMMER2_PFSTYPE_SUPROOT:
402                         /*
403                          * Degenerate cluster representing the super-root
404                          * topology on a single device.  Fake stuff so
405                          * cluster ops work as expected.
406                          */
407                         nflags |= HAMMER2_CLUSTER_WRHARD;
408                         nflags |= HAMMER2_CLUSTER_RDHARD;
409                         cluster->focus_index = i;
410                         cluster->focus = chain;
411                         cluster->error = chain ? chain->error : 0;
412                         break;
413                 default:
414                         break;
415                 }
416         }
417
418         /*
419          * Pass 2
420          *
421          * Resolve masters.  Calculate nmasters for the highest matching
422          * TID, if a quorum cannot be attained try the next lower matching
423          * TID until we exhaust TIDs.
424          *
425          * NOTE: A NULL chain is not necessarily an error, it could be
426          *       e.g. a lookup failure or the end of an iteration.
427          *       Process normally.
428          */
429         last_best_quorum_tid = HAMMER2_TID_MAX;
430         quorum_tid = 0;         /* fix gcc warning */
431
432         while (nmasters < nquorum && last_best_quorum_tid != 0) {
433                 nmasters = 0;
434                 quorum_tid = 0;
435
436                 for (i = 0; i < cluster->nchains; ++i) {
437                         if (cluster->pmp->pfs_types[i] !=
438                             HAMMER2_PFSTYPE_MASTER) {
439                                 continue;
440                         }
441                         chain = cluster->array[i].chain;
442
443                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
444                                 /*
445                                  * Invalid as in unsynchronized, cannot be
446                                  * used to calculate the quorum.
447                                  */
448                         } else if (chain == NULL && quorum_tid == 0) {
449                                 /*
450                                  * NULL chain on master matches NULL chains
451                                  * on other masters.
452                                  */
453                                 ++nmasters;
454                         } else if (quorum_tid < last_best_quorum_tid &&
455                                    chain != NULL &&
456                                    (quorum_tid < chain->bref.modify_tid ||
457                                     nmasters == 0)) {
458                                 /*
459                                  * Better TID located, reset nmasters count.
460                                  */
461                                 nmasters = 1;
462                                 quorum_tid = chain->bref.modify_tid;
463                         } else if (chain &&
464                                    quorum_tid == chain->bref.modify_tid) {
465                                 /*
466                                  * TID matches current collection.
467                                  */
468                                 ++nmasters;
469                         }
470                 }
471                 if (nmasters >= nquorum)
472                         break;
473                 last_best_quorum_tid = quorum_tid;
474         }
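        /*
         * Worked example (illustrative): with three masters whose
         * modify_tids are 5, 5 and 3 and nquorum == 2, the loop settles
         * on quorum_tid == 5 with nmasters == 2 and exits.  If no TID
         * reaches quorum the loop retries against progressively lower
         * best TIDs until they are exhausted.
         */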
475
476         /*
477          * Pass 3
478          *
479          * NOTE: A NULL chain is not necessarily an error, it could be
480          *       e.g. a lookup failure or the end of an iteration.
481          *       Process normally.
482          */
483         for (i = 0; i < cluster->nchains; ++i) {
484                 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
485                 chain = cluster->array[i].chain;
486                 if (chain && chain->error) {
487                         if (cluster->focus == NULL || cluster->focus == chain) {
488                                 /* error will be overridden by valid focus */
489                                 cluster->error = chain->error;
490                         }
491                         continue;
492                 }
493
494                 switch (cluster->pmp->pfs_types[i]) {
495                 case HAMMER2_PFSTYPE_MASTER:
496                         /*
497                          * We must have enough up-to-date masters to reach
498                          * a quorum and the master modify_tid must match
499                          * the quorum's modify_tid.
500                          *
501                          * Do not select an errored or out-of-sync master.
502                          */
503                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
504                                 nflags |= HAMMER2_CLUSTER_UNHARD;
505                         } else if (nmasters >= nquorum &&
506                                    (chain == NULL || chain->error == 0) &&
507                                    ((chain == NULL && quorum_tid == 0) ||
508                                     (chain != NULL && quorum_tid ==
509                                                   chain->bref.modify_tid))) {
510                                 nflags |= HAMMER2_CLUSTER_WRHARD;
511                                 nflags |= HAMMER2_CLUSTER_RDHARD;
512                                 if (!smpresent) {
513                                         cluster->array[i].flags |=
514                                                         HAMMER2_CITEM_FEMOD;
515                                 }
516                                 if (cluster->focus == NULL ||
517                                     focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
518                                         focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
519                                         cluster->focus_index = i;
520                                         cluster->focus = chain; /* NULL ok */
521                                         cluster->error = chain ? chain->error :
522                                                                  0;
523                                 }
524                         } else if (chain == NULL || chain->error == 0) {
525                                 nflags |= HAMMER2_CLUSTER_UNHARD;
526                         }
527                         break;
528                 case HAMMER2_PFSTYPE_SLAVE:
529                         /*
530                          * We must have enough up-to-date masters to reach
531                          * a quorum and the slave modify_tid must match the
532                          * quorum's modify_tid.
533                          *
534                          * Do not select an errored slave.
535                          */
536                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
537                                 nflags |= HAMMER2_CLUSTER_UNHARD;
538                         } else if (nmasters >= nquorum &&
539                                    (chain == NULL || chain->error == 0) &&
540                                    ((chain == NULL && quorum_tid == 0) ||
541                                     (chain && quorum_tid ==
542                                               chain->bref.modify_tid))) {
543                                 ++nslaves;
544                                 nflags |= HAMMER2_CLUSTER_RDHARD;
545 #if 0
546                                 /* XXX optimize for RESOLVE_RDONLY */
547                                 if (cluster->focus == NULL) {
548                                         focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
549                                         cluster->focus_index = i;
550                                         cluster->focus = chain; /* NULL ok */
551                                         cluster->error = chain ? chain->error :
552                                                                  0;
553                                 }
554 #endif
555                         } else if (chain == NULL || chain->error == 0) {
556                                 nflags |= HAMMER2_CLUSTER_UNSOFT;
557                         }
558                         break;
559                 case HAMMER2_PFSTYPE_SOFT_MASTER:
560                         /*
561                          * Directly mounted soft master always wins.  There
562                          * should be only one.
563                          */
564                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
565                         cluster->focus_index = i;
566                         cluster->focus = chain;
567                         cluster->error = chain ? chain->error : 0;
568                         focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
569                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
570                         break;
571                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
572                         /*
573                          * Directly mounted soft slave always wins.  There
574                          * should be only one.
575                          */
576                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
577                         if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
578                                 cluster->focus_index = i;
579                                 cluster->focus = chain;
580                                 cluster->error = chain ? chain->error : 0;
581                                 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
582                         }
583                         break;
584                 case HAMMER2_PFSTYPE_SUPROOT:
585                         /*
586                          * spmp (degenerate case)
587                          */
588                         KKASSERT(i == 0);
589                         cluster->focus_index = i;
590                         cluster->focus = chain;
591                         cluster->error = chain ? chain->error : 0;
592                         focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
593                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
594                         break;
595                 default:
596                         break;
597                 }
598         }
599
600         /*
601          * Focus now set, adjust ddflag.  Skip this pass if the focus
602          * is bad or if we are at the PFS root (the bref won't match at
603          * the PFS root, obviously).
604          */
605         focus = cluster->focus;
606         if (focus) {
607                 cluster->ddflag =
608                         (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
609         } else {
610                 cluster->ddflag = 0;
611                 goto skip4;
612         }
613         if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
614                 goto skip4;
615
616         /*
617          * Pass 4
618          *
619          * Validate the elements that were not marked invalid.  They should
620          * match.
621          */
622         for (i = 0; i < cluster->nchains; ++i) {
623                 int ddflag;
624
625                 chain = cluster->array[i].chain;
626
627                 if (chain == NULL)
628                         continue;
629                 if (chain == focus)
630                         continue;
631                 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
632                         continue;
633
634                 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
635                 if (chain->bref.type != focus->bref.type ||
636                     chain->bref.key != focus->bref.key ||
637                     chain->bref.keybits != focus->bref.keybits ||
638                     chain->bref.modify_tid != focus->bref.modify_tid ||
639                     chain->bytes != focus->bytes ||
640                     ddflag != cluster->ddflag) {
641                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
642                         if (hammer2_debug & 1)
643                         kprintf("cluster_resolve: matching modify_tid failed "
644                                 "bref test: idx=%d type=%02x/%02x "
645                                 "key=%016jx/%d-%016jx/%d "
646                                 "mod=%016jx/%016jx bytes=%u/%u\n",
647                                 i,
648                                 chain->bref.type, focus->bref.type,
649                                 chain->bref.key, chain->bref.keybits,
650                                 focus->bref.key, focus->bref.keybits,
651                                 chain->bref.modify_tid, focus->bref.modify_tid,
652                                 chain->bytes, focus->bytes);
653                         if (hammer2_debug & 0x4000)
654                                 panic("cluster_resolve");
655                         /* flag issue and force resync? */
656                 }
657         }
658 skip4:
659
660         if (ttlslaves == 0)
661                 nflags |= HAMMER2_CLUSTER_NOSOFT;
662         if (ttlmasters == 0)
663                 nflags |= HAMMER2_CLUSTER_NOHARD;
664
665         /*
666          * Set SSYNCED or MSYNCED for slaves and masters respectively if
667          * all available nodes (even if 0 are available) are fully
668          * synchronized.  This is used by the synchronization thread to
669          * determine if there is work it could potentially accomplish.
670          */
671         if (nslaves == ttlslaves)
672                 nflags |= HAMMER2_CLUSTER_SSYNCED;
673         if (nmasters == ttlmasters)
674                 nflags |= HAMMER2_CLUSTER_MSYNCED;
675
676         /*
677          * Determine if the cluster was successfully locked for the
678          * requested operation and generate an error code.  The cluster
679          * will not be locked (or ref'd) if an error is returned.
680          *
681          * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
682          * to determine if reading or writing is possible.  If writing, the
683          * cluster still requires a call to hammer2_cluster_modify() first.
684          */
685         atomic_set_int(&cluster->flags, nflags);
686         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
687 }
688
689 /*
690  * This is used by the XOPS subsystem to calculate the state of
691  * the collection and tell hammer2_xop_collect() what to do with it.
692  * The collection can be in various states of desynchronization, the
693  * caller specifically wants to resolve the passed-in key.
694  *
695  * Return values:
696  *      0               - Quorum agreement, key is valid
697  *
698  *      ENOENT          - Quorum agreement, end of scan
699  *
700  *      ESRCH           - Quorum agreement, key is INVALID (caller should
701  *                        skip key).
702  *
703  *      EIO             - Quorum agreement but all elements had errors.
704  *
705  *      EDEADLK         - No quorum agreement possible for key, a repair
706  *                        may be needed.  Caller has to decide what to do,
707  *                        possibly iterating the key or generating an EIO.
708  *
709  *      EINPROGRESS     - No quorum agreement yet, but agreement is still
710  *                        possible if caller waits for more responses.  Caller
711  *                        should not iterate key.
712  *
713  * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
714  */
715 int
716 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
717 {
718         hammer2_chain_t *chain;
719         hammer2_chain_t *focus;
720         hammer2_pfs_t *pmp;
721         hammer2_tid_t quorum_tid;
722         hammer2_tid_t last_best_quorum_tid;
723         uint32_t nflags;
724         int ttlmasters;
725         int ttlslaves;
726         int nmasters;
727         int nmasters_keymatch;
728         int nslaves;
729         int nquorum;
730         int umasters;   /* unknown masters (still in progress) */
731         int smpresent;
732         int i;
733
734         cluster->error = 0;
735         cluster->focus = NULL;
736
737         nflags = 0;
738         ttlmasters = 0;
739         ttlslaves = 0;
740         nmasters = 0;
741         nmasters_keymatch = 0;
742         umasters = 0;
743         nslaves = 0;
744
745         /*
746          * Calculate quorum
747          */
748         pmp = cluster->pmp;
749         KKASSERT(pmp != NULL || cluster->nchains == 0);
750         nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
751         smpresent = 0;
752
753         /*
754          * Pass 1
755          *
756          * NOTE: A NULL chain is not necessarily an error, it could be
757          *       e.g. a lookup failure or the end of an iteration.
758          *       Process normally.
759          */
760         for (i = 0; i < cluster->nchains; ++i) {
761                 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
762                 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
763
764                 chain = cluster->array[i].chain;
765                 if (chain && chain->error) {
766                         if (cluster->focus == NULL || cluster->focus == chain) {
767                                 /* error will be overridden by valid focus */
768                                 cluster->error = chain->error;
769                         }
770
771                         /*
772                          * Must count total masters and slaves whether the
773                          * chain is errored or not.
774                          */
775                         switch (cluster->pmp->pfs_types[i]) {
776                         case HAMMER2_PFSTYPE_MASTER:
777                                 ++ttlmasters;
778                                 break;
779                         case HAMMER2_PFSTYPE_SLAVE:
780                                 ++ttlslaves;
781                                 break;
782                         }
783                         continue;
784                 }
785                 switch (cluster->pmp->pfs_types[i]) {
786                 case HAMMER2_PFSTYPE_MASTER:
787                         ++ttlmasters;
788                         break;
789                 case HAMMER2_PFSTYPE_SLAVE:
790                         ++ttlslaves;
791                         break;
792                 case HAMMER2_PFSTYPE_SOFT_MASTER:
793                         nflags |= HAMMER2_CLUSTER_WRSOFT;
794                         nflags |= HAMMER2_CLUSTER_RDSOFT;
795                         smpresent = 1;
796                         break;
797                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
798                         nflags |= HAMMER2_CLUSTER_RDSOFT;
799                         break;
800                 case HAMMER2_PFSTYPE_SUPROOT:
801                         /*
802                          * Degenerate cluster representing the super-root
803                          * topology on a single device.  Fake stuff so
804                          * cluster ops work as expected.
805                          */
806                         nflags |= HAMMER2_CLUSTER_WRHARD;
807                         nflags |= HAMMER2_CLUSTER_RDHARD;
808                         cluster->focus_index = i;
809                         cluster->focus = chain;
810                         cluster->error = chain ? chain->error : 0;
811                         break;
812                 default:
813                         break;
814                 }
815         }
816
817         /*
818          * Pass 2
819          *
820          * Resolve nmasters             - master nodes fully match
821          *
822          * Resolve umasters             - master nodes operation still
823          *                                in progress
824          *
825          * Resolve nmasters_keymatch    - master nodes match the passed-in
826          *                                key and may or may not match
827          *                                the quorum-agreed tid.
828          * 
829          * The quorum-agreed TID is the highest matching TID.
830          */
831         last_best_quorum_tid = HAMMER2_TID_MAX;
832         quorum_tid = 0;         /* fix gcc warning */
833
834         while (nmasters < nquorum && last_best_quorum_tid != 0) {
835                 nmasters = 0;
836                 quorum_tid = 0;
837
838                 for (i = 0; i < cluster->nchains; ++i) {
839                         /* XXX SOFT smpresent handling */
840                         if (cluster->pmp->pfs_types[i] !=
841                             HAMMER2_PFSTYPE_MASTER) {
842                                 continue;
843                         }
844
845                         chain = cluster->array[i].chain;
846
847                         /*
848                          * Skip elements still in progress.  umasters keeps
849                          * track of masters that might still be in-progress.
850                          */
851                         if (chain == NULL && (cluster->array[i].flags &
852                                               HAMMER2_CITEM_NULL) == 0) {
853                                 ++umasters;
854                                 continue;
855                         }
856
857                         /*
858                          * Key match?
859                          */
860                         if (flags & HAMMER2_CHECK_NULL) {
861                                 if (chain == NULL) {
862                                         ++nmasters;
863                                         ++nmasters_keymatch;
864                                 }
865                         } else if (chain &&
866                                    (key == (hammer2_key_t)-1 ||
867                                     chain->bref.key == key)) {
868                                 ++nmasters_keymatch;
869                                 if (quorum_tid < last_best_quorum_tid &&
870                                     (quorum_tid < chain->bref.modify_tid ||
871                                      nmasters == 0)) {
872                                         /*
873                                          * Better TID located, reset
874                                          * nmasters count.
875                                          */
876                                         nmasters = 0;
877                                         quorum_tid = chain->bref.modify_tid;
878                                 }
879                                 if (quorum_tid == chain->bref.modify_tid) {
880                                         /*
881                                          * TID matches current collection.
882                                          */
883                                         ++nmasters;
884                                         if (chain->error == 0) {
885                                                 cluster->focus = chain;
886                                                 cluster->focus_index = i;
887                                         }
888                                 }
889                         }
890                 }
891                 if (nmasters >= nquorum)
892                         break;
893                 last_best_quorum_tid = quorum_tid;
894         }
895
896         /*
897         kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
898                 nmasters, nquorum, nmasters_keymatch, umasters);
899         */
900
901         /*
902          * Early return if we do not have enough masters.
903          */
904         if (nmasters < nquorum) {
905                 if (nmasters + umasters >= nquorum)
906                         return EINPROGRESS;
907                 if (nmasters_keymatch < nquorum) 
908                         return ESRCH;
909                 return EDEADLK;
910         }
911
912         /*
913          * Validated end of scan.
914          */
915         if (flags & HAMMER2_CHECK_NULL)
916                 return ENOENT;
917
918         /*
919          * If we have a NULL focus at this point the agreeing quorum all
920          * had chain errors.
921          */
922         if (cluster->focus == NULL)
923                 return EIO;
924
925         /*
926          * Pass 3
927          *
928          * We have quorum agreement, validate elements, not end of scan.
929          */
930         for (i = 0; i < cluster->nchains; ++i) {
931                 chain = cluster->array[i].chain;
932                 if (chain == NULL ||
933                     chain->bref.key != key ||
934                     chain->bref.modify_tid != quorum_tid) {
935                         continue;
936                 }
937
938                 switch (cluster->pmp->pfs_types[i]) {
939                 case HAMMER2_PFSTYPE_MASTER:
940                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
941                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
942                         nflags |= HAMMER2_CLUSTER_WRHARD;
943                         nflags |= HAMMER2_CLUSTER_RDHARD;
944                         break;
945                 case HAMMER2_PFSTYPE_SLAVE:
946                         /*
947                          * We must have enough up-to-date masters to reach
948                          * a quorum and the slave modify_tid must match the
949                          * quorum's modify_tid.
950                          *
951                          * Do not select an errored slave.
952                          */
953                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
954                         nflags |= HAMMER2_CLUSTER_RDHARD;
955                         ++nslaves;
956                         break;
957                 case HAMMER2_PFSTYPE_SOFT_MASTER:
958                         /*
959                          * Directly mounted soft master always wins.  There
960                          * should be only one.
961                          */
962                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
963                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
964                         break;
965                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
966                         /*
967                          * Directly mounted soft slave always wins.  There
968                          * should be only one.
969                          *
970                          * XXX
971                          */
972                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
973                         break;
974                 case HAMMER2_PFSTYPE_SUPROOT:
975                         /*
976                          * spmp (degenerate case)
977                          */
978                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
979                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
980                         break;
981                 default:
982                         break;
983                 }
984         }
985
986         /*
987          * Focus now set, adjust ddflag.  Skip this pass if the focus
988          * is bad or if we are at the PFS root (the bref won't match at
989          * the PFS root, obviously).
990          */
991         focus = cluster->focus;
992         if (focus) {
993                 cluster->ddflag =
994                         (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
995         } else {
996                 cluster->ddflag = 0;
997                 goto skip4;
998         }
999         if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1000                 goto skip4;
1001
1002         /*
1003          * Pass 4
1004          *
1005          * Validate the elements that were not marked invalid.  They should
1006          * match.
1007          */
1008         for (i = 0; i < cluster->nchains; ++i) {
1009                 int ddflag;
1010
1011                 chain = cluster->array[i].chain;
1012
1013                 if (chain == NULL)
1014                         continue;
1015                 if (chain == focus)
1016                         continue;
1017                 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1018                         continue;
1019
1020                 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1021                 if (chain->bref.type != focus->bref.type ||
1022                     chain->bref.key != focus->bref.key ||
1023                     chain->bref.keybits != focus->bref.keybits ||
1024                     chain->bref.modify_tid != focus->bref.modify_tid ||
1025                     chain->bytes != focus->bytes ||
1026                     ddflag != cluster->ddflag) {
1027                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1028                         if (hammer2_debug & 1)
1029                         kprintf("cluster_resolve: matching modify_tid failed "
1030                                 "bref test: idx=%d type=%02x/%02x "
1031                                 "key=%016jx/%d-%016jx/%d "
1032                                 "mod=%016jx/%016jx bytes=%u/%u\n",
1033                                 i,
1034                                 chain->bref.type, focus->bref.type,
1035                                 chain->bref.key, chain->bref.keybits,
1036                                 focus->bref.key, focus->bref.keybits,
1037                                 chain->bref.modify_tid, focus->bref.modify_tid,
1038                                 chain->bytes, focus->bytes);
1039                         if (hammer2_debug & 0x4000)
1040                                 panic("cluster_resolve");
1041                         /* flag issue and force resync? */
1042                 }
1043         }
1044 skip4:
1045
1046         if (ttlslaves == 0)
1047                 nflags |= HAMMER2_CLUSTER_NOSOFT;
1048         if (ttlmasters == 0)
1049                 nflags |= HAMMER2_CLUSTER_NOHARD;
1050
1051         /*
1052          * Set SSYNCED or MSYNCED for slaves and masters respectively if
1053          * all available nodes (even if 0 are available) are fully
1054          * synchronized.  This is used by the synchronization thread to
1055          * determine if there is work it could potentially accomplish.
1056          */
1057         if (nslaves == ttlslaves)
1058                 nflags |= HAMMER2_CLUSTER_SSYNCED;
1059         if (nmasters == ttlmasters)
1060                 nflags |= HAMMER2_CLUSTER_MSYNCED;
1061
1062         /*
1063          * Determine if the cluster was successfully locked for the
1064          * requested operation and generate an error code.  The cluster
1065          * will not be locked (or ref'd) if an error is returned.
1066          *
1067          * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
1068          * to determine if reading or writing is possible.  If writing, the
1069          * cluster still requires a call to hammer2_cluster_modify() first.
1070          */
1071         atomic_set_int(&cluster->flags, nflags);
1072         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
1073
1074         return 0;
1075 }
1076
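/*
 * Illustrative usage sketch (not part of the build): how a collector in
 * the XOPS path might react to the return codes documented above.  This
 * is not the actual hammer2_xop_collect() code; the handling shown is a
 * simplified placeholder.
 */
#if 0
static int
example_collect_dispatch(hammer2_cluster_t *cluster, hammer2_key_t key)
{
        int error;

        error = hammer2_cluster_check(cluster, key, 0);
        switch (error) {
        case 0:                 /* quorum agreement, key is valid */
                break;
        case ESRCH:             /* quorum agreement, skip this key */
                break;
        case EINPROGRESS:       /* wait for more node responses */
                break;
        case ENOENT:            /* quorum agreement, end of scan */
        case EIO:               /* agreeing elements all had errors */
        case EDEADLK:           /* no agreement possible, repair needed */
        default:
                break;
        }
        return error;
}
#endif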
1077 /*
1078  * This is used by the sync thread to force non-NULL elements of a copy
1079  * of the pmp->iroot cluster to be good, which is required to prime the
1080  * sync.
1081  */
1082 void
1083 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
1084 {
1085         int i;
1086
1087         for (i = 0; i < cluster->nchains; ++i) {
1088                 if (cluster->array[i].chain)
1089                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1090         }
1091 }
1092
1093 /*
1094  * Copy a cluster, returning a ref'd cluster.  All underlying chains
1095  * are also ref'd, but not locked.  Focus state is also copied.
1096  *
1097  * Original cluster does not have to be locked but usually is.
1098  * New cluster will not be flagged as locked.
1099  *
1100  * Callers using this function to initialize a new cluster from an inode
1101  * generally lock and resolve the resulting cluster.
1102  *
1103  * Callers which use this function to save/restore a cluster structure
1104  * generally retain the focus state and do not re-resolve it.  Caller should
1105  * not try to re-resolve internal (cparent) node state during an iteration
1106  * as the individual tracking elements of cparent in an iteration may not
1107  * match even though they are correct.
1108  */
1109 hammer2_cluster_t *
1110 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
1111 {
1112         hammer2_pfs_t *pmp = ocluster->pmp;
1113         hammer2_cluster_t *ncluster;
1114         hammer2_chain_t *chain;
1115         int i;
1116
1117         ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
1118         ncluster->pmp = pmp;
1119         ncluster->nchains = ocluster->nchains;
1120         ncluster->refs = 1;
1121
1122         for (i = 0; i < ocluster->nchains; ++i) {
1123                 chain = ocluster->array[i].chain;
1124                 ncluster->array[i].chain = chain;
1125                 ncluster->array[i].flags = ocluster->array[i].flags;
1126                 if (chain)
1127                         hammer2_chain_ref(chain);
1128         }
1129         ncluster->focus_index = ocluster->focus_index;
1130         ncluster->focus = ocluster->focus;
1131         ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED |
1132                                               HAMMER2_CLUSTER_INODE);
1133
1134         return (ncluster);
1135 }
1136
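/*
 * Illustrative usage sketch (not part of the build): the "initialize
 * from an inode" style of use described above -- copy the cluster, then
 * lock and resolve the copy.  hammer2_cluster_lookup_init() below does
 * essentially this.
 */
#if 0
static hammer2_cluster_t *
example_copy_lock_resolve(hammer2_cluster_t *ocluster)
{
        hammer2_cluster_t *ncluster;

        ncluster = hammer2_cluster_copy(ocluster);
        hammer2_cluster_lock(ncluster, HAMMER2_RESOLVE_ALWAYS);
        hammer2_cluster_resolve(ncluster);
        return ncluster;
}
#endif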
1137 /*
1138  * Unlock a cluster.  Refcount and focus are maintained.
1139  */
1140 void
1141 hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx)
1142 {
1143         hammer2_chain_t *chain;
1144         int i;
1145
1146         if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
1147                 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1148                         cluster);
1149         }
1150         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
1151         KKASSERT(cluster->refs > 0);
1152         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
1153
1154         for (i = 0; i < cluster->nchains; ++i) {
1155                 if (i == idx)
1156                         continue;
1157                 chain = cluster->array[i].chain;
1158                 if (chain)
1159                         hammer2_chain_unlock(chain);
1160         }
1161 }
1162
1163 void
1164 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
1165 {
1166         hammer2_cluster_unlock_except(cluster, -1);
1167 }
1168
1169 /*
1170  * Set an inode's cluster modified, marking the related chains RW and
1171  * duplicating them if necessary.
1172  *
1173  * The passed-in chain is a localized copy of the chain previously acquired
1174  * when the inode was locked (and possibly replaced in the meantime), and
1175  * must also be updated.  In fact, we update it first and then synchronize
1176  * the inode's cluster cache.
1177  */
1178 hammer2_inode_data_t *
1179 hammer2_cluster_modify_ip(hammer2_inode_t *ip,
1180                           hammer2_cluster_t *cluster, int flags)
1181 {
1182         hammer2_inode_modify(ip);
1183         hammer2_cluster_modify(cluster, flags);
1184         hammer2_inode_repoint(ip, NULL, cluster);
1185         return (&hammer2_cluster_wdata(cluster)->ipdata);
1186 }
1187
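/*
 * Illustrative usage sketch (not part of the build): the nominal
 * front-end edit pattern -- mark the inode's cluster modified, edit the
 * writable focus data returned above, then propagate the edit to the
 * other chains with hammer2_cluster_modsync().
 */
#if 0
static void
example_edit_inode(hammer2_inode_t *ip, hammer2_cluster_t *cluster)
{
        hammer2_inode_data_t *wipdata;

        wipdata = hammer2_cluster_modify_ip(ip, cluster, 0);
        /* ... edit wipdata->meta fields for the focus here ... */
        hammer2_cluster_modsync(cluster);
}
#endif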
1188 /*
1189  * Adjust the cluster's chains to allow modification and adjust the
1190  * focus.  Data will be accessible on return.
1191  *
1192  * If our focused master errors on modify, re-resolve the cluster to
1193  * try to select a different master.
1194  */
1195 void
1196 hammer2_cluster_modify(hammer2_cluster_t *cluster, int flags)
1197 {
1198         hammer2_chain_t *chain;
1199         int resolve_again;
1200         int i;
1201
1202         resolve_again = 0;
1203         for (i = 0; i < cluster->nchains; ++i) {
1204                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1205                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1206                         continue;
1207                 }
1208                 chain = cluster->array[i].chain;
1209                 if (chain == NULL)
1210                         continue;
1211                 if (chain->error)
1212                         continue;
1213                 hammer2_chain_modify(chain, flags);
1214                 if (cluster->focus == chain && chain->error) {
1215                         cluster->error = chain->error;
1216                         resolve_again = 1;
1217                 }
1218         }
1219         if (resolve_again)
1220                 hammer2_cluster_resolve(cluster);
1221 }
1222
1223 /*
1224  * Synchronize modifications from the focus to other chains in a cluster.
1225  * Convenient because nominal API users can just modify the contents of the
1226  * focus (at least for non-blockref data).
1227  *
1228  * Nominal front-end operations only edit non-block-table data in a single
1229  * chain.  This code copies such modifications to the other chains in the
1230  * cluster.  Blocktable modifications are handled on a chain-by-chain basis
1231  * by both the frontend and the backend and will explode in fireworks if
1232  * blindly copied.
1233  */
1234 void
1235 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
1236 {
1237         hammer2_chain_t *focus;
1238         hammer2_chain_t *scan;
1239         const hammer2_inode_data_t *ripdata;
1240         hammer2_inode_data_t *wipdata;
1241         int i;
1242
1243         focus = cluster->focus;
1244         KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
1245
1246         for (i = 0; i < cluster->nchains; ++i) {
1247                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1248                         continue;
1249                 scan = cluster->array[i].chain;
1250                 if (scan == NULL || scan == focus)
1251                         continue;
1252                 if (scan->error)
1253                         continue;
1254                 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
1255                 KKASSERT(focus->bytes == scan->bytes &&
1256                          focus->bref.type == scan->bref.type);
1257                 switch(focus->bref.type) {
1258                 case HAMMER2_BREF_TYPE_INODE:
1259                         ripdata = &focus->data->ipdata;
1260                         wipdata = &scan->data->ipdata;
1261                         if ((ripdata->meta.op_flags &
1262                             HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1263                                 bcopy(ripdata, wipdata,
1264                                       offsetof(hammer2_inode_data_t, u));
1265                                 break;
1266                         }
1267                         /* fall through to full copy */
1268                 case HAMMER2_BREF_TYPE_DATA:
1269                         bcopy(focus->data, scan->data, focus->bytes);
1270                         break;
1271                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1272                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1273                 case HAMMER2_BREF_TYPE_FREEMAP:
1274                 case HAMMER2_BREF_TYPE_VOLUME:
1275                         panic("hammer2_cluster_modsync: illegal node type");
1276                         /* NOT REACHED */
1277                         break;
1278                 default:
1279                         panic("hammer2_cluster_modsync: unknown node type");
1280                         break;
1281                 }
1282         }
1283 }
1284
1285 /*
1286  * Lookup initialization/completion API.  Returns a locked, fully resolved
1287  * cluster with one ref.
1288  */
1289 hammer2_cluster_t *
1290 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
1291 {
1292         hammer2_cluster_t *cluster;
1293
1294         cluster = hammer2_cluster_copy(cparent);
1295         if (flags & HAMMER2_LOOKUP_SHARED) {
1296                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
1297                                               HAMMER2_RESOLVE_SHARED);
1298         } else {
1299                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1300         }
1301         hammer2_cluster_resolve(cluster);
1302
1303         return (cluster);
1304 }
1305
1306 void
1307 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
1308 {
1309         if (cparent) {
1310                 hammer2_cluster_unlock(cparent);
1311                 hammer2_cluster_drop(cparent);
1312         }
1313 }
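
/*
 * Illustrative sketch only (assumed usage): callers are expected to
 * bracket a lookup pass with the init/done pair so the temporary copy
 * of the parent cluster is properly unlocked and dropped.  The key
 * range below (0 through HAMMER2_KEY_MAX) is only an example.
 */
#if 0
static void
example_lookup_bracket(hammer2_cluster_t *parent_cluster)
{
        hammer2_cluster_t *cparent;
        hammer2_cluster_t *cluster;
        hammer2_key_t key_next;

        cparent = hammer2_cluster_lookup_init(parent_cluster, 0);
        cluster = hammer2_cluster_lookup(cparent, &key_next,
                                         0, HAMMER2_KEY_MAX, 0);
        if (cluster) {
                /* operate on the locked, resolved cluster */
                hammer2_cluster_unlock(cluster);
                hammer2_cluster_drop(cluster);
        }
        hammer2_cluster_lookup_done(cparent);
}
#endif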
1314
1315 /*
1316  * Locate the first match or overlap under the parent and return a new,
1317  * locked, resolved cluster with one ref.
1318  *
1319  * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1320  */
1321 hammer2_cluster_t *
1322 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
1323                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1324 {
1325         hammer2_pfs_t *pmp;
1326         hammer2_cluster_t *cluster;
1327         hammer2_chain_t *chain;
1328         hammer2_key_t key_accum;
1329         hammer2_key_t key_next;
1330         int null_count;
1331         int rflags;
1332         int i;
1333
1334         KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1335
1336         pmp = cparent->pmp;                             /* can be NULL */
1337         key_accum = *key_nextp;
1338         null_count = 0;
1339         if (flags & HAMMER2_LOOKUP_SHARED)
1340                 rflags = HAMMER2_RESOLVE_SHARED;
1341         else
1342                 rflags = 0;
1343
1344         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
1345         cluster->pmp = pmp;                             /* can be NULL */
1346         cluster->refs = 1;
1347         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1348                 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1349
1350         /*
1351          * Iterating earlier cluster elements with later elements still
1352          * locked is a problem, so we have to unlock the parent and then
1353          * re-lock as we go.
1354          */
1355         hammer2_cluster_unlock(cparent);
1356         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1357
1358         /*
1359          * Pass-1, issue lookups.
1360          */
1361         for (i = 0; i < cparent->nchains; ++i) {
1362                 cluster->array[i].flags = cparent->array[i].flags;
1363                 key_next = *key_nextp;
1364
1365                 /*
1366                  * Always relock the parent as we go.
1367                  */
1368                 if (cparent->array[i].chain) {
1369                         hammer2_chain_lock(cparent->array[i].chain, rflags);
1370                 }
1371
1372                 /*
1373                  * Nothing to base the lookup on, or parent was not synchronized.
1374                  */
1375                 if (cparent->array[i].chain == NULL ||
1376                     (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
1377                         ++null_count;
1378                         continue;
1379                 }
1380
1381                 chain = hammer2_chain_lookup(&cparent->array[i].chain,
1382                                              &key_next,
1383                                              key_beg, key_end,
1384                                              &cparent->array[i].cache_index,
1385                                              flags);
1386                 cluster->array[i].chain = chain;
1387                 if (chain == NULL) {
1388                         ++null_count;
1389                 }
1390                 if (key_accum > key_next)
1391                         key_accum = key_next;
1392         }
1393
1394         /*
1395          * Cleanup
1396          */
1397         cluster->nchains = i;
1398         *key_nextp = key_accum;
1399
1400         /*
1401          * The cluster must be resolved; out-of-sync elements may be present.
1402          *
1403          * If HAMMER2_LOOKUP_ALLNODES is not set, the focus must be non-NULL.
1404          */
1405         if (null_count != i)
1406                 hammer2_cluster_resolve(cluster);
1407         if (null_count == i ||
1408             (cluster->focus == NULL &&
1409              (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1410                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1411                         hammer2_cluster_unlock(cluster);
1412                 hammer2_cluster_drop(cluster);
1413                 cluster = NULL;
1414         }
1415
1416         return (cluster);
1417 }
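
/*
 * Illustrative sketch only (assumed usage): a point lookup passes the
 * same key for key_beg and key_end.  *key_nextp is advanced past the
 * returned element and is mainly of interest to iterating callers.
 */
#if 0
static hammer2_cluster_t *
example_point_lookup(hammer2_cluster_t *cparent, hammer2_key_t lhc)
{
        hammer2_key_t key_dummy;

        return (hammer2_cluster_lookup(cparent, &key_dummy, lhc, lhc, 0));
}
#endif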
1418
1419 /*
1420  * Locate the next match or overlap under the parent, replacing the passed-in
1421  * cluster.  The result is a new, locked, resolved cluster with one ref.
1422  *
1423  * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1424  */
1425 hammer2_cluster_t *
1426 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1427                      hammer2_key_t *key_nextp,
1428                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1429 {
1430         hammer2_chain_t *ochain;
1431         hammer2_chain_t *nchain;
1432         hammer2_key_t key_accum;
1433         hammer2_key_t key_next;
1434         int parent_index;
1435         int cluster_index;
1436         int null_count;
1437         int rflags;
1438         int i;
1439
1440         KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1441
1442         key_accum = *key_nextp;
1443         null_count = 0;
1444         parent_index = cparent->focus_index;    /* save prior focus */
1445         cluster_index = cluster->focus_index;
1446         if (flags & HAMMER2_LOOKUP_SHARED)
1447                 rflags = HAMMER2_RESOLVE_SHARED;
1448         else
1449                 rflags = 0;
1450
1451         cluster->focus = NULL;          /* XXX needed any more? */
1452         /*cparent->focus = NULL;*/
1453         cluster->focus_index = 0;       /* XXX needed any more? */
1454         /*cparent->focus_index = 0;*/
1455
1456         cluster->ddflag = 0;
1457
1458         /*
1459          * The parent is always locked on entry; the iterator may be locked
1460          * depending on flags.
1461          *
1462          * We must temporarily unlock the passed-in clusters to avoid
1463          * deadlocks between elements of the cluster and other threads.
1464          * We will fix up the locks in the loop.
1465          *
1466          * Note that this will clear the focus.
1467          *
1468          * Reflag the clusters as locked, because we will relock them
1469          * as we go.
1470          */
1471         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1472                 hammer2_cluster_unlock(cluster);
1473                 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1474         }
1475         hammer2_cluster_unlock(cparent);
1476         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1477
1478         for (i = 0; i < cparent->nchains; ++i) {
1479                 key_next = *key_nextp;
1480                 ochain = cluster->array[i].chain;
1481
1482                 /*
1483                  * Always relock the parent as we go.
1484                  */
1485                 if (cparent->array[i].chain)
1486                         hammer2_chain_lock(cparent->array[i].chain, rflags);
1487
1488                 /*
1489                  * Nothing to iterate from.  These cases can occur under
1490                  * normal operations.  For example, during synchronization
1491                  * a slave might reach the end of its scan while records
1492                  * are still left on the master(s).
1493                  */
1494                 if (ochain == NULL) {
1495                         ++null_count;
1496                         continue;
1497                 }
1498                 if (cparent->array[i].chain == NULL ||
1499                     (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1500                     (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
1501                         /* ochain has not yet been relocked */
1502                         hammer2_chain_drop(ochain);
1503                         cluster->array[i].chain = NULL;
1504                         ++null_count;
1505                         continue;
1506                 }
1507
1508                 /*
1509                  * Relock the child if necessary.  Parent and child will then
1510                  * be locked as expected by hammer2_chain_next() and flags.
1511                  */
1512                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1513                         hammer2_chain_lock(ochain, rflags);
1514                 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1515                                             &key_next, key_beg, key_end,
1516                                             &cparent->array[i].cache_index,
1517                                             flags);
1518                 /* ochain now invalid but can still be used for focus check */
1519                 if (parent_index == i) {
1520                         cparent->focus_index = i;
1521                         cparent->focus = cparent->array[i].chain;
1522                 }
1523
1524                 cluster->array[i].chain = nchain;
1525                 if (nchain == NULL) {
1526                         ++null_count;
1527                 }
1528                 if (key_accum > key_next)
1529                         key_accum = key_next;
1530         }
1531
1532         /*
1533          * Cleanup
1534          */
1535         cluster->nchains = i;
1536         *key_nextp = key_accum;
1537
1538         /*
1539          * The cluster must be resolved; out-of-sync elements may be present.
1540          *
1541          * If HAMMER2_LOOKUP_ALLNODES is not set, the focus must be non-NULL.
1542          */
1543         if (null_count != i)
1544                 hammer2_cluster_resolve(cluster);
1545         if (null_count == i ||
1546             (cluster->focus == NULL &&
1547              (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1548                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1549                         hammer2_cluster_unlock(cluster);
1550                 hammer2_cluster_drop(cluster);
1551                 cluster = NULL;
1552         }
1553         return (cluster);
1554 }
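
/*
 * Illustrative sketch only (assumed usage): a typical scan iterates
 * with hammer2_cluster_lookup() and hammer2_cluster_next() until no
 * further cluster is returned.  The iterator consumes the cluster
 * passed in (reusing or dropping it), so the loop body does not
 * unlock/drop it explicitly.
 */
#if 0
static void
example_scan(hammer2_cluster_t *cparent,
             hammer2_key_t key_beg, hammer2_key_t key_end)
{
        hammer2_cluster_t *cluster;
        hammer2_key_t key_next;

        cluster = hammer2_cluster_lookup(cparent, &key_next,
                                         key_beg, key_end, 0);
        while (cluster) {
                /* examine hammer2_cluster_rdata(cluster) here */
                cluster = hammer2_cluster_next(cparent, cluster, &key_next,
                                               key_next, key_end, 0);
        }
}
#endif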
1555
1556 /*
1557  * Advance just one chain in the cluster and recalculate the invalid bit.
1558  * The cluster index is allowed to be flagged invalid on input and is
1559  * recalculated on return.
1560  *
1561  * (used during synchronization to advance past a chain being deleted).
1562  *
1563  * The chain being advanced must not be the focus and the clusters in
1564  * question must have already passed normal cluster_lookup/cluster_next
1565  * checks.
1566  *
1567  * The cluster always remains intact on return, hence the void return.
1568  */
1569 void
1570 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1571                                   hammer2_cluster_t *cluster,
1572                                   hammer2_key_t *key_nextp,
1573                                   hammer2_key_t key_beg,
1574                                   hammer2_key_t key_end,
1575                                   int i, int flags)
1576 {
1577         hammer2_chain_t *ochain;
1578         hammer2_chain_t *nchain;
1579         hammer2_chain_t *focus;
1580         hammer2_key_t key_accum;
1581         hammer2_key_t key_next;
1582         int ddflag;
1583
1584         key_accum = *key_nextp;
1585         key_next = *key_nextp;
1586         ochain = cluster->array[i].chain;
1587         if (ochain == NULL)
1588                 goto done;
1589         KKASSERT(ochain != cluster->focus);
1590
1591         nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1592                                     &key_next, key_beg, key_end,
1593                                     &cparent->array[i].cache_index,
1594                                     flags);
1595         /* ochain now invalid */
1596         if (cparent->focus_index == i)
1597                 cparent->focus = cparent->array[i].chain;
1598
1599         /*
1600          * Install nchain.  Note that nchain can be NULL, and can also
1601          * be in an unlocked state depending on flags.
1602          */
1603         cluster->array[i].chain = nchain;
1604         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1605
1606         if (key_accum > key_next)
1607                 key_accum = key_next;
1608
1609         focus = cluster->focus;
1610         if (focus == NULL)
1611                 goto done;
1612         if (nchain == NULL)
1613                 goto done;
1614 #if 0
1615         if (nchain == focus)    /* ASSERTED NOT TRUE */
1616                 ...
1617 #endif
1618         ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1619         if (nchain->bref.type != focus->bref.type ||
1620             nchain->bref.key != focus->bref.key ||
1621             nchain->bref.keybits != focus->bref.keybits ||
1622             nchain->bref.modify_tid != focus->bref.modify_tid ||
1623             nchain->bytes != focus->bytes ||
1624             ddflag != cluster->ddflag) {
1625                 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1626         }
1627
1628 done:
1629         *key_nextp = key_accum;
1630 #if 0
1631         /*
1632          * For now don't re-resolve cluster->flags.
1633          */
1634         hammer2_cluster_resolve(cluster);
1635 #endif
1636 }
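
/*
 * Illustrative sketch only (assumed usage): the synchronization code
 * can advance a single stale element, identified by its index, without
 * disturbing the focus or the rest of the cluster.  The key arguments
 * here simply continue the current iteration.
 */
#if 0
static void
example_skip_deleted_element(hammer2_cluster_t *cparent,
                             hammer2_cluster_t *cluster,
                             hammer2_key_t *key_nextp,
                             hammer2_key_t key_end, int i)
{
        KKASSERT(cluster->array[i].chain != cluster->focus);
        hammer2_cluster_next_single_chain(cparent, cluster, key_nextp,
                                          *key_nextp, key_end, i, 0);
}
#endif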
1637
1638 /*
1639  * Mark a cluster deleted
1640  */
1641 void
1642 hammer2_cluster_delete(hammer2_cluster_t *cparent,
1643                        hammer2_cluster_t *cluster, int flags)
1644 {
1645         hammer2_chain_t *chain;
1646         hammer2_chain_t *parent;
1647         int i;
1648
1649         if (cparent == NULL) {
1650                 kprintf("hammer2_cluster_delete: cparent is NULL\n");
1651                 return;
1652         }
1653
1654         for (i = 0; i < cluster->nchains; ++i) {
1655                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1656                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1657                         continue;
1658                 }
1659                 parent = cparent->array[i].chain;
1660                 chain = cluster->array[i].chain;
1661                 if (chain == NULL)
1662                         continue;
1663                 if (chain->parent != parent) {
1664                         kprintf("hammer2_cluster_delete: parent "
1665                                 "mismatch chain=%p parent=%p against=%p\n",
1666                                 chain, chain->parent, parent);
1667                 } else {
1668                         hammer2_chain_delete(parent, chain, flags);
1669                 }
1670         }
1671 }
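
/*
 * Illustrative sketch only (assumed usage): deletion operates on a
 * cluster located under cparent.  Only elements marked front-end
 * modifiable are deleted on media; the rest are merely invalidated.
 * The flags value of 0 is just a placeholder.
 */
#if 0
static void
example_delete_entry(hammer2_cluster_t *cparent, hammer2_key_t lhc)
{
        hammer2_cluster_t *cluster;
        hammer2_key_t key_dummy;

        cluster = hammer2_cluster_lookup(cparent, &key_dummy, lhc, lhc, 0);
        if (cluster) {
                hammer2_cluster_delete(cparent, cluster, 0);
                hammer2_cluster_unlock(cluster);
                hammer2_cluster_drop(cluster);
        }
}
#endif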
1672
1673 /*
1674  * Create a snapshot of the specified cluster (ocluster) using the label
1675  * supplied in the ioctl request.  The originating hammer2_inode must be
1676  * exclusively locked for safety.
1677  *
1678  * The ioctl code has already synced the filesystem.
1679  */
1680 int
1681 hammer2_cluster_snapshot(hammer2_cluster_t *ocluster,
1682                        hammer2_ioc_pfs_t *pmp)
1683 {
1684         hammer2_dev_t *hmp;
1685         const hammer2_inode_data_t *ripdata;
1686         hammer2_inode_data_t *wipdata;
1687         hammer2_chain_t *nchain;
1688         hammer2_inode_t *nip;
1689         size_t name_len;
1690         hammer2_key_t lhc;
1691         struct vattr vat;
1692 #if 0
1693         uuid_t opfs_clid;
1694 #endif
1695         int error;
1696
1697         kprintf("snapshot %s\n", pmp->name);
1698
1699         name_len = strlen(pmp->name);
1700         lhc = hammer2_dirhash(pmp->name, name_len);
1701
1702         /*
1703          * Get the clid
1704          */
1705         ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1706 #if 0
1707         opfs_clid = ripdata->meta.pfs_clid;
1708 #endif
1709         hmp = ocluster->focus->hmp;     /* XXX find synchronized local disk */
1710
1711         /*
1712          * Create the snapshot directory under the super-root
1713          *
1714          * Set PFS type, generate a unique filesystem id, and generate
1715          * a cluster id.  Use the same clid when snapshotting a PFS root,
1716          * which theoretically allows the snapshot to be used as part of
1717          * the same cluster (perhaps as a cache).
1718          *
1719  * Copy the (flushed) blockref array.  Theoretically we could use
1720  * chain_duplicate(), but it becomes difficult to disentangle the
1721  * shared core, so for now we just brute-force it.
1722          */
1723         VATTR_NULL(&vat);
1724         vat.va_type = VDIR;
1725         vat.va_mode = 0755;
1726         nip = hammer2_inode_create(hmp->spmp->iroot, &vat, proc0.p_ucred,
1727                                    pmp->name, name_len, 0,
1728                                    1, 0, 0,
1729                                    HAMMER2_INSERT_PFSROOT, &error);
1730
1731         if (nip) {
1732                 hammer2_inode_modify(nip);
1733                 nchain = hammer2_inode_chain(nip, 0, HAMMER2_RESOLVE_ALWAYS);
1734                 hammer2_chain_modify(nchain, 0);
1735                 wipdata = &nchain->data->ipdata;
1736
1737                 nip->meta.pfs_type = HAMMER2_PFSTYPE_MASTER;
1738                 nip->meta.pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1739                 nip->meta.op_flags |= HAMMER2_OPFLAG_PFSROOT;
1740                 kern_uuidgen(&nip->meta.pfs_fsid, 1);
1741
1742                 /*
1743                  * Give the snapshot its own private cluster id.  As a
1744                  * snapshot no further synchronization with the original
1745                  * cluster will be done.
1746                  */
1747 #if 0
1748                 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1749                         nip->meta.pfs_clid = opfs_clid;
1750                 else
1751                         kern_uuidgen(&nip->meta.pfs_clid, 1);
1752 #endif
1753                 kern_uuidgen(&nip->meta.pfs_clid, 1);
1754                 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1755
1756                 /* XXX hack blockset copy */
1757                 /* XXX doesn't work with real cluster */
1758                 KKASSERT(ocluster->nchains == 1);
1759                 wipdata->meta = nip->meta;
1760                 wipdata->u.blockset = ripdata->u.blockset;
1761                 hammer2_flush(nchain, 1);
1762                 hammer2_chain_unlock(nchain);
1763                 hammer2_chain_drop(nchain);
1764                 hammer2_inode_unlock(nip);
1765         }
1766         return (error);
1767 }
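
/*
 * Illustrative sketch only; this is not taken from the ioctl code.
 * The caller is assumed to hold the originating inode exclusively
 * locked and to pass the snapshot label in a hammer2_ioc_pfs_t
 * (pfs.name is assumed to be a fixed-size buffer).
 */
#if 0
static int
example_take_snapshot(hammer2_cluster_t *ocluster, const char *label)
{
        hammer2_ioc_pfs_t pfs;

        bzero(&pfs, sizeof(pfs));
        ksnprintf(pfs.name, sizeof(pfs.name), "%s", label);
        return (hammer2_cluster_snapshot(ocluster, &pfs));
}
#endif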
1768
1769 /************************************************************************
1770  *                              CLUSTER I/O                             *
1771  ************************************************************************
1772  *
1773  *
1774  * WARNING! blockref[] array data is not universal.  These functions should
1775  *          only be used to access universal data.
1776  *
1777  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1778  *          complete if necessary.  The I/Os should have already been
1779  *          initiated by the cluster_lock/chain_lock operation.
1780  *
1781  *          The cluster must already be in a modified state before wdata
1782  *          is called.  The data will already be available for this case.
1783  */
1784 const hammer2_media_data_t *
1785 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1786 {
1787         KKASSERT(cluster->focus != NULL);
1788         return(cluster->focus->data);
1789 }
1790
1791 const hammer2_media_data_t *
1792 hammer2_cluster_rdata_bytes(hammer2_cluster_t *cluster, size_t *bytesp)
1793 {
1794         KKASSERT(cluster->focus != NULL);
1795         *bytesp = cluster->focus->bytes;
1796         return(cluster->focus->data);
1797 }
1798
1799 hammer2_media_data_t *
1800 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1801 {
1802         KKASSERT(cluster->focus != NULL);
1803         KKASSERT(hammer2_cluster_modified(cluster));
1804         return(cluster->focus->data);
1805 }
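
/*
 * Illustrative sketch only (assumed usage): universal (non-blockref)
 * media data is read through the focus via rdata.  Writable access via
 * wdata is only legal once the cluster has been marked modified.  The
 * inum field is used here purely as an example of universal data.
 */
#if 0
static hammer2_tid_t
example_read_inum(hammer2_cluster_t *cluster)
{
        const hammer2_inode_data_t *ripdata;

        ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
        return (ripdata->meta.inum);
}
#endif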