hammer2 - Refactor frontend part 9/many
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *          locks and I/O, do quorum and/or master-slave processing, and
50  *          it must operate properly even if some nodes are broken (which
51  *          can also mean indefinite locks).
52  *
53  *                              CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *              hammer2_cluster_lock()
59  *              hammer2_cluster_parent()
60  *
61  *      - Most complex functions, quorum management on transaction ids.
62  *
63  *      - Locking and data accesses must be internally asynchronous.
64  *
65  *      - Validate and manage cache coherency primitives (cache state
66  *        is stored in chain topologies but must be validated by these
67  *        functions).
68  *
69  * (2) Lookups and Scans
70  *              hammer2_cluster_lookup()
71  *              hammer2_cluster_next()
72  *
73  *      - Depend on locking & data retrieval functions, but still complex.
74  *
75  *      - Must do quorum management on transaction ids.
76  *
77  *      - Lookup and Iteration ops Must be internally asynchronous.
78  *
79  * (3) Modifying Operations
80  *              hammer2_cluster_create()
81  *              hammer2_cluster_rename()
82  *              hammer2_cluster_delete()
83  *              hammer2_cluster_modify()
84  *              hammer2_cluster_modsync()
85  *
86  *      - Can usually punt on failures, operation continues unless quorum
87  *        is lost.  If quorum is lost, must wait for resynchronization
88  *        (depending on the management mode).
89  *
90  *      - Must disconnect node on failures (also not flush), remount, and
91  *        resynchronize.
92  *
93  *      - Network links (via kdmsg) are relatively easy to issue as the
94  *        complex underworkings of hammer2_chain.c don't have to be messed
95  *        with (the protocol is at a higher level than block-level).
96  *
97  *      - Multiple local disk nodes (i.e. block devices) are another matter.
98  *        Chain operations have to be dispatched to per-node threads (xN)
99  *        because we can't asynchronize potentially very complex chain
100  *        operations in hammer2_chain.c (it would be a huge mess).
101  *
102  *        (these threads are also used to terminate incoming kdmsg ops from
103  *        other machines).
104  *
105  *      - Single-node filesystems do not use threads and will simply call
106  *        hammer2_chain.c functions directly.  This short-cut is handled
107  *        at the base of each cluster function.
108  */
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
115
116 #include "hammer2.h"
117
118 /*
119  * Returns non-zero if any chain in the cluster needs to be resized.
120  * Errored elements are not used in the calculation.
121  */
122 int
123 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
124 {
125         hammer2_chain_t *chain;
126         int i;
127
128         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
129         for (i = 0; i < cluster->nchains; ++i) {
130                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
131                         continue;
132                 chain = cluster->array[i].chain;
133                 if (chain == NULL)
134                         continue;
135                 if (chain->error)
136                         continue;
137                 if (chain->bytes != bytes)
138                         return 1;
139         }
140         return 0;
141 }
142
143 /*
144  * Returns the bref type of the cluster's foucs.
145  *
146  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
147  * The cluster must be locked.
148  */
149 uint8_t
150 hammer2_cluster_type(hammer2_cluster_t *cluster)
151 {
152         if (cluster->error == 0) {
153                 KKASSERT(cluster->focus != NULL);
154                 return(cluster->focus->bref.type);
155         }
156         return 0;
157 }
158
159 /*
160  * Returns non-zero if the cluster's focus is flagged as being modified.
161  *
162  * If the cluster is errored, returns 0.
163  */
164 int
165 hammer2_cluster_modified(hammer2_cluster_t *cluster)
166 {
167         if (cluster->error == 0) {
168                 KKASSERT(cluster->focus != NULL);
169                 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
170         }
171         return 0;
172 }
173
174 /*
175  * Returns the bref of the cluster's focus, sans any data-offset information
176  * (since offset information is per-node and wouldn't be useful).
177  *
178  * Callers use this function to access modify_tid, mirror_tid, type,
179  * key, and keybits.
180  *
181  * If the cluster is errored, returns an empty bref.
182  * The cluster must be locked.
183  */
184 void
185 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
186 {
187         if (cluster->error == 0) {
188                 KKASSERT(cluster->focus != NULL);
189                 *bref = cluster->focus->bref;
190                 bref->data_off = 0;
191         } else {
192                 bzero(bref, sizeof(*bref));
193         }
194 }
195
196 /*
197  * Flag the cluster for flushing recursively up to the root.  Despite the
198  * work it does, this is relatively benign.  It just makes sure that the
199  * flusher has top-down visibility to this cluster.
200  *
201  * Errored chains are not flagged for flushing.
202  *
203  * The cluster should probably be locked.
204  */
205 void
206 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
207 {
208         hammer2_chain_t *chain;
209         int i;
210
211         for (i = 0; i < cluster->nchains; ++i) {
212                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
213                         continue;
214                 chain = cluster->array[i].chain;
215                 if (chain == NULL)
216                         continue;
217                 if (chain->error)
218                         continue;
219                 hammer2_chain_setflush(trans, chain);
220         }
221 }
222
223 /*
224  * Set the check mode for the cluster.
225  * Errored elements of the cluster are ignored.
226  *
227  * The cluster must be locked and modified.
228  */
229 void
230 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
231                                 hammer2_cluster_t *cluster,
232                                 int check_algo)
233 {
234         hammer2_chain_t *chain;
235         int i;
236
237         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
238         for (i = 0; i < cluster->nchains; ++i) {
239                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
240                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
241                         continue;
242                 }
243                 chain = cluster->array[i].chain;
244                 if (chain == NULL)
245                         continue;
246                 if (chain->error)
247                         continue;
248                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
249                 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
250                 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
251         }
252 }
253
254 /*
255  * Create a degenerate cluster with one ref from a single locked chain.
256  * The returned cluster will be focused on the chain and inherit its
257  * error state.
258  *
259  * The chain's lock and reference are transfered to the new cluster, so
260  * the caller should not try to unlock the chain separately.
261  *
262  * We fake the flags.
263  */
264 hammer2_cluster_t *
265 hammer2_cluster_from_chain(hammer2_chain_t *chain)
266 {
267         hammer2_cluster_t *cluster;
268
269         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
270         cluster->array[0].chain = chain;
271         cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
272         cluster->nchains = 1;
273         cluster->focus = chain;
274         cluster->focus_index = 0;
275         cluster->pmp = chain->pmp;
276         cluster->refs = 1;
277         cluster->error = chain->error;
278         cluster->flags = HAMMER2_CLUSTER_LOCKED |
279                          HAMMER2_CLUSTER_WRHARD |
280                          HAMMER2_CLUSTER_RDHARD |
281                          HAMMER2_CLUSTER_MSYNCED |
282                          HAMMER2_CLUSTER_SSYNCED;
283
284         return cluster;
285 }
286
287 /*
288  * Add a reference to a cluster and its underlying chains.
289  *
290  * We must also ref the underlying chains in order to allow ref/unlock
291  * sequences to later re-lock.
292  */
293 void
294 hammer2_cluster_ref(hammer2_cluster_t *cluster)
295 {
296         atomic_add_int(&cluster->refs, 1);
297 }
298
299 /*
300  * Drop the caller's reference to the cluster.  When the ref count drops to
301  * zero this function frees the cluster and drops all underlying chains.
302  *
303  * In-progress read I/Os are typically detached from the cluster once the
304  * first one returns (the remaining stay attached to the DIOs but are then
305  * ignored and drop naturally).
306  */
307 void
308 hammer2_cluster_drop(hammer2_cluster_t *cluster)
309 {
310         hammer2_chain_t *chain;
311         int i;
312
313         KKASSERT(cluster->refs > 0);
314         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
315                 cluster->focus = NULL;          /* safety XXX chg to assert */
316                 cluster->focus_index = 0;
317
318                 for (i = 0; i < cluster->nchains; ++i) {
319                         chain = cluster->array[i].chain;
320                         if (chain) {
321                                 hammer2_chain_drop(chain);
322                                 cluster->array[i].chain = NULL; /* safety */
323                         }
324                 }
325                 cluster->nchains = 0;                           /* safety */
326
327                 kfree(cluster, M_HAMMER2);
328                 /* cluster is invalid */
329         }
330 }
331
332 void
333 hammer2_cluster_wait(hammer2_cluster_t *cluster)
334 {
335         tsleep(cluster->focus, 0, "h2clcw", 1);
336 }
337
338 /*
339  * Lock a cluster.  Cluster must already be referenced.  Focus is maintained. 
340  *
341  * WARNING! This function expects the caller to handle resolution of the
342  *          cluster.  We never re-resolve the cluster in this function,
343  *          because it might be used to temporarily unlock/relock a cparent
344  *          in an iteration or recursrion, and the cparents elements do not
345  *          necessarily match.
346  */
347 void
348 hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how)
349 {
350         hammer2_chain_t *chain;
351         int i;
352
353         /* cannot be on inode-embedded cluster template, must be on copy */
354         KKASSERT(cluster->refs > 0);
355         KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
356         if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
357                 panic("hammer2_cluster_lock: cluster %p already locked!\n",
358                         cluster);
359         }
360         atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
361
362         /*
363          * Lock chains and resolve state.
364          */
365         for (i = 0; i < cluster->nchains; ++i) {
366                 if (i == idx)
367                         continue;
368                 chain = cluster->array[i].chain;
369                 if (chain == NULL)
370                         continue;
371                 hammer2_chain_lock(chain, how);
372         }
373 }
374
375 void
376 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
377 {
378         hammer2_cluster_lock_except(cluster, -1, how);
379 }
380
381 /*
382  * Calculate the clustering state for the cluster and set its focus.
383  * This routine must be called with care.  For example, it should not
384  * normally be called after relocking a non-leaf cluster because parent
385  * clusters help iterations and each element might be at a slightly different
386  * indirect node (each node's topology is independently indexed).
387  *
388  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
389  * operations.  Typically this is only set on a quorum of MASTERs or
390  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
391  * is present, this bit is *not* set on a quorum of MASTERs.  The
392  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
393  * that create/modify/delete elements use it.
394  *
395  * The chains making up the cluster may be narrowed down based on quorum
396  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
397  * narrowed down to a single chain as long as the entire subtopology is known
398  * to be intact.  So, for example, we can narrow a read-only op to a single
399  * fast SLAVE but if we focus a CACHE chain we must still retain at least
400  * a SLAVE to ensure that the subtopology can be accessed.
401  *
402  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
403  * to be maintained once the topology is validated as-of the top level of
404  * the operation.
405  *
406  * If a failure occurs the operation must be aborted by higher-level code and
407  * retried. XXX
408  */
409 void
410 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
411 {
412         hammer2_chain_t *chain;
413         hammer2_chain_t *focus;
414         hammer2_pfs_t *pmp;
415         hammer2_tid_t quorum_tid;
416         hammer2_tid_t last_best_quorum_tid;
417         int focus_pfs_type;
418         uint32_t nflags;
419         int ttlmasters;
420         int ttlslaves;
421         int nmasters;
422         int nslaves;
423         int nquorum;
424         int smpresent;
425         int i;
426
427         cluster->error = 0;
428         cluster->focus = NULL;
429
430         focus_pfs_type = 0;
431         nflags = 0;
432         ttlmasters = 0;
433         ttlslaves = 0;
434         nmasters = 0;
435         nslaves = 0;
436
437         /*
438          * Calculate quorum
439          */
440         pmp = cluster->pmp;
441         KKASSERT(pmp != NULL || cluster->nchains == 0);
442         nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
443         smpresent = 0;
444
445         /*
446          * Pass 1
447          *
448          * NOTE: A NULL chain is not necessarily an error, it could be
449          *       e.g. a lookup failure or the end of an iteration.
450          *       Process normally.
451          */
452         for (i = 0; i < cluster->nchains; ++i) {
453                 chain = cluster->array[i].chain;
454                 if (chain && chain->error) {
455                         if (cluster->focus == NULL || cluster->focus == chain) {
456                                 /* error will be overridden by valid focus */
457                                 cluster->error = chain->error;
458                         }
459
460                         /*
461                          * Must count total masters and slaves whether the
462                          * chain is errored or not.
463                          */
464                         switch (cluster->pmp->pfs_types[i]) {
465                         case HAMMER2_PFSTYPE_MASTER:
466                                 ++ttlmasters;
467                                 break;
468                         case HAMMER2_PFSTYPE_SLAVE:
469                                 ++ttlslaves;
470                                 break;
471                         }
472                         continue;
473                 }
474                 switch (cluster->pmp->pfs_types[i]) {
475                 case HAMMER2_PFSTYPE_MASTER:
476                         ++ttlmasters;
477                         break;
478                 case HAMMER2_PFSTYPE_SLAVE:
479                         ++ttlslaves;
480                         break;
481                 case HAMMER2_PFSTYPE_SOFT_MASTER:
482                         nflags |= HAMMER2_CLUSTER_WRSOFT;
483                         nflags |= HAMMER2_CLUSTER_RDSOFT;
484                         smpresent = 1;
485                         break;
486                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
487                         nflags |= HAMMER2_CLUSTER_RDSOFT;
488                         break;
489                 case HAMMER2_PFSTYPE_SUPROOT:
490                         /*
491                          * Degenerate cluster representing the super-root
492                          * topology on a single device.  Fake stuff so
493                          * cluster ops work as expected.
494                          */
495                         nflags |= HAMMER2_CLUSTER_WRHARD;
496                         nflags |= HAMMER2_CLUSTER_RDHARD;
497                         cluster->focus_index = i;
498                         cluster->focus = chain;
499                         cluster->error = chain ? chain->error : 0;
500                         break;
501                 default:
502                         break;
503                 }
504         }
505
506         /*
507          * Pass 2
508          *
509          * Resolve masters.  Calculate nmasters for the highest matching
510          * TID, if a quorum cannot be attained try the next lower matching
511          * TID until we exhaust TIDs.
512          *
513          * NOTE: A NULL chain is not necessarily an error, it could be
514          *       e.g. a lookup failure or the end of an iteration.
515          *       Process normally.
516          */
517         last_best_quorum_tid = HAMMER2_TID_MAX;
518         quorum_tid = 0;         /* fix gcc warning */
519
520         while (nmasters < nquorum && last_best_quorum_tid != 0) {
521                 nmasters = 0;
522                 quorum_tid = 0;
523
524                 for (i = 0; i < cluster->nchains; ++i) {
525                         if (cluster->pmp->pfs_types[i] !=
526                             HAMMER2_PFSTYPE_MASTER) {
527                                 continue;
528                         }
529                         chain = cluster->array[i].chain;
530
531                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
532                                 /*
533                                  * Invalid as in unsynchronized, cannot be
534                                  * used to calculate the quorum.
535                                  */
536                         } else if (chain == NULL && quorum_tid == 0) {
537                                 /*
538                                  * NULL chain on master matches NULL chains
539                                  * on other masters.
540                                  */
541                                 ++nmasters;
542                         } else if (quorum_tid < last_best_quorum_tid &&
543                                    chain != NULL &&
544                                    (quorum_tid < chain->bref.modify_tid ||
545                                     nmasters == 0)) {
546                                 /*
547                                  * Better TID located, reset nmasters count.
548                                  */
549                                 nmasters = 1;
550                                 quorum_tid = chain->bref.modify_tid;
551                         } else if (chain &&
552                                    quorum_tid == chain->bref.modify_tid) {
553                                 /*
554                                  * TID matches current collection.
555                                  */
556                                 ++nmasters;
557                         }
558                 }
559                 if (nmasters >= nquorum)
560                         break;
561                 last_best_quorum_tid = quorum_tid;
562         }
563
564         /*
565          * Pass 3
566          *
567          * NOTE: A NULL chain is not necessarily an error, it could be
568          *       e.g. a lookup failure or the end of an iteration.
569          *       Process normally.
570          */
571         for (i = 0; i < cluster->nchains; ++i) {
572                 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
573                 chain = cluster->array[i].chain;
574                 if (chain && chain->error) {
575                         if (cluster->focus == NULL || cluster->focus == chain) {
576                                 /* error will be overridden by valid focus */
577                                 cluster->error = chain->error;
578                         }
579                         continue;
580                 }
581
582                 switch (cluster->pmp->pfs_types[i]) {
583                 case HAMMER2_PFSTYPE_MASTER:
584                         /*
585                          * We must have enough up-to-date masters to reach
586                          * a quorum and the master modify_tid must match
587                          * the quorum's modify_tid.
588                          *
589                          * Do not select an errored or out-of-sync master.
590                          */
591                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
592                                 nflags |= HAMMER2_CLUSTER_UNHARD;
593                         } else if (nmasters >= nquorum &&
594                                    (chain == NULL || chain->error == 0) &&
595                                    ((chain == NULL && quorum_tid == 0) ||
596                                     (chain != NULL && quorum_tid ==
597                                                   chain->bref.modify_tid))) {
598                                 nflags |= HAMMER2_CLUSTER_WRHARD;
599                                 nflags |= HAMMER2_CLUSTER_RDHARD;
600                                 if (!smpresent) {
601                                         cluster->array[i].flags |=
602                                                         HAMMER2_CITEM_FEMOD;
603                                 }
604                                 if (cluster->focus == NULL ||
605                                     focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
606                                         focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
607                                         cluster->focus_index = i;
608                                         cluster->focus = chain; /* NULL ok */
609                                         cluster->error = chain ? chain->error :
610                                                                  0;
611                                 }
612                         } else if (chain == NULL || chain->error == 0) {
613                                 nflags |= HAMMER2_CLUSTER_UNHARD;
614                         }
615                         break;
616                 case HAMMER2_PFSTYPE_SLAVE:
617                         /*
618                          * We must have enough up-to-date masters to reach
619                          * a quorum and the slave modify_tid must match the
620                          * quorum's modify_tid.
621                          *
622                          * Do not select an errored slave.
623                          */
624                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
625                                 nflags |= HAMMER2_CLUSTER_UNHARD;
626                         } else if (nmasters >= nquorum &&
627                                    (chain == NULL || chain->error == 0) &&
628                                    ((chain == NULL && quorum_tid == 0) ||
629                                     (chain && quorum_tid ==
630                                               chain->bref.modify_tid))) {
631                                 ++nslaves;
632                                 nflags |= HAMMER2_CLUSTER_RDHARD;
633 #if 0
634                                 /* XXX optimize for RESOLVE_RDONLY */
635                                 if (cluster->focus == NULL) {
636                                         focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
637                                         cluster->focus_index = i;
638                                         cluster->focus = chain; /* NULL ok */
639                                         cluster->error = chain ? chain->error :
640                                                                  0;
641                                 }
642 #endif
643                         } else if (chain == NULL || chain->error == 0) {
644                                 nflags |= HAMMER2_CLUSTER_UNSOFT;
645                         }
646                         break;
647                 case HAMMER2_PFSTYPE_SOFT_MASTER:
648                         /*
649                          * Directly mounted soft master always wins.  There
650                          * should be only one.
651                          */
652                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
653                         cluster->focus_index = i;
654                         cluster->focus = chain;
655                         cluster->error = chain ? chain->error : 0;
656                         focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
657                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
658                         break;
659                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
660                         /*
661                          * Directly mounted soft slave always wins.  There
662                          * should be only one.
663                          */
664                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
665                         if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
666                                 cluster->focus_index = i;
667                                 cluster->focus = chain;
668                                 cluster->error = chain ? chain->error : 0;
669                                 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
670                         }
671                         break;
672                 case HAMMER2_PFSTYPE_SUPROOT:
673                         /*
674                          * spmp (degenerate case)
675                          */
676                         KKASSERT(i == 0);
677                         cluster->focus_index = i;
678                         cluster->focus = chain;
679                         cluster->error = chain ? chain->error : 0;
680                         focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
681                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
682                         break;
683                 default:
684                         break;
685                 }
686         }
687
688         /*
689          * Focus now set, adjust ddflag.  Skip this pass if the focus
690          * is bad or if we are at the PFS root (the bref won't match at
691          * the PFS root, obviously).
692          */
693         focus = cluster->focus;
694         if (focus) {
695                 cluster->ddflag =
696                         (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
697         } else {
698                 cluster->ddflag = 0;
699                 goto skip4;
700         }
701         if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
702                 goto skip4;
703
704         /*
705          * Pass 4
706          *
707          * Validate the elements that were not marked invalid.  They should
708          * match.
709          */
710         for (i = 0; i < cluster->nchains; ++i) {
711                 int ddflag;
712
713                 chain = cluster->array[i].chain;
714
715                 if (chain == NULL)
716                         continue;
717                 if (chain == focus)
718                         continue;
719                 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
720                         continue;
721
722                 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
723                 if (chain->bref.type != focus->bref.type ||
724                     chain->bref.key != focus->bref.key ||
725                     chain->bref.keybits != focus->bref.keybits ||
726                     chain->bref.modify_tid != focus->bref.modify_tid ||
727                     chain->bytes != focus->bytes ||
728                     ddflag != cluster->ddflag) {
729                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
730                         if (hammer2_debug & 1)
731                         kprintf("cluster_resolve: matching modify_tid failed "
732                                 "bref test: idx=%d type=%02x/%02x "
733                                 "key=%016jx/%d-%016jx/%d "
734                                 "mod=%016jx/%016jx bytes=%u/%u\n",
735                                 i,
736                                 chain->bref.type, focus->bref.type,
737                                 chain->bref.key, chain->bref.keybits,
738                                 focus->bref.key, focus->bref.keybits,
739                                 chain->bref.modify_tid, focus->bref.modify_tid,
740                                 chain->bytes, focus->bytes);
741                         if (hammer2_debug & 0x4000)
742                                 panic("cluster_resolve");
743                         /* flag issue and force resync? */
744                 }
745         }
746 skip4:
747
748         if (ttlslaves == 0)
749                 nflags |= HAMMER2_CLUSTER_NOSOFT;
750         if (ttlmasters == 0)
751                 nflags |= HAMMER2_CLUSTER_NOHARD;
752
753         /*
754          * Set SSYNCED or MSYNCED for slaves and masters respectively if
755          * all available nodes (even if 0 are available) are fully
756          * synchronized.  This is used by the synchronization thread to
757          * determine if there is work it could potentially accomplish.
758          */
759         if (nslaves == ttlslaves)
760                 nflags |= HAMMER2_CLUSTER_SSYNCED;
761         if (nmasters == ttlmasters)
762                 nflags |= HAMMER2_CLUSTER_MSYNCED;
763
764         /*
765          * Determine if the cluster was successfully locked for the
766          * requested operation and generate an error code.  The cluster
767          * will not be locked (or ref'd) if an error is returned.
768          *
769          * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
770          * to determine if reading or writing is possible.  If writing, the
771          * cluster still requires a call to hammer2_cluster_modify() first.
772          */
773         atomic_set_int(&cluster->flags, nflags);
774         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
775 }
776
777 /*
778  * This is used by the XOPS subsystem to calculate the state of
779  * the collection and tell hammer2_xop_collect() what to do with it.
780  * The collection can be in various states of desynchronization, the
781  * caller specifically wants to resolve the passed-in key.
782  *
783  * Return values:
784  *      0               - Quorum agreement, key is valid
785  *
786  *      ENOENT          - Quorum agreement, end of scan
787  *
788  *      ESRCH           - Quorum agreement, key is INVALID (caller should
789  *                        skip key).
790  *
791  *      EIO             - Quorum agreement but all elements had errors.
792  *
793  *      EDEADLK         - No quorum agreement possible for key, a repair
794  *                        may be needed.  Caller has to decide what to do,
795  *                        possibly iterating the key or generating an EIO.
796  *
797  *      EINPROGRESS     - No quorum agreement yet, but agreement is still
798  *                        possible if caller waits for more responses.  Caller
799  *                        should not iterate key.
800  *
801  * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
802  */
803 int
804 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
805 {
806         hammer2_chain_t *chain;
807         hammer2_chain_t *focus;
808         hammer2_pfs_t *pmp;
809         hammer2_tid_t quorum_tid;
810         hammer2_tid_t last_best_quorum_tid;
811         uint32_t nflags;
812         int ttlmasters;
813         int ttlslaves;
814         int nmasters;
815         int nmasters_keymatch;
816         int nslaves;
817         int nquorum;
818         int umasters;   /* unknown masters (still in progress) */
819         int smpresent;
820         int i;
821
822         cluster->error = 0;
823         cluster->focus = NULL;
824
825         nflags = 0;
826         ttlmasters = 0;
827         ttlslaves = 0;
828         nmasters = 0;
829         nmasters_keymatch = 0;
830         umasters = 0;
831         nslaves = 0;
832
833         /*
834          * Calculate quorum
835          */
836         pmp = cluster->pmp;
837         KKASSERT(pmp != NULL || cluster->nchains == 0);
838         nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
839         smpresent = 0;
840
841         /*
842          * Pass 1
843          *
844          * NOTE: A NULL chain is not necessarily an error, it could be
845          *       e.g. a lookup failure or the end of an iteration.
846          *       Process normally.
847          */
848         for (i = 0; i < cluster->nchains; ++i) {
849                 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
850                 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
851
852                 chain = cluster->array[i].chain;
853                 if (chain && chain->error) {
854                         if (cluster->focus == NULL || cluster->focus == chain) {
855                                 /* error will be overridden by valid focus */
856                                 cluster->error = chain->error;
857                         }
858
859                         /*
860                          * Must count total masters and slaves whether the
861                          * chain is errored or not.
862                          */
863                         switch (cluster->pmp->pfs_types[i]) {
864                         case HAMMER2_PFSTYPE_MASTER:
865                                 ++ttlmasters;
866                                 break;
867                         case HAMMER2_PFSTYPE_SLAVE:
868                                 ++ttlslaves;
869                                 break;
870                         }
871                         continue;
872                 }
873                 switch (cluster->pmp->pfs_types[i]) {
874                 case HAMMER2_PFSTYPE_MASTER:
875                         ++ttlmasters;
876                         break;
877                 case HAMMER2_PFSTYPE_SLAVE:
878                         ++ttlslaves;
879                         break;
880                 case HAMMER2_PFSTYPE_SOFT_MASTER:
881                         nflags |= HAMMER2_CLUSTER_WRSOFT;
882                         nflags |= HAMMER2_CLUSTER_RDSOFT;
883                         smpresent = 1;
884                         break;
885                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
886                         nflags |= HAMMER2_CLUSTER_RDSOFT;
887                         break;
888                 case HAMMER2_PFSTYPE_SUPROOT:
889                         /*
890                          * Degenerate cluster representing the super-root
891                          * topology on a single device.  Fake stuff so
892                          * cluster ops work as expected.
893                          */
894                         nflags |= HAMMER2_CLUSTER_WRHARD;
895                         nflags |= HAMMER2_CLUSTER_RDHARD;
896                         cluster->focus_index = i;
897                         cluster->focus = chain;
898                         cluster->error = chain ? chain->error : 0;
899                         break;
900                 default:
901                         break;
902                 }
903         }
904
905         /*
906          * Pass 2
907          *
908          * Resolve nmasters             - master nodes fully match
909          *
910          * Resolve umasters             - master nodes operation still
911          *                                in progress
912          *
913          * Resolve nmasters_keymatch    - master nodes match the passed-in
914          *                                key and may or may not match
915          *                                the quorum-agreed tid.
916          * 
917          * The quorum-agreed TID is the highest matching TID.
918          */
919         last_best_quorum_tid = HAMMER2_TID_MAX;
920         quorum_tid = 0;         /* fix gcc warning */
921
922         while (nmasters < nquorum && last_best_quorum_tid != 0) {
923                 nmasters = 0;
924                 quorum_tid = 0;
925
926                 for (i = 0; i < cluster->nchains; ++i) {
927                         /* XXX SOFT smpresent handling */
928                         if (cluster->pmp->pfs_types[i] !=
929                             HAMMER2_PFSTYPE_MASTER) {
930                                 continue;
931                         }
932
933                         chain = cluster->array[i].chain;
934
935                         /*
936                          * Skip elements still in progress.  umasters keeps
937                          * track of masters that might still be in-progress.
938                          */
939                         if (chain == NULL && (cluster->array[i].flags &
940                                               HAMMER2_CITEM_NULL) == 0) {
941                                 ++umasters;
942                                 continue;
943                         }
944
945                         /*
946                          * Key match?
947                          */
948                         if (flags & HAMMER2_CHECK_NULL) {
949                                 if (chain == NULL) {
950                                         ++nmasters;
951                                         ++nmasters_keymatch;
952                                 }
953                         } else if (chain && chain->bref.key == key) {
954                                 ++nmasters_keymatch;
955                                 if (quorum_tid < last_best_quorum_tid &&
956                                     (quorum_tid < chain->bref.modify_tid ||
957                                      nmasters == 0)) {
958                                         /*
959                                          * Better TID located, reset
960                                          * nmasters count.
961                                          */
962                                         nmasters = 0;
963                                         quorum_tid = chain->bref.modify_tid;
964                                 }
965                                 if (quorum_tid == chain->bref.modify_tid) {
966                                         /*
967                                          * TID matches current collection.
968                                          */
969                                         ++nmasters;
970                                         if (chain->error == 0) {
971                                                 cluster->focus = chain;
972                                                 cluster->focus_index = i;
973                                         }
974                                 }
975                         }
976                 }
977                 if (nmasters >= nquorum)
978                         break;
979                 last_best_quorum_tid = quorum_tid;
980         }
981
982         /*
983         kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
984                 nmasters, nquorum, nmasters_keymatch, umasters);
985         */
986
987         /*
988          * Early return if we do not have enough masters.
989          */
990         if (nmasters < nquorum) {
991                 if (nmasters + umasters >= nquorum)
992                         return EINPROGRESS;
993                 if (nmasters_keymatch < nquorum) 
994                         return ESRCH;
995                 return EDEADLK;
996         }
997
998         /*
999          * Validated end of scan.
1000          */
1001         if (flags & HAMMER2_CHECK_NULL)
1002                 return ENOENT;
1003
1004         /*
1005          * If we have a NULL focus at this point the agreeing quorum all
1006          * had chain errors.
1007          */
1008         if (cluster->focus == NULL)
1009                 return EIO;
1010
1011         /*
1012          * Pass 3
1013          *
1014          * We have quorum agreement, validate elements, not end of scan.
1015          */
1016         for (i = 0; i < cluster->nchains; ++i) {
1017                 chain = cluster->array[i].chain;
1018                 if (chain == NULL ||
1019                     chain->bref.key != key ||
1020                     chain->bref.modify_tid != quorum_tid) {
1021                         continue;
1022                 }
1023
1024                 switch (cluster->pmp->pfs_types[i]) {
1025                 case HAMMER2_PFSTYPE_MASTER:
1026                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
1027                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1028                         nflags |= HAMMER2_CLUSTER_WRHARD;
1029                         nflags |= HAMMER2_CLUSTER_RDHARD;
1030                         break;
1031                 case HAMMER2_PFSTYPE_SLAVE:
1032                         /*
1033                          * We must have enough up-to-date masters to reach
1034                          * a quorum and the slave modify_tid must match the
1035                          * quorum's modify_tid.
1036                          *
1037                          * Do not select an errored slave.
1038                          */
1039                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1040                         nflags |= HAMMER2_CLUSTER_RDHARD;
1041                         ++nslaves;
1042                         break;
1043                 case HAMMER2_PFSTYPE_SOFT_MASTER:
1044                         /*
1045                          * Directly mounted soft master always wins.  There
1046                          * should be only one.
1047                          */
1048                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
1049                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1050                         break;
1051                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
1052                         /*
1053                          * Directly mounted soft slave always wins.  There
1054                          * should be only one.
1055                          *
1056                          * XXX
1057                          */
1058                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1059                         break;
1060                 case HAMMER2_PFSTYPE_SUPROOT:
1061                         /*
1062                          * spmp (degenerate case)
1063                          */
1064                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
1065                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1066                         break;
1067                 default:
1068                         break;
1069                 }
1070         }
1071
1072         /*
1073          * Focus now set, adjust ddflag.  Skip this pass if the focus
1074          * is bad or if we are at the PFS root (the bref won't match at
1075          * the PFS root, obviously).
1076          */
1077         focus = cluster->focus;
1078         if (focus) {
1079                 cluster->ddflag =
1080                         (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
1081         } else {
1082                 cluster->ddflag = 0;
1083                 goto skip4;
1084         }
1085         if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1086                 goto skip4;
1087
1088         /*
1089          * Pass 4
1090          *
1091          * Validate the elements that were not marked invalid.  They should
1092          * match.
1093          */
1094         for (i = 0; i < cluster->nchains; ++i) {
1095                 int ddflag;
1096
1097                 chain = cluster->array[i].chain;
1098
1099                 if (chain == NULL)
1100                         continue;
1101                 if (chain == focus)
1102                         continue;
1103                 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1104                         continue;
1105
1106                 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1107                 if (chain->bref.type != focus->bref.type ||
1108                     chain->bref.key != focus->bref.key ||
1109                     chain->bref.keybits != focus->bref.keybits ||
1110                     chain->bref.modify_tid != focus->bref.modify_tid ||
1111                     chain->bytes != focus->bytes ||
1112                     ddflag != cluster->ddflag) {
1113                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1114                         if (hammer2_debug & 1)
1115                         kprintf("cluster_resolve: matching modify_tid failed "
1116                                 "bref test: idx=%d type=%02x/%02x "
1117                                 "key=%016jx/%d-%016jx/%d "
1118                                 "mod=%016jx/%016jx bytes=%u/%u\n",
1119                                 i,
1120                                 chain->bref.type, focus->bref.type,
1121                                 chain->bref.key, chain->bref.keybits,
1122                                 focus->bref.key, focus->bref.keybits,
1123                                 chain->bref.modify_tid, focus->bref.modify_tid,
1124                                 chain->bytes, focus->bytes);
1125                         if (hammer2_debug & 0x4000)
1126                                 panic("cluster_resolve");
1127                         /* flag issue and force resync? */
1128                 }
1129         }
1130 skip4:
1131
1132         if (ttlslaves == 0)
1133                 nflags |= HAMMER2_CLUSTER_NOSOFT;
1134         if (ttlmasters == 0)
1135                 nflags |= HAMMER2_CLUSTER_NOHARD;
1136
1137         /*
1138          * Set SSYNCED or MSYNCED for slaves and masters respectively if
1139          * all available nodes (even if 0 are available) are fully
1140          * synchronized.  This is used by the synchronization thread to
1141          * determine if there is work it could potentially accomplish.
1142          */
1143         if (nslaves == ttlslaves)
1144                 nflags |= HAMMER2_CLUSTER_SSYNCED;
1145         if (nmasters == ttlmasters)
1146                 nflags |= HAMMER2_CLUSTER_MSYNCED;
1147
1148         /*
1149          * Determine if the cluster was successfully locked for the
1150          * requested operation and generate an error code.  The cluster
1151          * will not be locked (or ref'd) if an error is returned.
1152          *
1153          * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
1154          * to determine if reading or writing is possible.  If writing, the
1155          * cluster still requires a call to hammer2_cluster_modify() first.
1156          */
1157         atomic_set_int(&cluster->flags, nflags);
1158         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
1159
1160         return 0;
1161 }
1162
1163 /*
1164  * This is used by the sync thread to force non-NULL elements of a copy
1165  * of the pmp->iroot cluster to be good which is required to prime the
1166  * sync.
1167  */
1168 void
1169 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
1170 {
1171         int i;
1172
1173         for (i = 0; i < cluster->nchains; ++i) {
1174                 if (cluster->array[i].chain)
1175                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1176         }
1177 }
1178
1179 /*
1180  * Copy a cluster, returned a ref'd cluster.  All underlying chains
1181  * are also ref'd, but not locked.  Focus state is also copied.
1182  *
1183  * Original cluster does not have to be locked but usually is.
1184  * New cluster will not be flagged as locked.
1185  *
1186  * Callers using this function to initialize a new cluster from an inode
1187  * generally lock and resolve the resulting cluster.
1188  *
1189  * Callers which use this function to save/restore a cluster structure
1190  * generally retain the focus state and do not re-resolve it.  Caller should
1191  * not try to re-resolve internal (cparent) node state during an iteration
1192  * as the individual tracking elements of cparent in an iteration may not
1193  * match even though they are correct.
1194  */
1195 hammer2_cluster_t *
1196 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
1197 {
1198         hammer2_pfs_t *pmp = ocluster->pmp;
1199         hammer2_cluster_t *ncluster;
1200         hammer2_chain_t *chain;
1201         int i;
1202
1203         ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
1204         ncluster->pmp = pmp;
1205         ncluster->nchains = ocluster->nchains;
1206         ncluster->refs = 1;
1207
1208         for (i = 0; i < ocluster->nchains; ++i) {
1209                 chain = ocluster->array[i].chain;
1210                 ncluster->array[i].chain = chain;
1211                 ncluster->array[i].flags = ocluster->array[i].flags;
1212                 if (chain)
1213                         hammer2_chain_ref(chain);
1214         }
1215         ncluster->focus_index = ocluster->focus_index;
1216         ncluster->focus = ocluster->focus;
1217         ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED |
1218                                               HAMMER2_CLUSTER_INODE);
1219
1220         return (ncluster);
1221 }
1222
1223 /*
1224  * Unlock a cluster.  Refcount and focus is maintained.
1225  */
1226 void
1227 hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx)
1228 {
1229         hammer2_chain_t *chain;
1230         int i;
1231
1232         if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
1233                 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1234                         cluster);
1235         }
1236         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
1237         KKASSERT(cluster->refs > 0);
1238         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
1239
1240         for (i = 0; i < cluster->nchains; ++i) {
1241                 if (i == idx)
1242                         continue;
1243                 chain = cluster->array[i].chain;
1244                 if (chain)
1245                         hammer2_chain_unlock(chain);
1246         }
1247 }
1248
1249 void
1250 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
1251 {
1252         hammer2_cluster_unlock_except(cluster, -1);
1253 }
1254
1255 /*
1256  * Resize the cluster's physical storage allocation in-place.  This may
1257  * replace the cluster's chains.
1258  */
1259 void
1260 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
1261                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1262                        int nradix, int flags)
1263 {
1264         hammer2_chain_t *chain;
1265         int i;
1266
1267         KKASSERT(cparent->pmp == cluster->pmp);         /* can be NULL */
1268         KKASSERT(cparent->nchains == cluster->nchains);
1269
1270         for (i = 0; i < cluster->nchains; ++i) {
1271                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1272                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1273                         continue;
1274                 }
1275                 chain = cluster->array[i].chain;
1276                 if (chain) {
1277                         KKASSERT(cparent->array[i].chain);
1278                         hammer2_chain_resize(trans, ip,
1279                                              cparent->array[i].chain, chain,
1280                                              nradix, flags);
1281                 }
1282         }
1283 }
1284
1285 /*
1286  * Set an inode's cluster modified, marking the related chains RW and
1287  * duplicating them if necessary.
1288  *
1289  * The passed-in chain is a localized copy of the chain previously acquired
1290  * when the inode was locked (and possilby replaced in the mean time), and
1291  * must also be updated.  In fact, we update it first and then synchronize
1292  * the inode's cluster cache.
1293  */
1294 hammer2_inode_data_t *
1295 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
1296                           hammer2_cluster_t *cluster, int flags)
1297 {
1298         hammer2_inode_modify(trans, ip);
1299         hammer2_cluster_modify(trans, cluster, flags);
1300         hammer2_inode_repoint(ip, NULL, cluster);
1301         return (&hammer2_cluster_wdata(cluster)->ipdata);
1302 }
1303
1304 /*
1305  * Adjust the cluster's chains to allow modification and adjust the
1306  * focus.  Data will be accessible on return.
1307  *
1308  * If our focused master errors on modify, re-resolve the cluster to
1309  * try to select a different master.
1310  */
1311 void
1312 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
1313                        int flags)
1314 {
1315         hammer2_chain_t *chain;
1316         int resolve_again;
1317         int i;
1318
1319         resolve_again = 0;
1320         for (i = 0; i < cluster->nchains; ++i) {
1321                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1322                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1323                         continue;
1324                 }
1325                 chain = cluster->array[i].chain;
1326                 if (chain == NULL)
1327                         continue;
1328                 if (chain->error)
1329                         continue;
1330                 hammer2_chain_modify(trans, chain, flags);
1331                 if (cluster->focus == chain && chain->error) {
1332                         cluster->error = chain->error;
1333                         resolve_again = 1;
1334                 }
1335         }
1336         if (resolve_again)
1337                 hammer2_cluster_resolve(cluster);
1338 }
1339
1340 /*
1341  * Synchronize modifications from the focus to other chains in a cluster.
1342  * Convenient because nominal API users can just modify the contents of the
1343  * focus (at least for non-blockref data).
1344  *
1345  * Nominal front-end operations only edit non-block-table data in a single
1346  * chain.  This code copies such modifications to the other chains in the
1347  * cluster.  Blocktable modifications are handled on a chain-by-chain basis
1348  * by both the frontend and the backend and will explode in fireworks if
1349  * blindly copied.
1350  */
1351 void
1352 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
1353 {
1354         hammer2_chain_t *focus;
1355         hammer2_chain_t *scan;
1356         const hammer2_inode_data_t *ripdata;
1357         hammer2_inode_data_t *wipdata;
1358         int i;
1359
1360         focus = cluster->focus;
1361         KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
1362
1363         for (i = 0; i < cluster->nchains; ++i) {
1364                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1365                         continue;
1366                 scan = cluster->array[i].chain;
1367                 if (scan == NULL || scan == focus)
1368                         continue;
1369                 if (scan->error)
1370                         continue;
1371                 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
1372                 KKASSERT(focus->bytes == scan->bytes &&
1373                          focus->bref.type == scan->bref.type);
1374                 switch(focus->bref.type) {
1375                 case HAMMER2_BREF_TYPE_INODE:
1376                         ripdata = &focus->data->ipdata;
1377                         wipdata = &scan->data->ipdata;
1378                         if ((ripdata->meta.op_flags &
1379                             HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1380                                 bcopy(ripdata, wipdata,
1381                                       offsetof(hammer2_inode_data_t, u));
1382                                 break;
1383                         }
1384                         /* fall through to full copy */
1385                 case HAMMER2_BREF_TYPE_DATA:
1386                         bcopy(focus->data, scan->data, focus->bytes);
1387                         break;
1388                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1389                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1390                 case HAMMER2_BREF_TYPE_FREEMAP:
1391                 case HAMMER2_BREF_TYPE_VOLUME:
1392                         panic("hammer2_cluster_modsync: illegal node type");
1393                         /* NOT REACHED */
1394                         break;
1395                 default:
1396                         panic("hammer2_cluster_modsync: unknown node type");
1397                         break;
1398                 }
1399         }
1400 }
1401
1402 /*
1403  * Lookup initialization/completion API.  Returns a locked, fully resolved
1404  * cluster with one ref.
1405  */
1406 hammer2_cluster_t *
1407 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
1408 {
1409         hammer2_cluster_t *cluster;
1410
1411         cluster = hammer2_cluster_copy(cparent);
1412         if (flags & HAMMER2_LOOKUP_SHARED) {
1413                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
1414                                               HAMMER2_RESOLVE_SHARED);
1415         } else {
1416                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1417         }
1418         hammer2_cluster_resolve(cluster);
1419
1420         return (cluster);
1421 }
1422
1423 void
1424 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
1425 {
1426         if (cparent) {
1427                 hammer2_cluster_unlock(cparent);
1428                 hammer2_cluster_drop(cparent);
1429         }
1430 }
1431
1432 /*
1433  * Locate first match or overlap under parent, return a new, locked, resolved
1434  * cluster with one ref.
1435  *
1436  * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1437  */
1438 hammer2_cluster_t *
1439 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
1440                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1441 {
1442         hammer2_pfs_t *pmp;
1443         hammer2_cluster_t *cluster;
1444         hammer2_chain_t *chain;
1445         hammer2_key_t key_accum;
1446         hammer2_key_t key_next;
1447         int null_count;
1448         int rflags;
1449         int i;
1450
1451         KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1452
1453         pmp = cparent->pmp;                             /* can be NULL */
1454         key_accum = *key_nextp;
1455         null_count = 0;
1456         if (flags & HAMMER2_LOOKUP_SHARED)
1457                 rflags = HAMMER2_RESOLVE_SHARED;
1458         else
1459                 rflags = 0;
1460
1461         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
1462         cluster->pmp = pmp;                             /* can be NULL */
1463         cluster->refs = 1;
1464         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1465                 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1466
1467         /*
1468          * Iterating earlier cluster elements with later elements still
1469          * locked is a problem, so we have to unlock the parent and then
1470          * re-lock as we go.
1471          */
1472         hammer2_cluster_unlock(cparent);
1473         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1474
1475         /*
1476          * Pass-1, issue lookups.
1477          */
1478         for (i = 0; i < cparent->nchains; ++i) {
1479                 cluster->array[i].flags = cparent->array[i].flags;
1480                 key_next = *key_nextp;
1481
1482                 /*
1483                  * Always relock the parent as we go.
1484                  */
1485                 if (cparent->array[i].chain) {
1486                         hammer2_chain_lock(cparent->array[i].chain, rflags);
1487                 }
1488
1489                 /*
1490                  * Nothing to base the lookup, or parent was not synchronized.
1491                  */
1492                 if (cparent->array[i].chain == NULL ||
1493                     (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
1494                         ++null_count;
1495                         continue;
1496                 }
1497
1498                 chain = hammer2_chain_lookup(&cparent->array[i].chain,
1499                                              &key_next,
1500                                              key_beg, key_end,
1501                                              &cparent->array[i].cache_index,
1502                                              flags);
1503                 cluster->array[i].chain = chain;
1504                 if (chain == NULL) {
1505                         ++null_count;
1506                 }
1507                 if (key_accum > key_next)
1508                         key_accum = key_next;
1509         }
1510
1511         /*
1512          * Cleanup
1513          */
1514         cluster->nchains = i;
1515         *key_nextp = key_accum;
1516
1517         /*
1518          * The cluster must be resolved, out of sync elements may be present.
1519          *
1520          * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1521          */
1522         if (null_count != i)
1523                 hammer2_cluster_resolve(cluster);
1524         if (null_count == i ||
1525             (cluster->focus == NULL &&
1526              (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1527                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1528                         hammer2_cluster_unlock(cluster);
1529                 hammer2_cluster_drop(cluster);
1530                 cluster = NULL;
1531         }
1532
1533         return (cluster);
1534 }
1535
1536 /*
1537  * Locate next match or overlap under parent, replace the passed-in cluster.
1538  * The returned cluster is a new, locked, resolved cluster with one ref.
1539  *
1540  * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1541  */
1542 hammer2_cluster_t *
1543 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1544                      hammer2_key_t *key_nextp,
1545                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1546 {
1547         hammer2_chain_t *ochain;
1548         hammer2_chain_t *nchain;
1549         hammer2_key_t key_accum;
1550         hammer2_key_t key_next;
1551         int parent_index;
1552         int cluster_index;
1553         int null_count;
1554         int rflags;
1555         int i;
1556
1557         KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1558
1559         key_accum = *key_nextp;
1560         null_count = 0;
1561         parent_index = cparent->focus_index;    /* save prior focus */
1562         cluster_index = cluster->focus_index;
1563         if (flags & HAMMER2_LOOKUP_SHARED)
1564                 rflags = HAMMER2_RESOLVE_SHARED;
1565         else
1566                 rflags = 0;
1567
1568         cluster->focus = NULL;          /* XXX needed any more? */
1569         /*cparent->focus = NULL;*/
1570         cluster->focus_index = 0;       /* XXX needed any more? */
1571         /*cparent->focus_index = 0;*/
1572
1573         cluster->ddflag = 0;
1574
1575         /*
1576          * The parent is always locked on entry, the iterator may be locked
1577          * depending on flags.
1578          *
1579          * We must temporarily unlock the passed-in clusters to avoid a
1580          * deadlock between elements of the cluster with other threads.
1581          * We will fixup the lock in the loop.
1582          *
1583          * Note that this will clear the focus.
1584          *
1585          * Reflag the clusters as locked, because we will relock them
1586          * as we go.
1587          */
1588         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1589                 hammer2_cluster_unlock(cluster);
1590                 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1591         }
1592         hammer2_cluster_unlock(cparent);
1593         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1594
1595         for (i = 0; i < cparent->nchains; ++i) {
1596                 key_next = *key_nextp;
1597                 ochain = cluster->array[i].chain;
1598
1599                 /*
1600                  * Always relock the parent as we go.
1601                  */
1602                 if (cparent->array[i].chain)
1603                         hammer2_chain_lock(cparent->array[i].chain, rflags);
1604
1605                 /*
1606                  * Nothing to iterate from.  These cases can occur under
1607                  * normal operations.  For example, during synchronization
1608                  * a slave might reach the end of its scan while records
1609                  * are still left on the master(s).
1610                  */
1611                 if (ochain == NULL) {
1612                         ++null_count;
1613                         continue;
1614                 }
1615                 if (cparent->array[i].chain == NULL ||
1616                     (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1617                     (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
1618                         /* ochain has not yet been relocked */
1619                         hammer2_chain_drop(ochain);
1620                         cluster->array[i].chain = NULL;
1621                         ++null_count;
1622                         continue;
1623                 }
1624
1625                 /*
1626                  * Relock the child if necessary.  Parent and child will then
1627                  * be locked as expected by hammer2_chain_next() and flags.
1628                  */
1629                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1630                         hammer2_chain_lock(ochain, rflags);
1631                 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1632                                             &key_next, key_beg, key_end,
1633                                             &cparent->array[i].cache_index,
1634                                             flags);
1635                 /* ochain now invalid but can still be used for focus check */
1636                 if (parent_index == i) {
1637                         cparent->focus_index = i;
1638                         cparent->focus = cparent->array[i].chain;
1639                 }
1640
1641                 cluster->array[i].chain = nchain;
1642                 if (nchain == NULL) {
1643                         ++null_count;
1644                 }
1645                 if (key_accum > key_next)
1646                         key_accum = key_next;
1647         }
1648
1649         /*
1650          * Cleanup
1651          */
1652         cluster->nchains = i;
1653         *key_nextp = key_accum;
1654
1655         /*
1656          * The cluster must be resolved, out of sync elements may be present.
1657          *
1658          * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1659          */
1660         if (null_count != i)
1661                 hammer2_cluster_resolve(cluster);
1662         if (null_count == i ||
1663             (cluster->focus == NULL &&
1664              (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1665                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1666                         hammer2_cluster_unlock(cluster);
1667                 hammer2_cluster_drop(cluster);
1668                 cluster = NULL;
1669         }
1670         return(cluster);
1671 }
1672
1673 /*
1674  * Advance just one chain in the cluster and recalculate the invalid bit.
1675  * The cluster index is allowed to be flagged invalid on input and is
1676  * recalculated on return.
1677  *
1678  * (used during synchronization to advance past a chain being deleted).
1679  *
1680  * The chain being advanced must not be the focus and the clusters in
1681  * question must have already passed normal cluster_lookup/cluster_next
1682  * checks.
1683  *
1684  * The cluster always remains intact on return, so void function.
1685  */
1686 void
1687 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1688                                   hammer2_cluster_t *cluster,
1689                                   hammer2_key_t *key_nextp,
1690                                   hammer2_key_t key_beg,
1691                                   hammer2_key_t key_end,
1692                                   int i, int flags)
1693 {
1694         hammer2_chain_t *ochain;
1695         hammer2_chain_t *nchain;
1696         hammer2_chain_t *focus;
1697         hammer2_key_t key_accum;
1698         hammer2_key_t key_next;
1699         int ddflag;
1700
1701         key_accum = *key_nextp;
1702         key_next = *key_nextp;
1703         ochain = cluster->array[i].chain;
1704         if (ochain == NULL)
1705                 goto done;
1706         KKASSERT(ochain != cluster->focus);
1707
1708         nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1709                                     &key_next, key_beg, key_end,
1710                                     &cparent->array[i].cache_index,
1711                                     flags);
1712         /* ochain now invalid */
1713         if (cparent->focus_index == i)
1714                 cparent->focus = cparent->array[i].chain;
1715
1716         /*
1717          * Install nchain.  Note that nchain can be NULL, and can also
1718          * be in an unlocked state depending on flags.
1719          */
1720         cluster->array[i].chain = nchain;
1721         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1722
1723         if (key_accum > key_next)
1724                 key_accum = key_next;
1725
1726         focus = cluster->focus;
1727         if (focus == NULL)
1728                 goto done;
1729         if (nchain == NULL)
1730                 goto done;
1731 #if 0
1732         if (nchain == focus)    /* ASSERTED NOT TRUE */
1733                 ...
1734 #endif
1735         ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1736         if (nchain->bref.type != focus->bref.type ||
1737             nchain->bref.key != focus->bref.key ||
1738             nchain->bref.keybits != focus->bref.keybits ||
1739             nchain->bref.modify_tid != focus->bref.modify_tid ||
1740             nchain->bytes != focus->bytes ||
1741             ddflag != cluster->ddflag) {
1742                 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1743         }
1744
1745 done:
1746         *key_nextp = key_accum;
1747 #if 0
1748         /*
1749          * For now don't re-resolve cluster->flags.
1750          */
1751         hammer2_cluster_resolve(cluster);
1752 #endif
1753 }
1754
1755 /*
1756  * Create a new cluster using the specified key
1757  */
1758 int
1759 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1760                      hammer2_cluster_t **clusterp,
1761                      hammer2_key_t key, int keybits,
1762                      int type, size_t bytes, int flags)
1763 {
1764         hammer2_cluster_t *cluster;
1765         hammer2_pfs_t *pmp;
1766         int error;
1767         int i;
1768
1769         pmp = trans->pmp;                               /* can be NULL */
1770
1771         if ((cluster = *clusterp) == NULL) {
1772                 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1773                                   M_WAITOK | M_ZERO);
1774                 cluster->pmp = pmp;                     /* can be NULL */
1775                 cluster->refs = 1;
1776                 cluster->flags = HAMMER2_CLUSTER_LOCKED;
1777         }
1778         cluster->focus_index = 0;
1779         cluster->focus = NULL;
1780
1781         /*
1782          * NOTE: cluster->array[] entries can initially be NULL.  If
1783          *       *clusterp is supplied, skip NULL entries, otherwise
1784          *       create new chains.
1785          */
1786         for (i = 0; i < cparent->nchains; ++i) {
1787                 if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1788                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1789                         continue;
1790                 }
1791                 if (*clusterp) {
1792                         if ((cluster->array[i].flags &
1793                              HAMMER2_CITEM_FEMOD) == 0) {
1794                                 cluster->array[i].flags |=
1795                                                 HAMMER2_CITEM_INVALID;
1796                                 continue;
1797                         }
1798                         if (cluster->array[i].chain == NULL)
1799                                 continue;
1800                 }
1801                 error = hammer2_chain_create(trans, &cparent->array[i].chain,
1802                                              &cluster->array[i].chain, pmp,
1803                                              key, keybits,
1804                                              type, bytes, flags);
1805                 if (cparent->focus_index == i)
1806                         cparent->focus = cparent->array[i].chain;
1807                 KKASSERT(error == 0);
1808                 if (cluster->focus == NULL) {
1809                         cluster->focus_index = i;
1810                         cluster->focus = cluster->array[i].chain;
1811                 }
1812                 if (cparent->focus == cparent->array[i].chain) {
1813                         cluster->focus_index = i;
1814                         cluster->focus = cluster->array[i].chain;
1815                 }
1816         }
1817         cluster->nchains = i;
1818         *clusterp = cluster;
1819         hammer2_cluster_resolve(cluster);
1820
1821         return error;
1822 }
1823
1824 /*
1825  * Rename a cluster to a new parent.
1826  *
1827  * WARNING! Any passed-in bref is probaly from hammer2_cluster_bref(),
1828  *          So the data_off field is not relevant.  Only the key and
1829  *          keybits are used.
1830  */
1831 void
1832 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
1833                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1834                        int flags)
1835 {
1836         hammer2_chain_t *chain;
1837         hammer2_blockref_t xbref;
1838         int i;
1839
1840 #if 0
1841         cluster->focus = NULL;
1842         cparent->focus = NULL;
1843         cluster->focus_index = 0;
1844         cparent->focus_index = 0;
1845 #endif
1846
1847         for (i = 0; i < cluster->nchains; ++i) {
1848                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1849                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1850                         continue;
1851                 }
1852                 chain = cluster->array[i].chain;
1853                 if (chain) {
1854                         if (bref) {
1855                                 xbref = chain->bref;
1856                                 xbref.key = bref->key;
1857                                 xbref.keybits = bref->keybits;
1858                                 hammer2_chain_rename(trans, &xbref,
1859                                                      &cparent->array[i].chain,
1860                                                      chain, flags);
1861                         } else {
1862                                 hammer2_chain_rename(trans, NULL,
1863                                                      &cparent->array[i].chain,
1864                                                      chain, flags);
1865                         }
1866                         if (cparent->focus_index == i)
1867                                 cparent->focus = cparent->array[i].chain;
1868                         KKASSERT(cluster->array[i].chain == chain); /*remove*/
1869                 }
1870         }
1871 }
1872
1873 /*
1874  * Mark a cluster deleted
1875  */
1876 void
1877 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1878                        hammer2_cluster_t *cluster, int flags)
1879 {
1880         hammer2_chain_t *chain;
1881         hammer2_chain_t *parent;
1882         int i;
1883
1884         if (cparent == NULL) {
1885                 kprintf("cparent is NULL\n");
1886                 return;
1887         }
1888
1889         for (i = 0; i < cluster->nchains; ++i) {
1890                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1891                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1892                         continue;
1893                 }
1894                 parent = cparent->array[i].chain;
1895                 chain = cluster->array[i].chain;
1896                 if (chain == NULL)
1897                         continue;
1898                 if (chain->parent != parent) {
1899                         kprintf("hammer2_cluster_delete: parent "
1900                                 "mismatch chain=%p parent=%p against=%p\n",
1901                                 chain, chain->parent, parent);
1902                 } else {
1903                         hammer2_chain_delete(trans, parent, chain, flags);
1904                 }
1905         }
1906 }
1907
1908 /*
1909  * Create a snapshot of the specified {parent, ochain} with the specified
1910  * label.  The originating hammer2_inode must be exclusively locked for
1911  * safety.
1912  *
1913  * The ioctl code has already synced the filesystem.
1914  */
1915 int
1916 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1917                        hammer2_ioc_pfs_t *pfs)
1918 {
1919         hammer2_dev_t *hmp;
1920         hammer2_cluster_t *ncluster;
1921         const hammer2_inode_data_t *ripdata;
1922         hammer2_inode_data_t *wipdata;
1923         hammer2_chain_t *nchain;
1924         hammer2_inode_t *nip;
1925         size_t name_len;
1926         hammer2_key_t lhc;
1927         struct vattr vat;
1928 #if 0
1929         uuid_t opfs_clid;
1930 #endif
1931         int error;
1932         int i;
1933
1934         kprintf("snapshot %s\n", pfs->name);
1935
1936         name_len = strlen(pfs->name);
1937         lhc = hammer2_dirhash(pfs->name, name_len);
1938
1939         /*
1940          * Get the clid
1941          */
1942         ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1943 #if 0
1944         opfs_clid = ripdata->meta.pfs_clid;
1945 #endif
1946         hmp = ocluster->focus->hmp;     /* XXX find synchronized local disk */
1947
1948         /*
1949          * Create the snapshot directory under the super-root
1950          *
1951          * Set PFS type, generate a unique filesystem id, and generate
1952          * a cluster id.  Use the same clid when snapshotting a PFS root,
1953          * which theoretically allows the snapshot to be used as part of
1954          * the same cluster (perhaps as a cache).
1955          *
1956          * Copy the (flushed) blockref array.  Theoretically we could use
1957          * chain_duplicate() but it becomes difficult to disentangle
1958          * the shared core so for now just brute-force it.
1959          */
1960         VATTR_NULL(&vat);
1961         vat.va_type = VDIR;
1962         vat.va_mode = 0755;
1963         ncluster = NULL;
1964         nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1965                                    proc0.p_ucred, pfs->name, name_len,
1966                                    &ncluster,
1967                                    HAMMER2_INSERT_PFSROOT, &error);
1968
1969         if (nip) {
1970                 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1971                 wipdata->meta.pfs_type = HAMMER2_PFSTYPE_MASTER;
1972                 wipdata->meta.pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1973                 wipdata->meta.op_flags |= HAMMER2_OPFLAG_PFSROOT;
1974                 kern_uuidgen(&wipdata->meta.pfs_fsid, 1);
1975
1976                 /*
1977                  * Give the snapshot its own private cluster.  As a snapshot
1978                  * no further synchronization with the original cluster will
1979                  * be done.
1980                  */
1981 #if 0
1982                 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1983                         wipdata->meta.pfs_clid = opfs_clid;
1984                 else
1985                         kern_uuidgen(&wipdata->meta.pfs_clid, 1);
1986 #endif
1987                 kern_uuidgen(&wipdata->meta.pfs_clid, 1);
1988
1989                 for (i = 0; i < ncluster->nchains; ++i) {
1990                         if ((ncluster->array[i].flags &
1991                              HAMMER2_CITEM_FEMOD) == 0) {
1992                                 ncluster->array[i].flags |=
1993                                         HAMMER2_CITEM_INVALID;
1994                                 continue;
1995                         }
1996                         nchain = ncluster->array[i].chain;
1997                         if (nchain)
1998                                 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1999                 }
2000
2001                 /* XXX hack blockset copy */
2002                 /* XXX doesn't work with real cluster */
2003                 KKASSERT(ocluster->nchains == 1);
2004                 wipdata->u.blockset = ripdata->u.blockset;
2005                 hammer2_cluster_modsync(ncluster);
2006                 for (i = 0; i < ncluster->nchains; ++i) {
2007                         nchain = ncluster->array[i].chain;
2008                         if (nchain)
2009                                 hammer2_flush(trans, nchain, 1);
2010                 }
2011                 hammer2_inode_unlock(nip, ncluster);
2012         }
2013         return (error);
2014 }
2015
2016 /*
2017  * Return locked parent cluster given a locked child.  The child remains
2018  * locked on return.  The new parent's focus follows the child's focus
2019  * and the parent is always resolved.
2020  *
2021  * We must temporarily unlock the passed-in cluster to avoid a deadlock
2022  * between elements of the cluster.
2023  *
2024  * We must not try to hammer2_cluster_resolve() cparent.  The individual
2025  * parent chains for the nodes are the correct parents for the cluster but
2026  * do not necessarily match, so resolve would likely implode.
2027  */
2028 hammer2_cluster_t *
2029 hammer2_cluster_parent(hammer2_cluster_t *cluster)
2030 {
2031         hammer2_cluster_t *cparent;
2032         int i;
2033
2034         cparent = hammer2_cluster_copy(cluster);
2035         hammer2_cluster_unlock(cluster);
2036
2037         for (i = 0; i < cparent->nchains; ++i) {
2038                 hammer2_chain_t *chain;
2039                 hammer2_chain_t *rchain;
2040
2041                 /*
2042                  * Calculate parent for each element.  Old chain has an extra
2043                  * ref for cparent but the lock remains with cluster.
2044                  */
2045                 chain = cparent->array[i].chain;
2046                 if (chain == NULL)
2047                         continue;
2048                 while ((rchain = chain->parent) != NULL) {
2049                         hammer2_chain_ref(rchain);
2050                         hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
2051                         if (chain->parent == rchain)
2052                                 break;
2053                         hammer2_chain_unlock(rchain);
2054                         hammer2_chain_drop(rchain);
2055                 }
2056                 cparent->array[i].chain = rchain;
2057                 hammer2_chain_drop(chain);
2058         }
2059         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
2060         /* hammer2_cluster_resolve(cparent); */
2061         hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
2062
2063         return cparent;
2064 }
2065
2066 /************************************************************************
2067  *                              CLUSTER I/O                             *
2068  ************************************************************************
2069  *
2070  *
2071  * WARNING! blockref[] array data is not universal.  These functions should
2072  *          only be used to access universal data.
2073  *
2074  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
2075  *          complete if necessary.  The I/O's should have already been
2076  *          initiated by the cluster_lock/chain_lock operation.
2077  *
2078  *          The cluster must already be in a modified state before wdata
2079  *          is called.  The data will already be available for this case.
2080  */
2081 const hammer2_media_data_t *
2082 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
2083 {
2084         KKASSERT(cluster->focus != NULL);
2085         return(cluster->focus->data);
2086 }
2087
2088 const hammer2_media_data_t *
2089 hammer2_cluster_rdata_bytes(hammer2_cluster_t *cluster, size_t *bytesp)
2090 {
2091         KKASSERT(cluster->focus != NULL);
2092         *bytesp = cluster->focus->bytes;
2093         return(cluster->focus->data);
2094 }
2095
2096 hammer2_media_data_t *
2097 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
2098 {
2099         KKASSERT(cluster->focus != NULL);
2100         KKASSERT(hammer2_cluster_modified(cluster));
2101         return(cluster->focus->data);
2102 }
2103
2104 /*
2105  * Load cluster data asynchronously with callback.
2106  *
2107  * The callback is made for the first validated data found, or NULL
2108  * if no valid data is available.
2109  *
2110  * NOTE! The cluster structure is either unique or serialized (e.g. embedded
2111  *       in the inode with an exclusive lock held), the chain structure may be
2112  *       shared.
2113  */
2114 void
2115 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
2116                            void (*callback)(hammer2_iocb_t *iocb), void *ptr)
2117 {
2118         hammer2_chain_t *chain;
2119         hammer2_iocb_t *iocb;
2120         hammer2_dev_t *hmp;
2121         hammer2_blockref_t *bref;
2122         int i;
2123
2124         i = cluster->focus_index;
2125         chain = cluster->focus;
2126
2127         iocb = &cluster->iocb;
2128         iocb->callback = callback;
2129         iocb->dio = NULL;               /* for already-validated case */
2130         iocb->cluster = cluster;
2131         iocb->chain = chain;
2132         iocb->ptr = ptr;
2133         iocb->lbase = (off_t)i;
2134         iocb->flags = 0;
2135         iocb->error = 0;
2136
2137         /*
2138          * Data already validated
2139          */
2140         if (chain->data) {
2141                 callback(iocb);
2142                 return;
2143         }
2144
2145         /*
2146          * We must resolve to a device buffer, either by issuing I/O or
2147          * by creating a zero-fill element.  We do not mark the buffer
2148          * dirty when creating a zero-fill element (the hammer2_chain_modify()
2149          * API must still be used to do that).
2150          *
2151          * The device buffer is variable-sized in powers of 2 down
2152          * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
2153          * chunk always contains buffers of the same size. (XXX)
2154          *
2155          * The minimum physical IO size may be larger than the variable
2156          * block size.
2157          *
2158          * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
2159          *            matches hammer2_devblksize()?  Or does the freemap's
2160          *            pre-zeroing handle the case for us?
2161          */
2162         bref = &chain->bref;
2163         hmp = chain->hmp;
2164
2165 #if 0
2166         /* handled by callback? <- TODO XXX even needed for loads? */
2167         /*
2168          * The getblk() optimization for a 100% overwrite can only be used
2169          * if the physical block size matches the request.
2170          */
2171         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
2172             chain->bytes == hammer2_devblksize(chain->bytes)) {
2173                 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
2174                 KKASSERT(error == 0);
2175                 iocb->dio = dio;
2176                 callback(iocb);
2177                 return;
2178         }
2179 #endif
2180
2181         /*
2182          * Otherwise issue a read
2183          */
2184         hammer2_adjreadcounter(&chain->bref, chain->bytes);
2185         hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
2186 }