hammer2 - Refactor frontend part 2/many
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *          locks and I/O, do quorum and/or master-slave processing, and
50  *          it must operate properly even if some nodes are broken (which
51  *          can also mean indefinite locks).
52  *
53  *                              CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *              hammer2_cluster_lock()
59  *              hammer2_cluster_parent()
60  *
61  *      - Most complex functions, quorum management on transaction ids.
62  *
63  *      - Locking and data accesses must be internally asynchronous.
64  *
65  *      - Validate and manage cache coherency primitives (cache state
66  *        is stored in chain topologies but must be validated by these
67  *        functions).
68  *
69  * (2) Lookups and Scans
70  *              hammer2_cluster_lookup()
71  *              hammer2_cluster_next()
72  *
73  *      - Depend on locking & data retrieval functions, but still complex.
74  *
75  *      - Must do quorum management on transaction ids.
76  *
77  *      - Lookup and Iteration ops Must be internally asynchronous.
78  *
79  * (3) Modifying Operations
80  *              hammer2_cluster_create()
81  *              hammer2_cluster_rename()
82  *              hammer2_cluster_delete()
83  *              hammer2_cluster_modify()
84  *              hammer2_cluster_modsync()
85  *
86  *      - Can usually punt on failures, operation continues unless quorum
87  *        is lost.  If quorum is lost, must wait for resynchronization
88  *        (depending on the management mode).
89  *
90  *      - Must disconnect node on failures (also not flush), remount, and
91  *        resynchronize.
92  *
93  *      - Network links (via kdmsg) are relatively easy to issue as the
94  *        complex underworkings of hammer2_chain.c don't have to messed
95  *        with (the protocol is at a higher level than block-level).
96  *
97  *      - Multiple local disk nodes (i.e. block devices) are another matter.
98  *        Chain operations have to be dispatched to per-node threads (xN)
99  *        because we can't asynchronize potentially very complex chain
100  *        operations in hammer2_chain.c (it would be a huge mess).
101  *
102  *        (these threads are also used to terminate incoming kdmsg ops from
103  *        other machines).
104  *
105  *      - Single-node filesystems do not use threads and will simply call
106  *        hammer2_chain.c functions directly.  This short-cut is handled
107  *        at the base of each cluster function.
108  */
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
115
116 #include "hammer2.h"
117
118 /*
119  * Returns non-zero if any chain in the cluster needs to be resized.
120  * Errored elements are not used in the calculation.
121  */
122 int
123 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
124 {
125         hammer2_chain_t *chain;
126         int i;
127
128         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
129         for (i = 0; i < cluster->nchains; ++i) {
130                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
131                         continue;
132                 chain = cluster->array[i].chain;
133                 if (chain == NULL)
134                         continue;
135                 if (chain->error)
136                         continue;
137                 if (chain->bytes != bytes)
138                         return 1;
139         }
140         return 0;
141 }
142
143 /*
144  * Returns the bref type of the cluster's foucs.
145  *
146  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
147  * The cluster must be locked.
148  */
149 uint8_t
150 hammer2_cluster_type(hammer2_cluster_t *cluster)
151 {
152         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
153         if (cluster->error == 0)
154                 return(cluster->focus->bref.type);
155         return 0;
156 }
157
158 /*
159  * Returns non-zero if the cluster's focus is flagged as being modified.
160  *
161  * If the cluster is errored, returns 0.
162  */
163 int
164 hammer2_cluster_modified(hammer2_cluster_t *cluster)
165 {
166         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
167         if (cluster->error == 0)
168                 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
169         return 0;
170 }
171
172 /*
173  * Returns the bref of the cluster's focus, sans any data-offset information
174  * (since offset information is per-node and wouldn't be useful).
175  *
176  * Callers use this function to access modify_tid, mirror_tid, type,
177  * key, and keybits.
178  *
179  * If the cluster is errored, returns an empty bref.
180  * The cluster must be locked.
181  */
182 void
183 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
184 {
185         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
186         if (cluster->error == 0) {
187                 *bref = cluster->focus->bref;
188                 bref->data_off = 0;
189         } else {
190                 bzero(bref, sizeof(*bref));
191         }
192 }
193
194 /*
195  * Flag the cluster for flushing recursively up to the root.  Despite the
196  * work it does, this is relatively benign.  It just makes sure that the
197  * flusher has top-down visibility to this cluster.
198  *
199  * Errored chains are not flagged for flushing.
200  *
201  * The cluster should probably be locked.
202  */
203 void
204 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
205 {
206         hammer2_chain_t *chain;
207         int i;
208
209         for (i = 0; i < cluster->nchains; ++i) {
210                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
211                         continue;
212                 chain = cluster->array[i].chain;
213                 if (chain == NULL)
214                         continue;
215                 if (chain->error)
216                         continue;
217                 hammer2_chain_setflush(trans, chain);
218         }
219 }
220
221 /*
222  * Set the check mode for the cluster.
223  * Errored elements of the cluster are ignored.
224  *
225  * The cluster must be locked and modified.
226  */
227 void
228 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
229                                 hammer2_cluster_t *cluster,
230                                 int check_algo)
231 {
232         hammer2_chain_t *chain;
233         int i;
234
235         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
236         for (i = 0; i < cluster->nchains; ++i) {
237                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
238                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
239                         continue;
240                 }
241                 chain = cluster->array[i].chain;
242                 if (chain == NULL)
243                         continue;
244                 if (chain->error)
245                         continue;
246                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
247                 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
248                 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
249         }
250 }
251
252 /*
253  * Create a degenerate cluster with one ref from a single locked chain.
254  * The returned cluster will be focused on the chain and inherit its
255  * error state.
256  *
257  * The chain's lock and reference are transfered to the new cluster, so
258  * the caller should not try to unlock the chain separately.
259  *
260  * We fake the flags.
261  */
262 hammer2_cluster_t *
263 hammer2_cluster_from_chain(hammer2_chain_t *chain)
264 {
265         hammer2_cluster_t *cluster;
266
267         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
268         cluster->array[0].chain = chain;
269         cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
270         cluster->nchains = 1;
271         cluster->focus = chain;
272         cluster->focus_index = 0;
273         cluster->pmp = chain->pmp;
274         cluster->refs = 1;
275         cluster->error = chain->error;
276         cluster->flags = HAMMER2_CLUSTER_LOCKED |
277                          HAMMER2_CLUSTER_WRHARD |
278                          HAMMER2_CLUSTER_RDHARD |
279                          HAMMER2_CLUSTER_MSYNCED |
280                          HAMMER2_CLUSTER_SSYNCED;
281
282         return cluster;
283 }
284
285 /*
286  * Add a reference to a cluster and its underlying chains.
287  *
288  * We must also ref the underlying chains in order to allow ref/unlock
289  * sequences to later re-lock.
290  */
291 void
292 hammer2_cluster_ref(hammer2_cluster_t *cluster)
293 {
294         atomic_add_int(&cluster->refs, 1);
295 }
296
297 /*
298  * Drop the caller's reference to the cluster.  When the ref count drops to
299  * zero this function frees the cluster and drops all underlying chains.
300  *
301  * In-progress read I/Os are typically detached from the cluster once the
302  * first one returns (the remaining stay attached to the DIOs but are then
303  * ignored and drop naturally).
304  */
305 void
306 hammer2_cluster_drop(hammer2_cluster_t *cluster)
307 {
308         hammer2_chain_t *chain;
309         int i;
310
311         KKASSERT(cluster->refs > 0);
312         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
313                 cluster->focus = NULL;          /* safety XXX chg to assert */
314                 cluster->focus_index = 0;
315
316                 for (i = 0; i < cluster->nchains; ++i) {
317                         chain = cluster->array[i].chain;
318                         if (chain) {
319                                 hammer2_chain_drop(chain);
320                                 cluster->array[i].chain = NULL; /* safety */
321                         }
322                 }
323                 cluster->nchains = 0;                           /* safety */
324
325                 kfree(cluster, M_HAMMER2);
326                 /* cluster is invalid */
327         }
328 }
329
330 void
331 hammer2_cluster_wait(hammer2_cluster_t *cluster)
332 {
333         tsleep(cluster->focus, 0, "h2clcw", 1);
334 }
335
336 /*
337  * Lock a cluster.  Cluster must already be referenced.  Focus is maintained. 
338  *
339  * WARNING! This function expects the caller to handle resolution of the
340  *          cluster.  We never re-resolve the cluster in this function,
341  *          because it might be used to temporarily unlock/relock a cparent
342  *          in an iteration or recursrion, and the cparents elements do not
343  *          necessarily match.
344  */
345 void
346 hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how)
347 {
348         hammer2_chain_t *chain;
349         int i;
350
351         /* cannot be on inode-embedded cluster template, must be on copy */
352         KKASSERT(cluster->refs > 0);
353         KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
354         if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
355                 panic("hammer2_cluster_lock: cluster %p already locked!\n",
356                         cluster);
357         }
358         atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
359
360         /*
361          * Lock chains and resolve state.
362          */
363         for (i = 0; i < cluster->nchains; ++i) {
364                 if (i == idx)
365                         continue;
366                 chain = cluster->array[i].chain;
367                 if (chain == NULL)
368                         continue;
369                 hammer2_chain_lock(chain, how);
370         }
371 }
372
373 void
374 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
375 {
376         hammer2_cluster_lock_except(cluster, -1, how);
377 }
378
379 /*
380  * Calculate the clustering state for the cluster and set its focus.
381  * This routine must be called with care.  For example, it should not
382  * normally be called after relocking a non-leaf cluster because parent
383  * clusters help iterations and each element might be at a slightly different
384  * indirect node (each node's topology is independently indexed).
385  *
386  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
387  * operations.  Typically this is only set on a quorum of MASTERs or
388  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
389  * is present, this bit is *not* set on a quorum of MASTERs.  The
390  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
391  * that create/modify/delete elements use it.
392  *
393  * The chains making up the cluster may be narrowed down based on quorum
394  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
395  * narrowed down to a single chain as long as the entire subtopology is known
396  * to be intact.  So, for example, we can narrow a read-only op to a single
397  * fast SLAVE but if we focus a CACHE chain we must still retain at least
398  * a SLAVE to ensure that the subtopology can be accessed.
399  *
400  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
401  * to be maintained once the topology is validated as-of the top level of
402  * the operation.
403  *
404  * If a failure occurs the operation must be aborted by higher-level code and
405  * retried. XXX
406  */
407 void
408 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
409 {
410         hammer2_chain_t *chain;
411         hammer2_chain_t *focus;
412         hammer2_pfs_t *pmp;
413         hammer2_tid_t quorum_tid;
414         hammer2_tid_t last_best_quorum_tid;
415         int focus_pfs_type;
416         uint32_t nflags;
417         int ttlmasters;
418         int ttlslaves;
419         int nmasters;
420         int nslaves;
421         int nquorum;
422         int smpresent;
423         int i;
424
425         cluster->error = 0;
426         cluster->focus = NULL;
427
428         focus_pfs_type = 0;
429         nflags = 0;
430         ttlmasters = 0;
431         ttlslaves = 0;
432         nmasters = 0;
433         nslaves = 0;
434
435         /*
436          * Calculate quorum
437          */
438         pmp = cluster->pmp;
439         KKASSERT(pmp != NULL || cluster->nchains == 0);
440         nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
441         smpresent = 0;
442
443         /*
444          * Pass 1
445          *
446          * NOTE: A NULL chain is not necessarily an error, it could be
447          *       e.g. a lookup failure or the end of an iteration.
448          *       Process normally.
449          */
450         for (i = 0; i < cluster->nchains; ++i) {
451                 chain = cluster->array[i].chain;
452                 if (chain && chain->error) {
453                         if (cluster->focus == NULL || cluster->focus == chain) {
454                                 /* error will be overridden by valid focus */
455                                 cluster->error = chain->error;
456                         }
457
458                         /*
459                          * Must count total masters and slaves whether the
460                          * chain is errored or not.
461                          */
462                         switch (cluster->pmp->pfs_types[i]) {
463                         case HAMMER2_PFSTYPE_MASTER:
464                                 ++ttlmasters;
465                                 break;
466                         case HAMMER2_PFSTYPE_SLAVE:
467                                 ++ttlslaves;
468                                 break;
469                         }
470                         continue;
471                 }
472                 switch (cluster->pmp->pfs_types[i]) {
473                 case HAMMER2_PFSTYPE_MASTER:
474                         ++ttlmasters;
475                         break;
476                 case HAMMER2_PFSTYPE_SLAVE:
477                         ++ttlslaves;
478                         break;
479                 case HAMMER2_PFSTYPE_SOFT_MASTER:
480                         nflags |= HAMMER2_CLUSTER_WRSOFT;
481                         nflags |= HAMMER2_CLUSTER_RDSOFT;
482                         smpresent = 1;
483                         break;
484                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
485                         nflags |= HAMMER2_CLUSTER_RDSOFT;
486                         break;
487                 case HAMMER2_PFSTYPE_SUPROOT:
488                         /*
489                          * Degenerate cluster representing the super-root
490                          * topology on a single device.  Fake stuff so
491                          * cluster ops work as expected.
492                          */
493                         nflags |= HAMMER2_CLUSTER_WRHARD;
494                         nflags |= HAMMER2_CLUSTER_RDHARD;
495                         cluster->focus_index = i;
496                         cluster->focus = chain;
497                         cluster->error = chain ? chain->error : 0;
498                         break;
499                 default:
500                         break;
501                 }
502         }
503
504         /*
505          * Pass 2
506          *
507          * Resolve masters.  Calculate nmasters for the highest matching
508          * TID, if a quorum cannot be attained try the next lower matching
509          * TID until we exhaust TIDs.
510          *
511          * NOTE: A NULL chain is not necessarily an error, it could be
512          *       e.g. a lookup failure or the end of an iteration.
513          *       Process normally.
514          */
515         last_best_quorum_tid = HAMMER2_TID_MAX;
516         quorum_tid = 0;         /* fix gcc warning */
517
518         while (nmasters < nquorum && last_best_quorum_tid != 0) {
519                 nmasters = 0;
520                 quorum_tid = 0;
521
522                 for (i = 0; i < cluster->nchains; ++i) {
523                         if (cluster->pmp->pfs_types[i] !=
524                             HAMMER2_PFSTYPE_MASTER) {
525                                 continue;
526                         }
527                         chain = cluster->array[i].chain;
528
529                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
530                                 /*
531                                  * Invalid as in unsynchronized, cannot be
532                                  * used to calculate the quorum.
533                                  */
534                         } else if (chain == NULL && quorum_tid == 0) {
535                                 /*
536                                  * NULL chain on master matches NULL chains
537                                  * on other masters.
538                                  */
539                                 ++nmasters;
540                         } else if (quorum_tid < last_best_quorum_tid &&
541                                    chain != NULL &&
542                                    (quorum_tid < chain->bref.modify_tid ||
543                                     nmasters == 0)) {
544                                 /*
545                                  * Better TID located, reset nmasters count.
546                                  */
547                                 nmasters = 1;
548                                 quorum_tid = chain->bref.modify_tid;
549                         } else if (chain &&
550                                    quorum_tid == chain->bref.modify_tid) {
551                                 /*
552                                  * TID matches current collection.
553                                  */
554                                 ++nmasters;
555                         }
556                 }
557                 if (nmasters >= nquorum)
558                         break;
559                 last_best_quorum_tid = quorum_tid;
560         }
561
562         /*
563          * Pass 3
564          *
565          * NOTE: A NULL chain is not necessarily an error, it could be
566          *       e.g. a lookup failure or the end of an iteration.
567          *       Process normally.
568          */
569         for (i = 0; i < cluster->nchains; ++i) {
570                 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
571                 chain = cluster->array[i].chain;
572                 if (chain && chain->error) {
573                         if (cluster->focus == NULL || cluster->focus == chain) {
574                                 /* error will be overridden by valid focus */
575                                 cluster->error = chain->error;
576                         }
577                         continue;
578                 }
579
580                 switch (cluster->pmp->pfs_types[i]) {
581                 case HAMMER2_PFSTYPE_MASTER:
582                         /*
583                          * We must have enough up-to-date masters to reach
584                          * a quorum and the master modify_tid must match
585                          * the quorum's modify_tid.
586                          *
587                          * Do not select an errored or out-of-sync master.
588                          */
589                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
590                                 nflags |= HAMMER2_CLUSTER_UNHARD;
591                         } else if (nmasters >= nquorum &&
592                                    (chain == NULL || chain->error == 0) &&
593                                    ((chain == NULL && quorum_tid == 0) ||
594                                     (chain != NULL && quorum_tid ==
595                                                   chain->bref.modify_tid))) {
596                                 nflags |= HAMMER2_CLUSTER_WRHARD;
597                                 nflags |= HAMMER2_CLUSTER_RDHARD;
598                                 if (!smpresent) {
599                                         cluster->array[i].flags |=
600                                                         HAMMER2_CITEM_FEMOD;
601                                 }
602                                 if (cluster->focus == NULL ||
603                                     focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
604                                         focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
605                                         cluster->focus_index = i;
606                                         cluster->focus = chain; /* NULL ok */
607                                         cluster->error = chain ? chain->error :
608                                                                  0;
609                                 }
610                         } else if (chain == NULL || chain->error == 0) {
611                                 nflags |= HAMMER2_CLUSTER_UNHARD;
612                         }
613                         break;
614                 case HAMMER2_PFSTYPE_SLAVE:
615                         /*
616                          * We must have enough up-to-date masters to reach
617                          * a quorum and the slave modify_tid must match the
618                          * quorum's modify_tid.
619                          *
620                          * Do not select an errored slave.
621                          */
622                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
623                                 nflags |= HAMMER2_CLUSTER_UNHARD;
624                         } else if (nmasters >= nquorum &&
625                                    (chain == NULL || chain->error == 0) &&
626                                    ((chain == NULL && quorum_tid == 0) ||
627                                     (chain && quorum_tid ==
628                                               chain->bref.modify_tid))) {
629                                 ++nslaves;
630                                 nflags |= HAMMER2_CLUSTER_RDHARD;
631 #if 0
632                                 /* XXX optimize for RESOLVE_RDONLY */
633                                 if (cluster->focus == NULL) {
634                                         focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
635                                         cluster->focus_index = i;
636                                         cluster->focus = chain; /* NULL ok */
637                                         cluster->error = chain ? chain->error :
638                                                                  0;
639                                 }
640 #endif
641                         } else if (chain == NULL || chain->error == 0) {
642                                 nflags |= HAMMER2_CLUSTER_UNSOFT;
643                         }
644                         break;
645                 case HAMMER2_PFSTYPE_SOFT_MASTER:
646                         /*
647                          * Directly mounted soft master always wins.  There
648                          * should be only one.
649                          */
650                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
651                         cluster->focus_index = i;
652                         cluster->focus = chain;
653                         cluster->error = chain ? chain->error : 0;
654                         focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
655                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
656                         break;
657                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
658                         /*
659                          * Directly mounted soft slave always wins.  There
660                          * should be only one.
661                          */
662                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
663                         if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
664                                 cluster->focus_index = i;
665                                 cluster->focus = chain;
666                                 cluster->error = chain ? chain->error : 0;
667                                 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
668                         }
669                         break;
670                 case HAMMER2_PFSTYPE_SUPROOT:
671                         /*
672                          * spmp (degenerate case)
673                          */
674                         KKASSERT(i == 0);
675                         cluster->focus_index = i;
676                         cluster->focus = chain;
677                         cluster->error = chain ? chain->error : 0;
678                         focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
679                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
680                         break;
681                 default:
682                         break;
683                 }
684         }
685
686         /*
687          * Focus now set, adjust ddflag.  Skip this pass if the focus
688          * is bad or if we are at the PFS root (the bref won't match at
689          * the PFS root, obviously).
690          */
691         focus = cluster->focus;
692         if (focus) {
693                 cluster->ddflag =
694                         (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
695         } else {
696                 cluster->ddflag = 0;
697                 goto skip4;
698         }
699         if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
700                 goto skip4;
701
702         /*
703          * Pass 4
704          *
705          * Validate the elements that were not marked invalid.  They should
706          * match.
707          */
708         for (i = 0; i < cluster->nchains; ++i) {
709                 int ddflag;
710
711                 chain = cluster->array[i].chain;
712
713                 if (chain == NULL)
714                         continue;
715                 if (chain == focus)
716                         continue;
717                 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
718                         continue;
719
720                 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
721                 if (chain->bref.type != focus->bref.type ||
722                     chain->bref.key != focus->bref.key ||
723                     chain->bref.keybits != focus->bref.keybits ||
724                     chain->bref.modify_tid != focus->bref.modify_tid ||
725                     chain->bytes != focus->bytes ||
726                     ddflag != cluster->ddflag) {
727                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
728                         if (hammer2_debug & 1)
729                         kprintf("cluster_resolve: matching modify_tid failed "
730                                 "bref test: idx=%d type=%02x/%02x "
731                                 "key=%016jx/%d-%016jx/%d "
732                                 "mod=%016jx/%016jx bytes=%u/%u\n",
733                                 i,
734                                 chain->bref.type, focus->bref.type,
735                                 chain->bref.key, chain->bref.keybits,
736                                 focus->bref.key, focus->bref.keybits,
737                                 chain->bref.modify_tid, focus->bref.modify_tid,
738                                 chain->bytes, focus->bytes);
739                         if (hammer2_debug & 0x4000)
740                                 panic("cluster_resolve");
741                         /* flag issue and force resync? */
742                 }
743         }
744 skip4:
745
746         if (ttlslaves == 0)
747                 nflags |= HAMMER2_CLUSTER_NOSOFT;
748         if (ttlmasters == 0)
749                 nflags |= HAMMER2_CLUSTER_NOHARD;
750
751         /*
752          * Set SSYNCED or MSYNCED for slaves and masters respectively if
753          * all available nodes (even if 0 are available) are fully
754          * synchronized.  This is used by the synchronization thread to
755          * determine if there is work it could potentially accomplish.
756          */
757         if (nslaves == ttlslaves)
758                 nflags |= HAMMER2_CLUSTER_SSYNCED;
759         if (nmasters == ttlmasters)
760                 nflags |= HAMMER2_CLUSTER_MSYNCED;
761
762         /*
763          * Determine if the cluster was successfully locked for the
764          * requested operation and generate an error code.  The cluster
765          * will not be locked (or ref'd) if an error is returned.
766          *
767          * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
768          * to determine if reading or writing is possible.  If writing, the
769          * cluster still requires a call to hammer2_cluster_modify() first.
770          */
771         atomic_set_int(&cluster->flags, nflags);
772         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
773 }
774
775 /*
776  * This is used by the sync thread to force non-NULL elements of a copy
777  * of the pmp->iroot cluster to be good which is required to prime the
778  * sync.
779  */
780 void
781 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
782 {
783         int i;
784
785         for (i = 0; i < cluster->nchains; ++i) {
786                 if (cluster->array[i].chain)
787                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
788         }
789 }
790
791 /*
792  * Copy a cluster, returned a ref'd cluster.  All underlying chains
793  * are also ref'd, but not locked.  Focus state is also copied.
794  *
795  * Original cluster does not have to be locked but usually is.
796  * New cluster will not be flagged as locked.
797  *
798  * Callers using this function to initialize a new cluster from an inode
799  * generally lock and resolve the resulting cluster.
800  *
801  * Callers which use this function to save/restore a cluster structure
802  * generally retain the focus state and do not re-resolve it.  Caller should
803  * not try to re-resolve internal (cparent) node state during an iteration
804  * as the individual tracking elements of cparent in an iteration may not
805  * match even though they are correct.
806  */
807 hammer2_cluster_t *
808 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
809 {
810         hammer2_pfs_t *pmp = ocluster->pmp;
811         hammer2_cluster_t *ncluster;
812         hammer2_chain_t *chain;
813         int i;
814
815         ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
816         ncluster->pmp = pmp;
817         ncluster->nchains = ocluster->nchains;
818         ncluster->refs = 1;
819
820         for (i = 0; i < ocluster->nchains; ++i) {
821                 chain = ocluster->array[i].chain;
822                 ncluster->array[i].chain = chain;
823                 ncluster->array[i].flags = ocluster->array[i].flags;
824                 if (chain)
825                         hammer2_chain_ref(chain);
826         }
827         ncluster->focus_index = ocluster->focus_index;
828         ncluster->focus = ocluster->focus;
829         ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED |
830                                               HAMMER2_CLUSTER_INODE);
831
832         return (ncluster);
833 }
834
835 /*
836  * Unlock a cluster.  Refcount and focus is maintained.
837  */
838 void
839 hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx)
840 {
841         hammer2_chain_t *chain;
842         int i;
843
844         if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
845                 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
846                         cluster);
847         }
848         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
849         KKASSERT(cluster->refs > 0);
850         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
851
852         for (i = 0; i < cluster->nchains; ++i) {
853                 if (i == idx)
854                         continue;
855                 chain = cluster->array[i].chain;
856                 if (chain)
857                         hammer2_chain_unlock(chain);
858         }
859 }
860
861 void
862 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
863 {
864         hammer2_cluster_unlock_except(cluster, -1);
865 }
866
867 /*
868  * Resize the cluster's physical storage allocation in-place.  This may
869  * replace the cluster's chains.
870  */
871 void
872 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
873                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
874                        int nradix, int flags)
875 {
876         hammer2_chain_t *chain;
877         int i;
878
879         KKASSERT(cparent->pmp == cluster->pmp);         /* can be NULL */
880         KKASSERT(cparent->nchains == cluster->nchains);
881
882         for (i = 0; i < cluster->nchains; ++i) {
883                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
884                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
885                         continue;
886                 }
887                 chain = cluster->array[i].chain;
888                 if (chain) {
889                         KKASSERT(cparent->array[i].chain);
890                         hammer2_chain_resize(trans, ip,
891                                              cparent->array[i].chain, chain,
892                                              nradix, flags);
893                 }
894         }
895 }
896
897 /*
898  * Set an inode's cluster modified, marking the related chains RW and
899  * duplicating them if necessary.
900  *
901  * The passed-in chain is a localized copy of the chain previously acquired
902  * when the inode was locked (and possilby replaced in the mean time), and
903  * must also be updated.  In fact, we update it first and then synchronize
904  * the inode's cluster cache.
905  */
906 hammer2_inode_data_t *
907 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
908                           hammer2_cluster_t *cluster, int flags)
909 {
910         atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
911         hammer2_cluster_modify(trans, cluster, flags);
912
913         hammer2_inode_repoint(ip, NULL, cluster);
914         if (ip->vp)
915                 vsetisdirty(ip->vp);
916         return (&hammer2_cluster_wdata(cluster)->ipdata);
917 }
918
919 /*
920  * Adjust the cluster's chains to allow modification and adjust the
921  * focus.  Data will be accessible on return.
922  *
923  * If our focused master errors on modify, re-resolve the cluster to
924  * try to select a different master.
925  */
926 void
927 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
928                        int flags)
929 {
930         hammer2_chain_t *chain;
931         int resolve_again;
932         int i;
933
934         resolve_again = 0;
935         for (i = 0; i < cluster->nchains; ++i) {
936                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
937                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
938                         continue;
939                 }
940                 chain = cluster->array[i].chain;
941                 if (chain == NULL)
942                         continue;
943                 if (chain->error)
944                         continue;
945                 hammer2_chain_modify(trans, chain, flags);
946                 if (cluster->focus == chain && chain->error) {
947                         cluster->error = chain->error;
948                         resolve_again = 1;
949                 }
950         }
951         if (resolve_again)
952                 hammer2_cluster_resolve(cluster);
953 }
954
955 /*
956  * Synchronize modifications from the focus to other chains in a cluster.
957  * Convenient because nominal API users can just modify the contents of the
958  * focus (at least for non-blockref data).
959  *
960  * Nominal front-end operations only edit non-block-table data in a single
961  * chain.  This code copies such modifications to the other chains in the
962  * cluster.  Blocktable modifications are handled on a chain-by-chain basis
963  * by both the frontend and the backend and will explode in fireworks if
964  * blindly copied.
965  */
966 void
967 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
968 {
969         hammer2_chain_t *focus;
970         hammer2_chain_t *scan;
971         const hammer2_inode_data_t *ripdata;
972         hammer2_inode_data_t *wipdata;
973         int i;
974
975         focus = cluster->focus;
976         KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
977
978         for (i = 0; i < cluster->nchains; ++i) {
979                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
980                         continue;
981                 scan = cluster->array[i].chain;
982                 if (scan == NULL || scan == focus)
983                         continue;
984                 if (scan->error)
985                         continue;
986                 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
987                 KKASSERT(focus->bytes == scan->bytes &&
988                          focus->bref.type == scan->bref.type);
989                 switch(focus->bref.type) {
990                 case HAMMER2_BREF_TYPE_INODE:
991                         ripdata = &focus->data->ipdata;
992                         wipdata = &scan->data->ipdata;
993                         if ((ripdata->op_flags &
994                             HAMMER2_OPFLAG_DIRECTDATA) == 0) {
995                                 bcopy(ripdata, wipdata,
996                                       offsetof(hammer2_inode_data_t, u));
997                                 break;
998                         }
999                         /* fall through to full copy */
1000                 case HAMMER2_BREF_TYPE_DATA:
1001                         bcopy(focus->data, scan->data, focus->bytes);
1002                         break;
1003                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1004                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1005                 case HAMMER2_BREF_TYPE_FREEMAP:
1006                 case HAMMER2_BREF_TYPE_VOLUME:
1007                         panic("hammer2_cluster_modsync: illegal node type");
1008                         /* NOT REACHED */
1009                         break;
1010                 default:
1011                         panic("hammer2_cluster_modsync: unknown node type");
1012                         break;
1013                 }
1014         }
1015 }
1016
1017 /*
1018  * Lookup initialization/completion API.  Returns a locked, fully resolved
1019  * cluster with one ref.
1020  */
1021 hammer2_cluster_t *
1022 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
1023 {
1024         hammer2_cluster_t *cluster;
1025
1026         cluster = hammer2_cluster_copy(cparent);
1027         if (flags & HAMMER2_LOOKUP_SHARED) {
1028                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
1029                                               HAMMER2_RESOLVE_SHARED);
1030         } else {
1031                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1032         }
1033         hammer2_cluster_resolve(cluster);
1034
1035         return (cluster);
1036 }
1037
1038 void
1039 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
1040 {
1041         if (cparent) {
1042                 hammer2_cluster_unlock(cparent);
1043                 hammer2_cluster_drop(cparent);
1044         }
1045 }
1046
1047 /*
1048  * Locate first match or overlap under parent, return a new, locked, resolved
1049  * cluster with one ref.
1050  *
1051  * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1052  */
1053 hammer2_cluster_t *
1054 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
1055                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1056 {
1057         hammer2_pfs_t *pmp;
1058         hammer2_cluster_t *cluster;
1059         hammer2_chain_t *chain;
1060         hammer2_key_t key_accum;
1061         hammer2_key_t key_next;
1062         int null_count;
1063         int rflags;
1064         int i;
1065
1066         KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1067
1068         pmp = cparent->pmp;                             /* can be NULL */
1069         key_accum = *key_nextp;
1070         null_count = 0;
1071         if (flags & HAMMER2_LOOKUP_SHARED)
1072                 rflags = HAMMER2_RESOLVE_SHARED;
1073         else
1074                 rflags = 0;
1075
1076         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
1077         cluster->pmp = pmp;                             /* can be NULL */
1078         cluster->refs = 1;
1079         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1080                 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1081
1082         /*
1083          * Iterating earlier cluster elements with later elements still
1084          * locked is a problem, so we have to unlock the parent and then
1085          * re-lock as we go.
1086          */
1087         hammer2_cluster_unlock(cparent);
1088         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1089
1090         /*
1091          * Pass-1, issue lookups.
1092          */
1093         for (i = 0; i < cparent->nchains; ++i) {
1094                 cluster->array[i].flags = cparent->array[i].flags;
1095                 key_next = *key_nextp;
1096
1097                 /*
1098                  * Always relock the parent as we go.
1099                  */
1100                 if (cparent->array[i].chain) {
1101                         hammer2_chain_lock(cparent->array[i].chain, rflags);
1102                 }
1103
1104                 /*
1105                  * Nothing to base the lookup, or parent was not synchronized.
1106                  */
1107                 if (cparent->array[i].chain == NULL ||
1108                     (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
1109                         ++null_count;
1110                         continue;
1111                 }
1112
1113                 chain = hammer2_chain_lookup(&cparent->array[i].chain,
1114                                              &key_next,
1115                                              key_beg, key_end,
1116                                              &cparent->array[i].cache_index,
1117                                              flags);
1118                 cluster->array[i].chain = chain;
1119                 if (chain == NULL) {
1120                         ++null_count;
1121                 }
1122                 if (key_accum > key_next)
1123                         key_accum = key_next;
1124         }
1125
1126         /*
1127          * Cleanup
1128          */
1129         cluster->nchains = i;
1130         *key_nextp = key_accum;
1131
1132         /*
1133          * The cluster must be resolved, out of sync elements may be present.
1134          *
1135          * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1136          */
1137         if (null_count != i)
1138                 hammer2_cluster_resolve(cluster);
1139         if (null_count == i ||
1140             (cluster->focus == NULL &&
1141              (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1142                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1143                         hammer2_cluster_unlock(cluster);
1144                 hammer2_cluster_drop(cluster);
1145                 cluster = NULL;
1146         }
1147
1148         return (cluster);
1149 }
1150
1151 /*
1152  * Locate next match or overlap under parent, replace the passed-in cluster.
1153  * The returned cluster is a new, locked, resolved cluster with one ref.
1154  *
1155  * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1156  */
1157 hammer2_cluster_t *
1158 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1159                      hammer2_key_t *key_nextp,
1160                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1161 {
1162         hammer2_chain_t *ochain;
1163         hammer2_chain_t *nchain;
1164         hammer2_key_t key_accum;
1165         hammer2_key_t key_next;
1166         int parent_index;
1167         int cluster_index;
1168         int null_count;
1169         int rflags;
1170         int i;
1171
1172         KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1173
1174         key_accum = *key_nextp;
1175         null_count = 0;
1176         parent_index = cparent->focus_index;    /* save prior focus */
1177         cluster_index = cluster->focus_index;
1178         if (flags & HAMMER2_LOOKUP_SHARED)
1179                 rflags = HAMMER2_RESOLVE_SHARED;
1180         else
1181                 rflags = 0;
1182
1183         cluster->focus = NULL;          /* XXX needed any more? */
1184         /*cparent->focus = NULL;*/
1185         cluster->focus_index = 0;       /* XXX needed any more? */
1186         /*cparent->focus_index = 0;*/
1187
1188         cluster->ddflag = 0;
1189
1190         /*
1191          * The parent is always locked on entry, the iterator may be locked
1192          * depending on flags.
1193          *
1194          * We must temporarily unlock the passed-in clusters to avoid a
1195          * deadlock between elements of the cluster with other threads.
1196          * We will fixup the lock in the loop.
1197          *
1198          * Note that this will clear the focus.
1199          *
1200          * Reflag the clusters as locked, because we will relock them
1201          * as we go.
1202          */
1203         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1204                 hammer2_cluster_unlock(cluster);
1205                 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1206         }
1207         hammer2_cluster_unlock(cparent);
1208         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1209
1210         for (i = 0; i < cparent->nchains; ++i) {
1211                 key_next = *key_nextp;
1212                 ochain = cluster->array[i].chain;
1213
1214                 /*
1215                  * Always relock the parent as we go.
1216                  */
1217                 if (cparent->array[i].chain)
1218                         hammer2_chain_lock(cparent->array[i].chain, rflags);
1219
1220                 /*
1221                  * Nothing to iterate from.  These cases can occur under
1222                  * normal operations.  For example, during synchronization
1223                  * a slave might reach the end of its scan while records
1224                  * are still left on the master(s).
1225                  */
1226                 if (ochain == NULL) {
1227                         ++null_count;
1228                         continue;
1229                 }
1230                 if (cparent->array[i].chain == NULL ||
1231                     (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1232                     (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
1233                         /* ochain has not yet been relocked */
1234                         hammer2_chain_drop(ochain);
1235                         cluster->array[i].chain = NULL;
1236                         ++null_count;
1237                         continue;
1238                 }
1239
1240                 /*
1241                  * Relock the child if necessary.  Parent and child will then
1242                  * be locked as expected by hammer2_chain_next() and flags.
1243                  */
1244                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1245                         hammer2_chain_lock(ochain, rflags);
1246                 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1247                                             &key_next, key_beg, key_end,
1248                                             &cparent->array[i].cache_index,
1249                                             flags);
1250                 /* ochain now invalid but can still be used for focus check */
1251                 if (parent_index == i) {
1252                         cparent->focus_index = i;
1253                         cparent->focus = cparent->array[i].chain;
1254                 }
1255
1256                 cluster->array[i].chain = nchain;
1257                 if (nchain == NULL) {
1258                         ++null_count;
1259                 }
1260                 if (key_accum > key_next)
1261                         key_accum = key_next;
1262         }
1263
1264         /*
1265          * Cleanup
1266          */
1267         cluster->nchains = i;
1268         *key_nextp = key_accum;
1269
1270         /*
1271          * The cluster must be resolved, out of sync elements may be present.
1272          *
1273          * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1274          */
1275         if (null_count != i)
1276                 hammer2_cluster_resolve(cluster);
1277         if (null_count == i ||
1278             (cluster->focus == NULL &&
1279              (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1280                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1281                         hammer2_cluster_unlock(cluster);
1282                 hammer2_cluster_drop(cluster);
1283                 cluster = NULL;
1284         }
1285         return(cluster);
1286 }
1287
1288 /*
1289  * Advance just one chain in the cluster and recalculate the invalid bit.
1290  * The cluster index is allowed to be flagged invalid on input and is
1291  * recalculated on return.
1292  *
1293  * (used during synchronization to advance past a chain being deleted).
1294  *
1295  * The chain being advanced must not be the focus and the clusters in
1296  * question must have already passed normal cluster_lookup/cluster_next
1297  * checks.
1298  *
1299  * The cluster always remains intact on return, so void function.
1300  */
1301 void
1302 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1303                                   hammer2_cluster_t *cluster,
1304                                   hammer2_key_t *key_nextp,
1305                                   hammer2_key_t key_beg,
1306                                   hammer2_key_t key_end,
1307                                   int i, int flags)
1308 {
1309         hammer2_chain_t *ochain;
1310         hammer2_chain_t *nchain;
1311         hammer2_chain_t *focus;
1312         hammer2_key_t key_accum;
1313         hammer2_key_t key_next;
1314         int ddflag;
1315
1316         key_accum = *key_nextp;
1317         key_next = *key_nextp;
1318         ochain = cluster->array[i].chain;
1319         if (ochain == NULL)
1320                 goto done;
1321         KKASSERT(ochain != cluster->focus);
1322
1323         nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1324                                     &key_next, key_beg, key_end,
1325                                     &cparent->array[i].cache_index,
1326                                     flags);
1327         /* ochain now invalid */
1328         if (cparent->focus_index == i)
1329                 cparent->focus = cparent->array[i].chain;
1330
1331         /*
1332          * Install nchain.  Note that nchain can be NULL, and can also
1333          * be in an unlocked state depending on flags.
1334          */
1335         cluster->array[i].chain = nchain;
1336         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1337
1338         if (key_accum > key_next)
1339                 key_accum = key_next;
1340
1341         focus = cluster->focus;
1342         if (focus == NULL)
1343                 goto done;
1344         if (nchain == NULL)
1345                 goto done;
1346 #if 0
1347         if (nchain == focus)    /* ASSERTED NOT TRUE */
1348                 ...
1349 #endif
1350         ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1351         if (nchain->bref.type != focus->bref.type ||
1352             nchain->bref.key != focus->bref.key ||
1353             nchain->bref.keybits != focus->bref.keybits ||
1354             nchain->bref.modify_tid != focus->bref.modify_tid ||
1355             nchain->bytes != focus->bytes ||
1356             ddflag != cluster->ddflag) {
1357                 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1358         }
1359
1360 done:
1361         *key_nextp = key_accum;
1362 #if 0
1363         /*
1364          * For now don't re-resolve cluster->flags.
1365          */
1366         hammer2_cluster_resolve(cluster);
1367 #endif
1368 }
1369
1370 /*
1371  * Create a new cluster using the specified key
1372  */
1373 int
1374 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1375                      hammer2_cluster_t **clusterp,
1376                      hammer2_key_t key, int keybits,
1377                      int type, size_t bytes, int flags)
1378 {
1379         hammer2_cluster_t *cluster;
1380         hammer2_pfs_t *pmp;
1381         int error;
1382         int i;
1383
1384         pmp = trans->pmp;                               /* can be NULL */
1385
1386         if ((cluster = *clusterp) == NULL) {
1387                 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1388                                   M_WAITOK | M_ZERO);
1389                 cluster->pmp = pmp;                     /* can be NULL */
1390                 cluster->refs = 1;
1391                 cluster->flags = HAMMER2_CLUSTER_LOCKED;
1392         }
1393         cluster->focus_index = 0;
1394         cluster->focus = NULL;
1395
1396         /*
1397          * NOTE: cluster->array[] entries can initially be NULL.  If
1398          *       *clusterp is supplied, skip NULL entries, otherwise
1399          *       create new chains.
1400          */
1401         for (i = 0; i < cparent->nchains; ++i) {
1402                 if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1403                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1404                         continue;
1405                 }
1406                 if (*clusterp) {
1407                         if ((cluster->array[i].flags &
1408                              HAMMER2_CITEM_FEMOD) == 0) {
1409                                 cluster->array[i].flags |=
1410                                                 HAMMER2_CITEM_INVALID;
1411                                 continue;
1412                         }
1413                         if (cluster->array[i].chain == NULL)
1414                                 continue;
1415                 }
1416                 error = hammer2_chain_create(trans, &cparent->array[i].chain,
1417                                              &cluster->array[i].chain, pmp,
1418                                              key, keybits,
1419                                              type, bytes, flags);
1420                 if (cparent->focus_index == i)
1421                         cparent->focus = cparent->array[i].chain;
1422                 KKASSERT(error == 0);
1423                 if (cluster->focus == NULL) {
1424                         cluster->focus_index = i;
1425                         cluster->focus = cluster->array[i].chain;
1426                 }
1427                 if (cparent->focus == cparent->array[i].chain) {
1428                         cluster->focus_index = i;
1429                         cluster->focus = cluster->array[i].chain;
1430                 }
1431         }
1432         cluster->nchains = i;
1433         *clusterp = cluster;
1434         hammer2_cluster_resolve(cluster);
1435
1436         return error;
1437 }
1438
1439 /*
1440  * Rename a cluster to a new parent.
1441  *
1442  * WARNING! Any passed-in bref is probaly from hammer2_cluster_bref(),
1443  *          So the data_off field is not relevant.  Only the key and
1444  *          keybits are used.
1445  */
1446 void
1447 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
1448                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1449                        int flags)
1450 {
1451         hammer2_chain_t *chain;
1452         hammer2_blockref_t xbref;
1453         int i;
1454
1455 #if 0
1456         cluster->focus = NULL;
1457         cparent->focus = NULL;
1458         cluster->focus_index = 0;
1459         cparent->focus_index = 0;
1460 #endif
1461
1462         for (i = 0; i < cluster->nchains; ++i) {
1463                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1464                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1465                         continue;
1466                 }
1467                 chain = cluster->array[i].chain;
1468                 if (chain) {
1469                         if (bref) {
1470                                 xbref = chain->bref;
1471                                 xbref.key = bref->key;
1472                                 xbref.keybits = bref->keybits;
1473                                 hammer2_chain_rename(trans, &xbref,
1474                                                      &cparent->array[i].chain,
1475                                                      chain, flags);
1476                         } else {
1477                                 hammer2_chain_rename(trans, NULL,
1478                                                      &cparent->array[i].chain,
1479                                                      chain, flags);
1480                         }
1481                         if (cparent->focus_index == i)
1482                                 cparent->focus = cparent->array[i].chain;
1483                         KKASSERT(cluster->array[i].chain == chain); /*remove*/
1484                 }
1485         }
1486 }
1487
1488 /*
1489  * Mark a cluster deleted
1490  */
1491 void
1492 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1493                        hammer2_cluster_t *cluster, int flags)
1494 {
1495         hammer2_chain_t *chain;
1496         hammer2_chain_t *parent;
1497         int i;
1498
1499         if (cparent == NULL) {
1500                 kprintf("cparent is NULL\n");
1501                 return;
1502         }
1503
1504         for (i = 0; i < cluster->nchains; ++i) {
1505                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1506                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1507                         continue;
1508                 }
1509                 parent = cparent->array[i].chain;
1510                 chain = cluster->array[i].chain;
1511                 if (chain == NULL)
1512                         continue;
1513                 if (chain->parent != parent) {
1514                         kprintf("hammer2_cluster_delete: parent "
1515                                 "mismatch chain=%p parent=%p against=%p\n",
1516                                 chain, chain->parent, parent);
1517                 } else {
1518                         hammer2_chain_delete(trans, parent, chain, flags);
1519                 }
1520         }
1521 }
1522
1523 /*
1524  * Create a snapshot of the specified {parent, ochain} with the specified
1525  * label.  The originating hammer2_inode must be exclusively locked for
1526  * safety.
1527  *
1528  * The ioctl code has already synced the filesystem.
1529  */
1530 int
1531 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1532                        hammer2_ioc_pfs_t *pfs)
1533 {
1534         hammer2_dev_t *hmp;
1535         hammer2_cluster_t *ncluster;
1536         const hammer2_inode_data_t *ripdata;
1537         hammer2_inode_data_t *wipdata;
1538         hammer2_chain_t *nchain;
1539         hammer2_inode_t *nip;
1540         size_t name_len;
1541         hammer2_key_t lhc;
1542         struct vattr vat;
1543 #if 0
1544         uuid_t opfs_clid;
1545 #endif
1546         int error;
1547         int i;
1548
1549         kprintf("snapshot %s\n", pfs->name);
1550
1551         name_len = strlen(pfs->name);
1552         lhc = hammer2_dirhash(pfs->name, name_len);
1553
1554         /*
1555          * Get the clid
1556          */
1557         ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1558 #if 0
1559         opfs_clid = ripdata->pfs_clid;
1560 #endif
1561         hmp = ocluster->focus->hmp;     /* XXX find synchronized local disk */
1562
1563         /*
1564          * Create the snapshot directory under the super-root
1565          *
1566          * Set PFS type, generate a unique filesystem id, and generate
1567          * a cluster id.  Use the same clid when snapshotting a PFS root,
1568          * which theoretically allows the snapshot to be used as part of
1569          * the same cluster (perhaps as a cache).
1570          *
1571          * Copy the (flushed) blockref array.  Theoretically we could use
1572          * chain_duplicate() but it becomes difficult to disentangle
1573          * the shared core so for now just brute-force it.
1574          */
1575         VATTR_NULL(&vat);
1576         vat.va_type = VDIR;
1577         vat.va_mode = 0755;
1578         ncluster = NULL;
1579         nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1580                                    proc0.p_ucred, pfs->name, name_len,
1581                                    &ncluster,
1582                                    HAMMER2_INSERT_PFSROOT, &error);
1583
1584         if (nip) {
1585                 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1586                 wipdata->pfs_type = HAMMER2_PFSTYPE_MASTER;
1587                 wipdata->pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1588                 wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT;
1589                 kern_uuidgen(&wipdata->pfs_fsid, 1);
1590
1591                 /*
1592                  * Give the snapshot its own private cluster.  As a snapshot
1593                  * no further synchronization with the original cluster will
1594                  * be done.
1595                  */
1596 #if 0
1597                 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1598                         wipdata->pfs_clid = opfs_clid;
1599                 else
1600                         kern_uuidgen(&wipdata->pfs_clid, 1);
1601 #endif
1602                 kern_uuidgen(&wipdata->pfs_clid, 1);
1603
1604                 for (i = 0; i < ncluster->nchains; ++i) {
1605                         if ((ncluster->array[i].flags &
1606                              HAMMER2_CITEM_FEMOD) == 0) {
1607                                 ncluster->array[i].flags |=
1608                                         HAMMER2_CITEM_INVALID;
1609                                 continue;
1610                         }
1611                         nchain = ncluster->array[i].chain;
1612                         if (nchain)
1613                                 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1614                 }
1615
1616                 /* XXX hack blockset copy */
1617                 /* XXX doesn't work with real cluster */
1618                 KKASSERT(ocluster->nchains == 1);
1619                 wipdata->u.blockset = ripdata->u.blockset;
1620                 hammer2_cluster_modsync(ncluster);
1621                 for (i = 0; i < ncluster->nchains; ++i) {
1622                         nchain = ncluster->array[i].chain;
1623                         if (nchain)
1624                                 hammer2_flush(trans, nchain, 1);
1625                 }
1626                 hammer2_inode_unlock(nip, ncluster);
1627         }
1628         return (error);
1629 }
1630
1631 /*
1632  * Return locked parent cluster given a locked child.  The child remains
1633  * locked on return.  The new parent's focus follows the child's focus
1634  * and the parent is always resolved.
1635  *
1636  * We must temporarily unlock the passed-in cluster to avoid a deadlock
1637  * between elements of the cluster.
1638  *
1639  * We must not try to hammer2_cluster_resolve() cparent.  The individual
1640  * parent chains for the nodes are the correct parents for the cluster but
1641  * do not necessarily match, so resolve would likely implode.
1642  */
1643 hammer2_cluster_t *
1644 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1645 {
1646         hammer2_cluster_t *cparent;
1647         int i;
1648
1649         cparent = hammer2_cluster_copy(cluster);
1650         hammer2_cluster_unlock(cluster);
1651
1652         for (i = 0; i < cparent->nchains; ++i) {
1653                 hammer2_chain_t *chain;
1654                 hammer2_chain_t *rchain;
1655
1656                 /*
1657                  * Calculate parent for each element.  Old chain has an extra
1658                  * ref for cparent but the lock remains with cluster.
1659                  */
1660                 chain = cparent->array[i].chain;
1661                 if (chain == NULL)
1662                         continue;
1663                 while ((rchain = chain->parent) != NULL) {
1664                         hammer2_chain_ref(rchain);
1665                         hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1666                         if (chain->parent == rchain)
1667                                 break;
1668                         hammer2_chain_unlock(rchain);
1669                         hammer2_chain_drop(rchain);
1670                 }
1671                 cparent->array[i].chain = rchain;
1672                 hammer2_chain_drop(chain);
1673         }
1674         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1675         /* hammer2_cluster_resolve(cparent); */
1676         hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1677
1678         return cparent;
1679 }
1680
1681 /************************************************************************
1682  *                              CLUSTER I/O                             *
1683  ************************************************************************
1684  *
1685  *
1686  * WARNING! blockref[] array data is not universal.  These functions should
1687  *          only be used to access universal data.
1688  *
1689  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1690  *          complete if necessary.  The I/O's should have already been
1691  *          initiated by the cluster_lock/chain_lock operation.
1692  *
1693  *          The cluster must already be in a modified state before wdata
1694  *          is called.  The data will already be available for this case.
1695  */
1696 const hammer2_media_data_t *
1697 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1698 {
1699         return(cluster->focus->data);
1700 }
1701
1702 hammer2_media_data_t *
1703 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1704 {
1705         KKASSERT(hammer2_cluster_modified(cluster));
1706         return(cluster->focus->data);
1707 }
1708
1709 /*
1710  * Load cluster data asynchronously with callback.
1711  *
1712  * The callback is made for the first validated data found, or NULL
1713  * if no valid data is available.
1714  *
1715  * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1716  *       in the inode with an exclusive lock held), the chain structure may be
1717  *       shared.
1718  */
1719 void
1720 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1721                            void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1722 {
1723         hammer2_chain_t *chain;
1724         hammer2_iocb_t *iocb;
1725         hammer2_dev_t *hmp;
1726         hammer2_blockref_t *bref;
1727         int i;
1728
1729         i = cluster->focus_index;
1730         chain = cluster->focus;
1731
1732         iocb = &cluster->iocb;
1733         iocb->callback = callback;
1734         iocb->dio = NULL;               /* for already-validated case */
1735         iocb->cluster = cluster;
1736         iocb->chain = chain;
1737         iocb->ptr = ptr;
1738         iocb->lbase = (off_t)i;
1739         iocb->flags = 0;
1740         iocb->error = 0;
1741
1742         /*
1743          * Data already validated
1744          */
1745         if (chain->data) {
1746                 callback(iocb);
1747                 return;
1748         }
1749
1750         /*
1751          * We must resolve to a device buffer, either by issuing I/O or
1752          * by creating a zero-fill element.  We do not mark the buffer
1753          * dirty when creating a zero-fill element (the hammer2_chain_modify()
1754          * API must still be used to do that).
1755          *
1756          * The device buffer is variable-sized in powers of 2 down
1757          * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
1758          * chunk always contains buffers of the same size. (XXX)
1759          *
1760          * The minimum physical IO size may be larger than the variable
1761          * block size.
1762          *
1763          * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
1764          *            matches hammer2_devblksize()?  Or does the freemap's
1765          *            pre-zeroing handle the case for us?
1766          */
1767         bref = &chain->bref;
1768         hmp = chain->hmp;
1769
1770 #if 0
1771         /* handled by callback? <- TODO XXX even needed for loads? */
1772         /*
1773          * The getblk() optimization for a 100% overwrite can only be used
1774          * if the physical block size matches the request.
1775          */
1776         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1777             chain->bytes == hammer2_devblksize(chain->bytes)) {
1778                 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1779                 KKASSERT(error == 0);
1780                 iocb->dio = dio;
1781                 callback(iocb);
1782                 return;
1783         }
1784 #endif
1785
1786         /*
1787          * Otherwise issue a read
1788          */
1789         hammer2_adjreadcounter(&chain->bref, chain->bytes);
1790         hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
1791 }