hammer2 - Refactor frontend part 14/many
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *          locks and I/O, do quorum and/or master-slave processing, and
50  *          it must operate properly even if some nodes are broken (which
51  *          can also mean indefinite locks).
52  *
53  *                              CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *              hammer2_cluster_lock()
59  *              hammer2_cluster_parent()
60  *
61  *      - Most complex functions, quorum management on transaction ids.
62  *
63  *      - Locking and data accesses must be internally asynchronous.
64  *
65  *      - Validate and manage cache coherency primitives (cache state
66  *        is stored in chain topologies but must be validated by these
67  *        functions).
68  *
69  * (2) Lookups and Scans
70  *              hammer2_cluster_lookup()
71  *              hammer2_cluster_next()
72  *
73  *      - Depend on locking & data retrieval functions, but still complex.
74  *
75  *      - Must do quorum management on transaction ids.
76  *
77  *      - Lookup and iteration ops must be internally asynchronous.
78  *
79  * (3) Modifying Operations
80  *              hammer2_cluster_create()
81  *              hammer2_cluster_rename()
82  *              hammer2_cluster_delete()
83  *              hammer2_cluster_modify()
84  *              hammer2_cluster_modsync()
85  *
86  *      - Can usually punt on failures, operation continues unless quorum
87  *        is lost.  If quorum is lost, must wait for resynchronization
88  *        (depending on the management mode).
89  *
90  *      - Must disconnect node on failures (also not flush), remount, and
91  *        resynchronize.
92  *
93  *      - Network links (via kdmsg) are relatively easy to issue as the
94  *        complex underworkings of hammer2_chain.c don't have to be
95  *        messed with (the protocol is at a higher level than block-level).
96  *
97  *      - Multiple local disk nodes (i.e. block devices) are another matter.
98  *        Chain operations have to be dispatched to per-node threads (xN)
99  *        because we can't asynchronize potentially very complex chain
100  *        operations in hammer2_chain.c (it would be a huge mess).
101  *
102  *        (these threads are also used to terminate incoming kdmsg ops from
103  *        other machines).
104  *
105  *      - Single-node filesystems do not use threads and will simply call
106  *        hammer2_chain.c functions directly.  This short-cut is handled
107  *        at the base of each cluster function.
108  */
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
115
116 #include "hammer2.h"
117
118 /*
119  * Returns non-zero if any chain in the cluster needs to be resized.
120  * Errored elements are not used in the calculation.
121  */
122 int
123 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
124 {
125         hammer2_chain_t *chain;
126         int i;
127
128         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
129         for (i = 0; i < cluster->nchains; ++i) {
130                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
131                         continue;
132                 chain = cluster->array[i].chain;
133                 if (chain == NULL)
134                         continue;
135                 if (chain->error)
136                         continue;
137                 if (chain->bytes != bytes)
138                         return 1;
139         }
140         return 0;
141 }
142
143 /*
144  * Returns the bref type of the cluster's foucs.
145  *
146  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
147  * The cluster must be locked.
148  */
149 uint8_t
150 hammer2_cluster_type(hammer2_cluster_t *cluster)
151 {
152         if (cluster->error == 0) {
153                 KKASSERT(cluster->focus != NULL);
154                 return(cluster->focus->bref.type);
155         }
156         return 0;
157 }
158
159 /*
160  * Returns non-zero if the cluster's focus is flagged as being modified.
161  *
162  * If the cluster is errored, returns 0.
163  */
164 int
165 hammer2_cluster_modified(hammer2_cluster_t *cluster)
166 {
167         if (cluster->error == 0) {
168                 KKASSERT(cluster->focus != NULL);
169                 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
170         }
171         return 0;
172 }
173
174 /*
175  * Returns the bref of the cluster's focus, sans any data-offset information
176  * (since offset information is per-node and wouldn't be useful).
177  *
178  * Callers use this function to access modify_tid, mirror_tid, type,
179  * key, and keybits.
180  *
181  * If the cluster is errored, returns an empty bref.
182  * The cluster must be locked.
183  */
184 void
185 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
186 {
187         if (cluster->error == 0) {
188                 KKASSERT(cluster->focus != NULL);
189                 *bref = cluster->focus->bref;
190                 bref->data_off = 0;
191         } else {
192                 bzero(bref, sizeof(*bref));
193         }
194 }
195
196 /*
197  * Flag the cluster for flushing recursively up to the root.  Despite the
198  * work it does, this is relatively benign.  It just makes sure that the
199  * flusher has top-down visibility to this cluster.
200  *
201  * Errored chains are not flagged for flushing.
202  *
203  * The cluster should probably be locked.
204  */
205 void
206 hammer2_cluster_setflush(hammer2_cluster_t *cluster)
207 {
208         hammer2_chain_t *chain;
209         int i;
210
211         for (i = 0; i < cluster->nchains; ++i) {
212                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
213                         continue;
214                 chain = cluster->array[i].chain;
215                 if (chain == NULL)
216                         continue;
217                 if (chain->error)
218                         continue;
219                 hammer2_chain_setflush(chain);
220         }
221 }
222
223 /*
224  * Set the check mode for the cluster.
225  * Errored elements of the cluster are ignored.
226  *
227  * The cluster must be locked and modified.
228  */
229 void
230 hammer2_cluster_setmethod_check(hammer2_cluster_t *cluster, int check_algo)
231 {
232         hammer2_chain_t *chain;
233         int i;
234
235         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
236         for (i = 0; i < cluster->nchains; ++i) {
237                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
238                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
239                         continue;
240                 }
241                 chain = cluster->array[i].chain;
242                 if (chain == NULL)
243                         continue;
244                 if (chain->error)
245                         continue;
246                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
247                 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
248                 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
249         }
250 }
251
252 /*
253  * Create a degenerate cluster with one ref from a single locked chain.
254  * The returned cluster will be focused on the chain and inherit its
255  * error state.
256  *
257  * The chain's lock and reference are transfered to the new cluster, so
258  * the caller should not try to unlock the chain separately.
259  *
260  * We fake the flags.
261  */
262 hammer2_cluster_t *
263 hammer2_cluster_from_chain(hammer2_chain_t *chain)
264 {
265         hammer2_cluster_t *cluster;
266
267         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
268         cluster->array[0].chain = chain;
269         cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
270         cluster->nchains = 1;
271         cluster->focus = chain;
272         cluster->focus_index = 0;
273         cluster->pmp = chain->pmp;
274         cluster->refs = 1;
275         cluster->error = chain->error;
276         cluster->flags = HAMMER2_CLUSTER_LOCKED |
277                          HAMMER2_CLUSTER_WRHARD |
278                          HAMMER2_CLUSTER_RDHARD |
279                          HAMMER2_CLUSTER_MSYNCED |
280                          HAMMER2_CLUSTER_SSYNCED;
281
282         return cluster;
283 }
284
285 /*
286  * Add a reference to a cluster and its underlying chains.
287  *
288  * We must also ref the underlying chains in order to allow ref/unlock
289  * sequences to later re-lock.
290  */
291 void
292 hammer2_cluster_ref(hammer2_cluster_t *cluster)
293 {
294         atomic_add_int(&cluster->refs, 1);
295 }
296
297 /*
298  * Drop the caller's reference to the cluster.  When the ref count drops to
299  * zero this function frees the cluster and drops all underlying chains.
300  *
301  * In-progress read I/Os are typically detached from the cluster once the
302  * first one returns (the remaining stay attached to the DIOs but are then
303  * ignored and drop naturally).
304  */
305 void
306 hammer2_cluster_drop(hammer2_cluster_t *cluster)
307 {
308         hammer2_chain_t *chain;
309         int i;
310
311         KKASSERT(cluster->refs > 0);
312         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
313                 cluster->focus = NULL;          /* safety XXX chg to assert */
314                 cluster->focus_index = 0;
315
316                 for (i = 0; i < cluster->nchains; ++i) {
317                         chain = cluster->array[i].chain;
318                         if (chain) {
319                                 hammer2_chain_drop(chain);
320                                 cluster->array[i].chain = NULL; /* safety */
321                         }
322                 }
323                 cluster->nchains = 0;                           /* safety */
324
325                 kfree(cluster, M_HAMMER2);
326                 /* cluster is invalid */
327         }
328 }
329
330 void
331 hammer2_cluster_wait(hammer2_cluster_t *cluster)
332 {
333         tsleep(cluster->focus, 0, "h2clcw", 1);
334 }
335
336 /*
337  * Lock a cluster.  Cluster must already be referenced.  Focus is maintained. 
338  *
339  * WARNING! This function expects the caller to handle resolution of the
340  *          cluster.  We never re-resolve the cluster in this function,
341  *          because it might be used to temporarily unlock/relock a cparent
342  *          in an iteration or recursrion, and the cparents elements do not
343  *          necessarily match.
344  */
345 void
346 hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how)
347 {
348         hammer2_chain_t *chain;
349         int i;
350
351         /* cannot be on inode-embedded cluster template, must be on copy */
352         KKASSERT(cluster->refs > 0);
353         KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
354         if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
355                 panic("hammer2_cluster_lock: cluster %p already locked!\n",
356                         cluster);
357         }
358         atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
359
360         /*
361          * Lock chains and resolve state.
362          */
363         for (i = 0; i < cluster->nchains; ++i) {
364                 if (i == idx)
365                         continue;
366                 chain = cluster->array[i].chain;
367                 if (chain == NULL)
368                         continue;
369                 hammer2_chain_lock(chain, how);
370         }
371 }
372
373 void
374 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
375 {
376         hammer2_cluster_lock_except(cluster, -1, how);
377 }
378
379 /*
380  * Calculate the clustering state for the cluster and set its focus.
381  * This routine must be called with care.  For example, it should not
382  * normally be called after relocking a non-leaf cluster because parent
383  * clusters help iterations and each element might be at a slightly different
384  * indirect node (each node's topology is independently indexed).
385  *
386  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
387  * operations.  Typically this is only set on a quorum of MASTERs or
388  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
389  * is present, this bit is *not* set on a quorum of MASTERs.  The
390  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
391  * that create/modify/delete elements use it.
392  *
393  * The chains making up the cluster may be narrowed down based on quorum
394  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
395  * narrowed down to a single chain as long as the entire subtopology is known
396  * to be intact.  So, for example, we can narrow a read-only op to a single
397  * fast SLAVE but if we focus a CACHE chain we must still retain at least
398  * a SLAVE to ensure that the subtopology can be accessed.
399  *
400  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
401  * to be maintained once the topology is validated as-of the top level of
402  * the operation.
403  *
404  * If a failure occurs the operation must be aborted by higher-level code and
405  * retried. XXX
406  */
407 void
408 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
409 {
410         hammer2_chain_t *chain;
411         hammer2_chain_t *focus;
412         hammer2_pfs_t *pmp;
413         hammer2_tid_t quorum_tid;
414         hammer2_tid_t last_best_quorum_tid;
415         int focus_pfs_type;
416         uint32_t nflags;
417         int ttlmasters;
418         int ttlslaves;
419         int nmasters;
420         int nslaves;
421         int nquorum;
422         int smpresent;
423         int i;
424
425         cluster->error = 0;
426         cluster->focus = NULL;
427
428         focus_pfs_type = 0;
429         nflags = 0;
430         ttlmasters = 0;
431         ttlslaves = 0;
432         nmasters = 0;
433         nslaves = 0;
434
435         /*
436          * Calculate quorum
437          */
438         pmp = cluster->pmp;
439         KKASSERT(pmp != NULL || cluster->nchains == 0);
440         nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
441         smpresent = 0;
442
443         /*
444          * Pass 1
445          *
446          * NOTE: A NULL chain is not necessarily an error, it could be
447          *       e.g. a lookup failure or the end of an iteration.
448          *       Process normally.
449          */
450         for (i = 0; i < cluster->nchains; ++i) {
451                 chain = cluster->array[i].chain;
452                 if (chain && chain->error) {
453                         if (cluster->focus == NULL || cluster->focus == chain) {
454                                 /* error will be overridden by valid focus */
455                                 cluster->error = chain->error;
456                         }
457
458                         /*
459                          * Must count total masters and slaves whether the
460                          * chain is errored or not.
461                          */
462                         switch (cluster->pmp->pfs_types[i]) {
463                         case HAMMER2_PFSTYPE_MASTER:
464                                 ++ttlmasters;
465                                 break;
466                         case HAMMER2_PFSTYPE_SLAVE:
467                                 ++ttlslaves;
468                                 break;
469                         }
470                         continue;
471                 }
472                 switch (cluster->pmp->pfs_types[i]) {
473                 case HAMMER2_PFSTYPE_MASTER:
474                         ++ttlmasters;
475                         break;
476                 case HAMMER2_PFSTYPE_SLAVE:
477                         ++ttlslaves;
478                         break;
479                 case HAMMER2_PFSTYPE_SOFT_MASTER:
480                         nflags |= HAMMER2_CLUSTER_WRSOFT;
481                         nflags |= HAMMER2_CLUSTER_RDSOFT;
482                         smpresent = 1;
483                         break;
484                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
485                         nflags |= HAMMER2_CLUSTER_RDSOFT;
486                         break;
487                 case HAMMER2_PFSTYPE_SUPROOT:
488                         /*
489                          * Degenerate cluster representing the super-root
490                          * topology on a single device.  Fake stuff so
491                          * cluster ops work as expected.
492                          */
493                         nflags |= HAMMER2_CLUSTER_WRHARD;
494                         nflags |= HAMMER2_CLUSTER_RDHARD;
495                         cluster->focus_index = i;
496                         cluster->focus = chain;
497                         cluster->error = chain ? chain->error : 0;
498                         break;
499                 default:
500                         break;
501                 }
502         }
503
504         /*
505          * Pass 2
506          *
507          * Resolve masters.  Calculate nmasters for the highest matching
508          * TID, if a quorum cannot be attained try the next lower matching
509          * TID until we exhaust TIDs.
510          *
511          * NOTE: A NULL chain is not necessarily an error, it could be
512          *       e.g. a lookup failure or the end of an iteration.
513          *       Process normally.
514          */
515         last_best_quorum_tid = HAMMER2_TID_MAX;
516         quorum_tid = 0;         /* fix gcc warning */
517
518         while (nmasters < nquorum && last_best_quorum_tid != 0) {
519                 nmasters = 0;
520                 quorum_tid = 0;
521
522                 for (i = 0; i < cluster->nchains; ++i) {
523                         if (cluster->pmp->pfs_types[i] !=
524                             HAMMER2_PFSTYPE_MASTER) {
525                                 continue;
526                         }
527                         chain = cluster->array[i].chain;
528
529                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
530                                 /*
531                                  * Invalid as in unsynchronized, cannot be
532                                  * used to calculate the quorum.
533                                  */
534                         } else if (chain == NULL && quorum_tid == 0) {
535                                 /*
536                                  * NULL chain on master matches NULL chains
537                                  * on other masters.
538                                  */
539                                 ++nmasters;
540                         } else if (quorum_tid < last_best_quorum_tid &&
541                                    chain != NULL &&
542                                    (quorum_tid < chain->bref.modify_tid ||
543                                     nmasters == 0)) {
544                                 /*
545                                  * Better TID located, reset nmasters count.
546                                  */
547                                 nmasters = 1;
548                                 quorum_tid = chain->bref.modify_tid;
549                         } else if (chain &&
550                                    quorum_tid == chain->bref.modify_tid) {
551                                 /*
552                                  * TID matches current collection.
553                                  */
554                                 ++nmasters;
555                         }
556                 }
557                 if (nmasters >= nquorum)
558                         break;
559                 last_best_quorum_tid = quorum_tid;
560         }
561
562         /*
563          * Pass 3
564          *
565          * NOTE: A NULL chain is not necessarily an error, it could be
566          *       e.g. a lookup failure or the end of an iteration.
567          *       Process normally.
568          */
569         for (i = 0; i < cluster->nchains; ++i) {
570                 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
571                 chain = cluster->array[i].chain;
572                 if (chain && chain->error) {
573                         if (cluster->focus == NULL || cluster->focus == chain) {
574                                 /* error will be overridden by valid focus */
575                                 cluster->error = chain->error;
576                         }
577                         continue;
578                 }
579
580                 switch (cluster->pmp->pfs_types[i]) {
581                 case HAMMER2_PFSTYPE_MASTER:
582                         /*
583                          * We must have enough up-to-date masters to reach
584                          * a quorum and the master modify_tid must match
585                          * the quorum's modify_tid.
586                          *
587                          * Do not select an errored or out-of-sync master.
588                          */
589                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
590                                 nflags |= HAMMER2_CLUSTER_UNHARD;
591                         } else if (nmasters >= nquorum &&
592                                    (chain == NULL || chain->error == 0) &&
593                                    ((chain == NULL && quorum_tid == 0) ||
594                                     (chain != NULL && quorum_tid ==
595                                                   chain->bref.modify_tid))) {
596                                 nflags |= HAMMER2_CLUSTER_WRHARD;
597                                 nflags |= HAMMER2_CLUSTER_RDHARD;
598                                 if (!smpresent) {
599                                         cluster->array[i].flags |=
600                                                         HAMMER2_CITEM_FEMOD;
601                                 }
602                                 if (cluster->focus == NULL ||
603                                     focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
604                                         focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
605                                         cluster->focus_index = i;
606                                         cluster->focus = chain; /* NULL ok */
607                                         cluster->error = chain ? chain->error :
608                                                                  0;
609                                 }
610                         } else if (chain == NULL || chain->error == 0) {
611                                 nflags |= HAMMER2_CLUSTER_UNHARD;
612                         }
613                         break;
614                 case HAMMER2_PFSTYPE_SLAVE:
615                         /*
616                          * We must have enough up-to-date masters to reach
617                          * a quorum and the slave modify_tid must match the
618                          * quorum's modify_tid.
619                          *
620                          * Do not select an errored slave.
621                          */
622                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
623                                 nflags |= HAMMER2_CLUSTER_UNHARD;
624                         } else if (nmasters >= nquorum &&
625                                    (chain == NULL || chain->error == 0) &&
626                                    ((chain == NULL && quorum_tid == 0) ||
627                                     (chain && quorum_tid ==
628                                               chain->bref.modify_tid))) {
629                                 ++nslaves;
630                                 nflags |= HAMMER2_CLUSTER_RDHARD;
631 #if 0
632                                 /* XXX optimize for RESOLVE_RDONLY */
633                                 if (cluster->focus == NULL) {
634                                         focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
635                                         cluster->focus_index = i;
636                                         cluster->focus = chain; /* NULL ok */
637                                         cluster->error = chain ? chain->error :
638                                                                  0;
639                                 }
640 #endif
641                         } else if (chain == NULL || chain->error == 0) {
642                                 nflags |= HAMMER2_CLUSTER_UNSOFT;
643                         }
644                         break;
645                 case HAMMER2_PFSTYPE_SOFT_MASTER:
646                         /*
647                          * Directly mounted soft master always wins.  There
648                          * should be only one.
649                          */
650                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
651                         cluster->focus_index = i;
652                         cluster->focus = chain;
653                         cluster->error = chain ? chain->error : 0;
654                         focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
655                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
656                         break;
657                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
658                         /*
659                          * Directly mounted soft slave always wins.  There
660                          * should be only one.
661                          */
662                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
663                         if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
664                                 cluster->focus_index = i;
665                                 cluster->focus = chain;
666                                 cluster->error = chain ? chain->error : 0;
667                                 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
668                         }
669                         break;
670                 case HAMMER2_PFSTYPE_SUPROOT:
671                         /*
672                          * spmp (degenerate case)
673                          */
674                         KKASSERT(i == 0);
675                         cluster->focus_index = i;
676                         cluster->focus = chain;
677                         cluster->error = chain ? chain->error : 0;
678                         focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
679                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
680                         break;
681                 default:
682                         break;
683                 }
684         }
685
686         /*
687          * Focus now set, adjust ddflag.  Skip this pass if the focus
688          * is bad or if we are at the PFS root (the bref won't match at
689          * the PFS root, obviously).
690          */
691         focus = cluster->focus;
692         if (focus) {
693                 cluster->ddflag =
694                         (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
695         } else {
696                 cluster->ddflag = 0;
697                 goto skip4;
698         }
699         if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
700                 goto skip4;
701
702         /*
703          * Pass 4
704          *
705          * Validate the elements that were not marked invalid.  They should
706          * match.
707          */
708         for (i = 0; i < cluster->nchains; ++i) {
709                 int ddflag;
710
711                 chain = cluster->array[i].chain;
712
713                 if (chain == NULL)
714                         continue;
715                 if (chain == focus)
716                         continue;
717                 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
718                         continue;
719
720                 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
721                 if (chain->bref.type != focus->bref.type ||
722                     chain->bref.key != focus->bref.key ||
723                     chain->bref.keybits != focus->bref.keybits ||
724                     chain->bref.modify_tid != focus->bref.modify_tid ||
725                     chain->bytes != focus->bytes ||
726                     ddflag != cluster->ddflag) {
727                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
728                         if (hammer2_debug & 1)
729                         kprintf("cluster_resolve: matching modify_tid failed "
730                                 "bref test: idx=%d type=%02x/%02x "
731                                 "key=%016jx/%d-%016jx/%d "
732                                 "mod=%016jx/%016jx bytes=%u/%u\n",
733                                 i,
734                                 chain->bref.type, focus->bref.type,
735                                 chain->bref.key, chain->bref.keybits,
736                                 focus->bref.key, focus->bref.keybits,
737                                 chain->bref.modify_tid, focus->bref.modify_tid,
738                                 chain->bytes, focus->bytes);
739                         if (hammer2_debug & 0x4000)
740                                 panic("cluster_resolve");
741                         /* flag issue and force resync? */
742                 }
743         }
744 skip4:
745
746         if (ttlslaves == 0)
747                 nflags |= HAMMER2_CLUSTER_NOSOFT;
748         if (ttlmasters == 0)
749                 nflags |= HAMMER2_CLUSTER_NOHARD;
750
751         /*
752          * Set SSYNCED or MSYNCED for slaves and masters respectively if
753          * all available nodes (even if 0 are available) are fully
754          * synchronized.  This is used by the synchronization thread to
755          * determine if there is work it could potentially accomplish.
756          */
757         if (nslaves == ttlslaves)
758                 nflags |= HAMMER2_CLUSTER_SSYNCED;
759         if (nmasters == ttlmasters)
760                 nflags |= HAMMER2_CLUSTER_MSYNCED;
761
762         /*
763          * Determine if the cluster was successfully locked for the
764          * requested operation and generate an error code.  The cluster
765          * will not be locked (or ref'd) if an error is returned.
766          *
767          * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
768          * to determine if reading or writing is possible.  If writing, the
769          * cluster still requires a call to hammer2_cluster_modify() first.
770          */
771         atomic_set_int(&cluster->flags, nflags);
772         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
773 }
774
775 /*
776  * This is used by the XOPS subsystem to calculate the state of
777  * the collection and tell hammer2_xop_collect() what to do with it.
778  * The collection can be in various states of desynchronization, the
779  * caller specifically wants to resolve the passed-in key.
780  *
781  * Return values:
782  *      0               - Quorum agreement, key is valid
783  *
784  *      ENOENT          - Quorum agreement, end of scan
785  *
786  *      ESRCH           - Quorum agreement, key is INVALID (caller should
787  *                        skip key).
788  *
789  *      EIO             - Quorum agreement but all elements had errors.
790  *
791  *      EDEADLK         - No quorum agreement possible for key, a repair
792  *                        may be needed.  Caller has to decide what to do,
793  *                        possibly iterating the key or generating an EIO.
794  *
795  *      EINPROGRESS     - No quorum agreement yet, but agreement is still
796  *                        possible if caller waits for more responses.  Caller
797  *                        should not iterate key.
798  *
799  * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
800  */
801 int
802 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
803 {
804         hammer2_chain_t *chain;
805         hammer2_chain_t *focus;
806         hammer2_pfs_t *pmp;
807         hammer2_tid_t quorum_tid;
808         hammer2_tid_t last_best_quorum_tid;
809         uint32_t nflags;
810         int ttlmasters;
811         int ttlslaves;
812         int nmasters;
813         int nmasters_keymatch;
814         int nslaves;
815         int nquorum;
816         int umasters;   /* unknown masters (still in progress) */
817         int smpresent;
818         int i;
819
820         cluster->error = 0;
821         cluster->focus = NULL;
822
823         nflags = 0;
824         ttlmasters = 0;
825         ttlslaves = 0;
826         nmasters = 0;
827         nmasters_keymatch = 0;
828         umasters = 0;
829         nslaves = 0;
830
831         /*
832          * Calculate quorum
833          */
834         pmp = cluster->pmp;
835         KKASSERT(pmp != NULL || cluster->nchains == 0);
836         nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
837         smpresent = 0;
838
839         /*
840          * Pass 1
841          *
842          * NOTE: A NULL chain is not necessarily an error, it could be
843          *       e.g. a lookup failure or the end of an iteration.
844          *       Process normally.
845          */
846         for (i = 0; i < cluster->nchains; ++i) {
847                 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
848                 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
849
850                 chain = cluster->array[i].chain;
851                 if (chain && chain->error) {
852                         if (cluster->focus == NULL || cluster->focus == chain) {
853                                 /* error will be overridden by valid focus */
854                                 cluster->error = chain->error;
855                         }
856
857                         /*
858                          * Must count total masters and slaves whether the
859                          * chain is errored or not.
860                          */
861                         switch (cluster->pmp->pfs_types[i]) {
862                         case HAMMER2_PFSTYPE_MASTER:
863                                 ++ttlmasters;
864                                 break;
865                         case HAMMER2_PFSTYPE_SLAVE:
866                                 ++ttlslaves;
867                                 break;
868                         }
869                         continue;
870                 }
871                 switch (cluster->pmp->pfs_types[i]) {
872                 case HAMMER2_PFSTYPE_MASTER:
873                         ++ttlmasters;
874                         break;
875                 case HAMMER2_PFSTYPE_SLAVE:
876                         ++ttlslaves;
877                         break;
878                 case HAMMER2_PFSTYPE_SOFT_MASTER:
879                         nflags |= HAMMER2_CLUSTER_WRSOFT;
880                         nflags |= HAMMER2_CLUSTER_RDSOFT;
881                         smpresent = 1;
882                         break;
883                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
884                         nflags |= HAMMER2_CLUSTER_RDSOFT;
885                         break;
886                 case HAMMER2_PFSTYPE_SUPROOT:
887                         /*
888                          * Degenerate cluster representing the super-root
889                          * topology on a single device.  Fake stuff so
890                          * cluster ops work as expected.
891                          */
892                         nflags |= HAMMER2_CLUSTER_WRHARD;
893                         nflags |= HAMMER2_CLUSTER_RDHARD;
894                         cluster->focus_index = i;
895                         cluster->focus = chain;
896                         cluster->error = chain ? chain->error : 0;
897                         break;
898                 default:
899                         break;
900                 }
901         }
902
903         /*
904          * Pass 2
905          *
906          * Resolve nmasters             - master nodes fully match
907          *
908          * Resolve umasters             - master nodes operation still
909          *                                in progress
910          *
911          * Resolve nmasters_keymatch    - master nodes match the passed-in
912          *                                key and may or may not match
913          *                                the quorum-agreed tid.
914          * 
915          * The quorum-agreed TID is the highest matching TID.
916          */
917         last_best_quorum_tid = HAMMER2_TID_MAX;
918         quorum_tid = 0;         /* fix gcc warning */
919
920         while (nmasters < nquorum && last_best_quorum_tid != 0) {
921                 nmasters = 0;
922                 quorum_tid = 0;
923
924                 for (i = 0; i < cluster->nchains; ++i) {
925                         /* XXX SOFT smpresent handling */
926                         if (cluster->pmp->pfs_types[i] !=
927                             HAMMER2_PFSTYPE_MASTER) {
928                                 continue;
929                         }
930
931                         chain = cluster->array[i].chain;
932
933                         /*
934                          * Skip elements still in progress.  umasters keeps
935                          * track of masters that might still be in-progress.
936                          */
937                         if (chain == NULL && (cluster->array[i].flags &
938                                               HAMMER2_CITEM_NULL) == 0) {
939                                 ++umasters;
940                                 continue;
941                         }
942
943                         /*
944                          * Key match?
945                          */
946                         if (flags & HAMMER2_CHECK_NULL) {
947                                 if (chain == NULL) {
948                                         ++nmasters;
949                                         ++nmasters_keymatch;
950                                 }
951                         } else if (chain && chain->bref.key == key) {
952                                 ++nmasters_keymatch;
953                                 if (quorum_tid < last_best_quorum_tid &&
954                                     (quorum_tid < chain->bref.modify_tid ||
955                                      nmasters == 0)) {
956                                         /*
957                                          * Better TID located, reset
958                                          * nmasters count.
959                                          */
960                                         nmasters = 0;
961                                         quorum_tid = chain->bref.modify_tid;
962                                 }
963                                 if (quorum_tid == chain->bref.modify_tid) {
964                                         /*
965                                          * TID matches current collection.
966                                          */
967                                         ++nmasters;
968                                         if (chain->error == 0) {
969                                                 cluster->focus = chain;
970                                                 cluster->focus_index = i;
971                                         }
972                                 }
973                         }
974                 }
975                 if (nmasters >= nquorum)
976                         break;
977                 last_best_quorum_tid = quorum_tid;
978         }
979
980         /*
981         kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
982                 nmasters, nquorum, nmasters_keymatch, umasters);
983         */
984
985         /*
986          * Early return if we do not have enough masters.
987          */
988         if (nmasters < nquorum) {
989                 if (nmasters + umasters >= nquorum)
990                         return EINPROGRESS;
991                 if (nmasters_keymatch < nquorum) 
992                         return ESRCH;
993                 return EDEADLK;
994         }
995
996         /*
997          * Validated end of scan.
998          */
999         if (flags & HAMMER2_CHECK_NULL)
1000                 return ENOENT;
1001
1002         /*
1003          * If we have a NULL focus at this point the agreeing quorum all
1004          * had chain errors.
1005          */
1006         if (cluster->focus == NULL)
1007                 return EIO;
1008
1009         /*
1010          * Pass 3
1011          *
1012          * We have quorum agreement, validate elements, not end of scan.
1013          */
1014         for (i = 0; i < cluster->nchains; ++i) {
1015                 chain = cluster->array[i].chain;
1016                 if (chain == NULL ||
1017                     chain->bref.key != key ||
1018                     chain->bref.modify_tid != quorum_tid) {
1019                         continue;
1020                 }
1021
1022                 switch (cluster->pmp->pfs_types[i]) {
1023                 case HAMMER2_PFSTYPE_MASTER:
1024                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
1025                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1026                         nflags |= HAMMER2_CLUSTER_WRHARD;
1027                         nflags |= HAMMER2_CLUSTER_RDHARD;
1028                         break;
1029                 case HAMMER2_PFSTYPE_SLAVE:
1030                         /*
1031                          * We must have enough up-to-date masters to reach
1032                          * a quorum and the slave modify_tid must match the
1033                          * quorum's modify_tid.
1034                          *
1035                          * Do not select an errored slave.
1036                          */
1037                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1038                         nflags |= HAMMER2_CLUSTER_RDHARD;
1039                         ++nslaves;
1040                         break;
1041                 case HAMMER2_PFSTYPE_SOFT_MASTER:
1042                         /*
1043                          * Directly mounted soft master always wins.  There
1044                          * should be only one.
1045                          */
1046                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
1047                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1048                         break;
1049                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
1050                         /*
1051                          * Directly mounted soft slave always wins.  There
1052                          * should be only one.
1053                          *
1054                          * XXX
1055                          */
1056                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1057                         break;
1058                 case HAMMER2_PFSTYPE_SUPROOT:
1059                         /*
1060                          * spmp (degenerate case)
1061                          */
1062                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
1063                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1064                         break;
1065                 default:
1066                         break;
1067                 }
1068         }
1069
1070         /*
1071          * Focus now set, adjust ddflag.  Skip this pass if the focus
1072          * is bad or if we are at the PFS root (the bref won't match at
1073          * the PFS root, obviously).
1074          */
1075         focus = cluster->focus;
1076         if (focus) {
1077                 cluster->ddflag =
1078                         (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
1079         } else {
1080                 cluster->ddflag = 0;
1081                 goto skip4;
1082         }
1083         if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1084                 goto skip4;
1085
1086         /*
1087          * Pass 4
1088          *
1089          * Validate the elements that were not marked invalid.  They should
1090          * match.
1091          */
1092         for (i = 0; i < cluster->nchains; ++i) {
1093                 int ddflag;
1094
1095                 chain = cluster->array[i].chain;
1096
1097                 if (chain == NULL)
1098                         continue;
1099                 if (chain == focus)
1100                         continue;
1101                 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1102                         continue;
1103
1104                 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1105                 if (chain->bref.type != focus->bref.type ||
1106                     chain->bref.key != focus->bref.key ||
1107                     chain->bref.keybits != focus->bref.keybits ||
1108                     chain->bref.modify_tid != focus->bref.modify_tid ||
1109                     chain->bytes != focus->bytes ||
1110                     ddflag != cluster->ddflag) {
1111                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1112                         if (hammer2_debug & 1)
1113                         kprintf("cluster_resolve: matching modify_tid failed "
1114                                 "bref test: idx=%d type=%02x/%02x "
1115                                 "key=%016jx/%d-%016jx/%d "
1116                                 "mod=%016jx/%016jx bytes=%u/%u\n",
1117                                 i,
1118                                 chain->bref.type, focus->bref.type,
1119                                 chain->bref.key, chain->bref.keybits,
1120                                 focus->bref.key, focus->bref.keybits,
1121                                 chain->bref.modify_tid, focus->bref.modify_tid,
1122                                 chain->bytes, focus->bytes);
1123                         if (hammer2_debug & 0x4000)
1124                                 panic("cluster_resolve");
1125                         /* flag issue and force resync? */
1126                 }
1127         }
1128 skip4:
1129
1130         if (ttlslaves == 0)
1131                 nflags |= HAMMER2_CLUSTER_NOSOFT;
1132         if (ttlmasters == 0)
1133                 nflags |= HAMMER2_CLUSTER_NOHARD;
1134
1135         /*
1136          * Set SSYNCED or MSYNCED for slaves and masters respectively if
1137          * all available nodes (even if 0 are available) are fully
1138          * synchronized.  This is used by the synchronization thread to
1139          * determine if there is work it could potentially accomplish.
1140          */
1141         if (nslaves == ttlslaves)
1142                 nflags |= HAMMER2_CLUSTER_SSYNCED;
1143         if (nmasters == ttlmasters)
1144                 nflags |= HAMMER2_CLUSTER_MSYNCED;
1145
1146         /*
1147          * Determine if the cluster was successfully locked for the
1148          * requested operation and generate an error code.  The cluster
1149          * will not be locked (or ref'd) if an error is returned.
1150          *
1151          * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
1152          * to determine if reading or writing is possible.  If writing, the
1153          * cluster still requires a call to hammer2_cluster_modify() first.
1154          */
1155         atomic_set_int(&cluster->flags, nflags);
1156         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
1157
1158         return 0;
1159 }
1160
1161 /*
1162  * This is used by the sync thread to force non-NULL elements of a copy
1163  * of the pmp->iroot cluster to be good which is required to prime the
1164  * sync.
1165  */
1166 void
1167 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
1168 {
1169         int i;
1170
1171         for (i = 0; i < cluster->nchains; ++i) {
1172                 if (cluster->array[i].chain)
1173                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1174         }
1175 }
1176
1177 /*
1178  * Copy a cluster, returned a ref'd cluster.  All underlying chains
1179  * are also ref'd, but not locked.  Focus state is also copied.
1180  *
1181  * Original cluster does not have to be locked but usually is.
1182  * New cluster will not be flagged as locked.
1183  *
1184  * Callers using this function to initialize a new cluster from an inode
1185  * generally lock and resolve the resulting cluster.
1186  *
1187  * Callers which use this function to save/restore a cluster structure
1188  * generally retain the focus state and do not re-resolve it.  Caller should
1189  * not try to re-resolve internal (cparent) node state during an iteration
1190  * as the individual tracking elements of cparent in an iteration may not
1191  * match even though they are correct.
1192  */
1193 hammer2_cluster_t *
1194 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
1195 {
1196         hammer2_pfs_t *pmp = ocluster->pmp;
1197         hammer2_cluster_t *ncluster;
1198         hammer2_chain_t *chain;
1199         int i;
1200
1201         ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
1202         ncluster->pmp = pmp;
1203         ncluster->nchains = ocluster->nchains;
1204         ncluster->refs = 1;
1205
1206         for (i = 0; i < ocluster->nchains; ++i) {
1207                 chain = ocluster->array[i].chain;
1208                 ncluster->array[i].chain = chain;
1209                 ncluster->array[i].flags = ocluster->array[i].flags;
1210                 if (chain)
1211                         hammer2_chain_ref(chain);
1212         }
1213         ncluster->focus_index = ocluster->focus_index;
1214         ncluster->focus = ocluster->focus;
1215         ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED |
1216                                               HAMMER2_CLUSTER_INODE);
1217
1218         return (ncluster);
1219 }
1220
1221 /*
1222  * Unlock a cluster.  Refcount and focus is maintained.
1223  */
1224 void
1225 hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx)
1226 {
1227         hammer2_chain_t *chain;
1228         int i;
1229
1230         if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
1231                 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1232                         cluster);
1233         }
1234         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
1235         KKASSERT(cluster->refs > 0);
1236         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
1237
1238         for (i = 0; i < cluster->nchains; ++i) {
1239                 if (i == idx)
1240                         continue;
1241                 chain = cluster->array[i].chain;
1242                 if (chain)
1243                         hammer2_chain_unlock(chain);
1244         }
1245 }
1246
1247 void
1248 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
1249 {
1250         hammer2_cluster_unlock_except(cluster, -1);
1251 }
1252
1253 /*
1254  * Resize the cluster's physical storage allocation in-place.  This may
1255  * replace the cluster's chains.
1256  */
1257 void
1258 hammer2_cluster_resize(hammer2_inode_t *ip,
1259                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1260                        int nradix, int flags)
1261 {
1262         hammer2_chain_t *chain;
1263         int i;
1264
1265         KKASSERT(cparent->pmp == cluster->pmp);         /* can be NULL */
1266         KKASSERT(cparent->nchains == cluster->nchains);
1267
1268         for (i = 0; i < cluster->nchains; ++i) {
1269                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1270                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1271                         continue;
1272                 }
1273                 chain = cluster->array[i].chain;
1274                 if (chain) {
1275                         KKASSERT(cparent->array[i].chain);
1276                         hammer2_chain_resize(ip,
1277                                              cparent->array[i].chain, chain,
1278                                              nradix, flags);
1279                 }
1280         }
1281 }
1282
1283 /*
1284  * Set an inode's cluster modified, marking the related chains RW and
1285  * duplicating them if necessary.
1286  *
1287  * The passed-in chain is a localized copy of the chain previously acquired
1288  * when the inode was locked (and possilby replaced in the mean time), and
1289  * must also be updated.  In fact, we update it first and then synchronize
1290  * the inode's cluster cache.
1291  */
1292 hammer2_inode_data_t *
1293 hammer2_cluster_modify_ip(hammer2_inode_t *ip,
1294                           hammer2_cluster_t *cluster, int flags)
1295 {
1296         hammer2_inode_modify(ip);
1297         hammer2_cluster_modify(cluster, flags);
1298         hammer2_inode_repoint(ip, NULL, cluster);
1299         return (&hammer2_cluster_wdata(cluster)->ipdata);
1300 }
1301
1302 /*
1303  * Adjust the cluster's chains to allow modification and adjust the
1304  * focus.  Data will be accessible on return.
1305  *
1306  * If our focused master errors on modify, re-resolve the cluster to
1307  * try to select a different master.
1308  */
1309 void
1310 hammer2_cluster_modify(hammer2_cluster_t *cluster, int flags)
1311 {
1312         hammer2_chain_t *chain;
1313         int resolve_again;
1314         int i;
1315
1316         resolve_again = 0;
1317         for (i = 0; i < cluster->nchains; ++i) {
1318                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1319                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1320                         continue;
1321                 }
1322                 chain = cluster->array[i].chain;
1323                 if (chain == NULL)
1324                         continue;
1325                 if (chain->error)
1326                         continue;
1327                 hammer2_chain_modify(chain, flags);
1328                 if (cluster->focus == chain && chain->error) {
1329                         cluster->error = chain->error;
1330                         resolve_again = 1;
1331                 }
1332         }
1333         if (resolve_again)
1334                 hammer2_cluster_resolve(cluster);
1335 }
1336
1337 /*
1338  * Synchronize modifications from the focus to other chains in a cluster.
1339  * Convenient because nominal API users can just modify the contents of the
1340  * focus (at least for non-blockref data).
1341  *
1342  * Nominal front-end operations only edit non-block-table data in a single
1343  * chain.  This code copies such modifications to the other chains in the
1344  * cluster.  Blocktable modifications are handled on a chain-by-chain basis
1345  * by both the frontend and the backend and will explode in fireworks if
1346  * blindly copied.
1347  */
1348 void
1349 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
1350 {
1351         hammer2_chain_t *focus;
1352         hammer2_chain_t *scan;
1353         const hammer2_inode_data_t *ripdata;
1354         hammer2_inode_data_t *wipdata;
1355         int i;
1356
1357         focus = cluster->focus;
1358         KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
1359
1360         for (i = 0; i < cluster->nchains; ++i) {
1361                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1362                         continue;
1363                 scan = cluster->array[i].chain;
1364                 if (scan == NULL || scan == focus)
1365                         continue;
1366                 if (scan->error)
1367                         continue;
1368                 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
1369                 KKASSERT(focus->bytes == scan->bytes &&
1370                          focus->bref.type == scan->bref.type);
1371                 switch(focus->bref.type) {
1372                 case HAMMER2_BREF_TYPE_INODE:
1373                         ripdata = &focus->data->ipdata;
1374                         wipdata = &scan->data->ipdata;
1375                         if ((ripdata->meta.op_flags &
1376                             HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1377                                 bcopy(ripdata, wipdata,
1378                                       offsetof(hammer2_inode_data_t, u));
1379                                 break;
1380                         }
1381                         /* fall through to full copy */
1382                 case HAMMER2_BREF_TYPE_DATA:
1383                         bcopy(focus->data, scan->data, focus->bytes);
1384                         break;
1385                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1386                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1387                 case HAMMER2_BREF_TYPE_FREEMAP:
1388                 case HAMMER2_BREF_TYPE_VOLUME:
1389                         panic("hammer2_cluster_modsync: illegal node type");
1390                         /* NOT REACHED */
1391                         break;
1392                 default:
1393                         panic("hammer2_cluster_modsync: unknown node type");
1394                         break;
1395                 }
1396         }
1397 }
1398
1399 /*
1400  * Lookup initialization/completion API.  Returns a locked, fully resolved
1401  * cluster with one ref.
1402  */
1403 hammer2_cluster_t *
1404 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
1405 {
1406         hammer2_cluster_t *cluster;
1407
1408         cluster = hammer2_cluster_copy(cparent);
1409         if (flags & HAMMER2_LOOKUP_SHARED) {
1410                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
1411                                               HAMMER2_RESOLVE_SHARED);
1412         } else {
1413                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1414         }
1415         hammer2_cluster_resolve(cluster);
1416
1417         return (cluster);
1418 }
1419
1420 void
1421 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
1422 {
1423         if (cparent) {
1424                 hammer2_cluster_unlock(cparent);
1425                 hammer2_cluster_drop(cparent);
1426         }
1427 }
1428
1429 /*
1430  * Locate first match or overlap under parent, return a new, locked, resolved
1431  * cluster with one ref.
1432  *
1433  * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1434  */
1435 hammer2_cluster_t *
1436 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
1437                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1438 {
1439         hammer2_pfs_t *pmp;
1440         hammer2_cluster_t *cluster;
1441         hammer2_chain_t *chain;
1442         hammer2_key_t key_accum;
1443         hammer2_key_t key_next;
1444         int null_count;
1445         int rflags;
1446         int i;
1447
1448         KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1449
1450         pmp = cparent->pmp;                             /* can be NULL */
1451         key_accum = *key_nextp;
1452         null_count = 0;
1453         if (flags & HAMMER2_LOOKUP_SHARED)
1454                 rflags = HAMMER2_RESOLVE_SHARED;
1455         else
1456                 rflags = 0;
1457
1458         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
1459         cluster->pmp = pmp;                             /* can be NULL */
1460         cluster->refs = 1;
1461         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1462                 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1463
1464         /*
1465          * Iterating earlier cluster elements with later elements still
1466          * locked is a problem, so we have to unlock the parent and then
1467          * re-lock as we go.
1468          */
1469         hammer2_cluster_unlock(cparent);
1470         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1471
1472         /*
1473          * Pass-1, issue lookups.
1474          */
1475         for (i = 0; i < cparent->nchains; ++i) {
1476                 cluster->array[i].flags = cparent->array[i].flags;
1477                 key_next = *key_nextp;
1478
1479                 /*
1480                  * Always relock the parent as we go.
1481                  */
1482                 if (cparent->array[i].chain) {
1483                         hammer2_chain_lock(cparent->array[i].chain, rflags);
1484                 }
1485
1486                 /*
1487                  * Nothing to base the lookup, or parent was not synchronized.
1488                  */
1489                 if (cparent->array[i].chain == NULL ||
1490                     (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
1491                         ++null_count;
1492                         continue;
1493                 }
1494
1495                 chain = hammer2_chain_lookup(&cparent->array[i].chain,
1496                                              &key_next,
1497                                              key_beg, key_end,
1498                                              &cparent->array[i].cache_index,
1499                                              flags);
1500                 cluster->array[i].chain = chain;
1501                 if (chain == NULL) {
1502                         ++null_count;
1503                 }
1504                 if (key_accum > key_next)
1505                         key_accum = key_next;
1506         }
1507
1508         /*
1509          * Cleanup
1510          */
1511         cluster->nchains = i;
1512         *key_nextp = key_accum;
1513
1514         /*
1515          * The cluster must be resolved, out of sync elements may be present.
1516          *
1517          * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1518          */
1519         if (null_count != i)
1520                 hammer2_cluster_resolve(cluster);
1521         if (null_count == i ||
1522             (cluster->focus == NULL &&
1523              (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1524                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1525                         hammer2_cluster_unlock(cluster);
1526                 hammer2_cluster_drop(cluster);
1527                 cluster = NULL;
1528         }
1529
1530         return (cluster);
1531 }
1532
1533 /*
1534  * Locate next match or overlap under parent, replace the passed-in cluster.
1535  * The returned cluster is a new, locked, resolved cluster with one ref.
1536  *
1537  * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1538  */
1539 hammer2_cluster_t *
1540 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1541                      hammer2_key_t *key_nextp,
1542                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1543 {
1544         hammer2_chain_t *ochain;
1545         hammer2_chain_t *nchain;
1546         hammer2_key_t key_accum;
1547         hammer2_key_t key_next;
1548         int parent_index;
1549         int cluster_index;
1550         int null_count;
1551         int rflags;
1552         int i;
1553
1554         KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1555
1556         key_accum = *key_nextp;
1557         null_count = 0;
1558         parent_index = cparent->focus_index;    /* save prior focus */
1559         cluster_index = cluster->focus_index;
1560         if (flags & HAMMER2_LOOKUP_SHARED)
1561                 rflags = HAMMER2_RESOLVE_SHARED;
1562         else
1563                 rflags = 0;
1564
1565         cluster->focus = NULL;          /* XXX needed any more? */
1566         /*cparent->focus = NULL;*/
1567         cluster->focus_index = 0;       /* XXX needed any more? */
1568         /*cparent->focus_index = 0;*/
1569
1570         cluster->ddflag = 0;
1571
1572         /*
1573          * The parent is always locked on entry, the iterator may be locked
1574          * depending on flags.
1575          *
1576          * We must temporarily unlock the passed-in clusters to avoid a
1577          * deadlock between elements of the cluster with other threads.
1578          * We will fixup the lock in the loop.
1579          *
1580          * Note that this will clear the focus.
1581          *
1582          * Reflag the clusters as locked, because we will relock them
1583          * as we go.
1584          */
1585         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1586                 hammer2_cluster_unlock(cluster);
1587                 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1588         }
1589         hammer2_cluster_unlock(cparent);
1590         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1591
1592         for (i = 0; i < cparent->nchains; ++i) {
1593                 key_next = *key_nextp;
1594                 ochain = cluster->array[i].chain;
1595
1596                 /*
1597                  * Always relock the parent as we go.
1598                  */
1599                 if (cparent->array[i].chain)
1600                         hammer2_chain_lock(cparent->array[i].chain, rflags);
1601
1602                 /*
1603                  * Nothing to iterate from.  These cases can occur under
1604                  * normal operations.  For example, during synchronization
1605                  * a slave might reach the end of its scan while records
1606                  * are still left on the master(s).
1607                  */
1608                 if (ochain == NULL) {
1609                         ++null_count;
1610                         continue;
1611                 }
1612                 if (cparent->array[i].chain == NULL ||
1613                     (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1614                     (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
1615                         /* ochain has not yet been relocked */
1616                         hammer2_chain_drop(ochain);
1617                         cluster->array[i].chain = NULL;
1618                         ++null_count;
1619                         continue;
1620                 }
1621
1622                 /*
1623                  * Relock the child if necessary.  Parent and child will then
1624                  * be locked as expected by hammer2_chain_next() and flags.
1625                  */
1626                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1627                         hammer2_chain_lock(ochain, rflags);
1628                 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1629                                             &key_next, key_beg, key_end,
1630                                             &cparent->array[i].cache_index,
1631                                             flags);
1632                 /* ochain now invalid but can still be used for focus check */
1633                 if (parent_index == i) {
1634                         cparent->focus_index = i;
1635                         cparent->focus = cparent->array[i].chain;
1636                 }
1637
1638                 cluster->array[i].chain = nchain;
1639                 if (nchain == NULL) {
1640                         ++null_count;
1641                 }
1642                 if (key_accum > key_next)
1643                         key_accum = key_next;
1644         }
1645
1646         /*
1647          * Cleanup
1648          */
1649         cluster->nchains = i;
1650         *key_nextp = key_accum;
1651
1652         /*
1653          * The cluster must be resolved, out of sync elements may be present.
1654          *
1655          * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1656          */
1657         if (null_count != i)
1658                 hammer2_cluster_resolve(cluster);
1659         if (null_count == i ||
1660             (cluster->focus == NULL &&
1661              (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1662                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1663                         hammer2_cluster_unlock(cluster);
1664                 hammer2_cluster_drop(cluster);
1665                 cluster = NULL;
1666         }
1667         return(cluster);
1668 }
1669
1670 /*
1671  * Advance just one chain in the cluster and recalculate the invalid bit.
1672  * The cluster index is allowed to be flagged invalid on input and is
1673  * recalculated on return.
1674  *
1675  * (used during synchronization to advance past a chain being deleted).
1676  *
1677  * The chain being advanced must not be the focus and the clusters in
1678  * question must have already passed normal cluster_lookup/cluster_next
1679  * checks.
1680  *
1681  * The cluster always remains intact on return, so void function.
1682  */
1683 void
1684 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1685                                   hammer2_cluster_t *cluster,
1686                                   hammer2_key_t *key_nextp,
1687                                   hammer2_key_t key_beg,
1688                                   hammer2_key_t key_end,
1689                                   int i, int flags)
1690 {
1691         hammer2_chain_t *ochain;
1692         hammer2_chain_t *nchain;
1693         hammer2_chain_t *focus;
1694         hammer2_key_t key_accum;
1695         hammer2_key_t key_next;
1696         int ddflag;
1697
1698         key_accum = *key_nextp;
1699         key_next = *key_nextp;
1700         ochain = cluster->array[i].chain;
1701         if (ochain == NULL)
1702                 goto done;
1703         KKASSERT(ochain != cluster->focus);
1704
1705         nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1706                                     &key_next, key_beg, key_end,
1707                                     &cparent->array[i].cache_index,
1708                                     flags);
1709         /* ochain now invalid */
1710         if (cparent->focus_index == i)
1711                 cparent->focus = cparent->array[i].chain;
1712
1713         /*
1714          * Install nchain.  Note that nchain can be NULL, and can also
1715          * be in an unlocked state depending on flags.
1716          */
1717         cluster->array[i].chain = nchain;
1718         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1719
1720         if (key_accum > key_next)
1721                 key_accum = key_next;
1722
1723         focus = cluster->focus;
1724         if (focus == NULL)
1725                 goto done;
1726         if (nchain == NULL)
1727                 goto done;
1728 #if 0
1729         if (nchain == focus)    /* ASSERTED NOT TRUE */
1730                 ...
1731 #endif
1732         ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1733         if (nchain->bref.type != focus->bref.type ||
1734             nchain->bref.key != focus->bref.key ||
1735             nchain->bref.keybits != focus->bref.keybits ||
1736             nchain->bref.modify_tid != focus->bref.modify_tid ||
1737             nchain->bytes != focus->bytes ||
1738             ddflag != cluster->ddflag) {
1739                 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1740         }
1741
1742 done:
1743         *key_nextp = key_accum;
1744 #if 0
1745         /*
1746          * For now don't re-resolve cluster->flags.
1747          */
1748         hammer2_cluster_resolve(cluster);
1749 #endif
1750 }
1751
1752 /*
1753  * Create a new cluster using the specified key
1754  */
1755 int
1756 hammer2_cluster_create(hammer2_pfs_t *pmp, hammer2_cluster_t *cparent,
1757                      hammer2_cluster_t **clusterp,
1758                      hammer2_key_t key, int keybits,
1759                      int type, size_t bytes, int flags)
1760 {
1761         hammer2_cluster_t *cluster;
1762         int error;
1763         int i;
1764
1765         if ((cluster = *clusterp) == NULL) {
1766                 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1767                                   M_WAITOK | M_ZERO);
1768                 cluster->pmp = pmp;                     /* can be NULL */
1769                 cluster->refs = 1;
1770                 cluster->flags = HAMMER2_CLUSTER_LOCKED;
1771         }
1772         cluster->focus_index = 0;
1773         cluster->focus = NULL;
1774
1775         /*
1776          * NOTE: cluster->array[] entries can initially be NULL.  If
1777          *       *clusterp is supplied, skip NULL entries, otherwise
1778          *       create new chains.
1779          */
1780         for (i = 0; i < cparent->nchains; ++i) {
1781                 if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1782                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1783                         continue;
1784                 }
1785                 if (*clusterp) {
1786                         if ((cluster->array[i].flags &
1787                              HAMMER2_CITEM_FEMOD) == 0) {
1788                                 cluster->array[i].flags |=
1789                                                 HAMMER2_CITEM_INVALID;
1790                                 continue;
1791                         }
1792                         if (cluster->array[i].chain == NULL)
1793                                 continue;
1794                 }
1795                 error = hammer2_chain_create(&cparent->array[i].chain,
1796                                              &cluster->array[i].chain, pmp,
1797                                              key, keybits,
1798                                              type, bytes, flags);
1799                 if (cparent->focus_index == i)
1800                         cparent->focus = cparent->array[i].chain;
1801                 KKASSERT(error == 0);
1802                 if (cluster->focus == NULL) {
1803                         cluster->focus_index = i;
1804                         cluster->focus = cluster->array[i].chain;
1805                 }
1806                 if (cparent->focus == cparent->array[i].chain) {
1807                         cluster->focus_index = i;
1808                         cluster->focus = cluster->array[i].chain;
1809                 }
1810         }
1811         cluster->nchains = i;
1812         *clusterp = cluster;
1813         hammer2_cluster_resolve(cluster);
1814
1815         return error;
1816 }
1817
1818 /*
1819  * Rename a cluster to a new parent.
1820  *
1821  * WARNING! Any passed-in bref is probaly from hammer2_cluster_bref(),
1822  *          So the data_off field is not relevant.  Only the key and
1823  *          keybits are used.
1824  */
1825 void
1826 hammer2_cluster_rename(hammer2_blockref_t *bref,
1827                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1828                        int flags)
1829 {
1830         hammer2_chain_t *chain;
1831         hammer2_blockref_t xbref;
1832         int i;
1833
1834 #if 0
1835         cluster->focus = NULL;
1836         cparent->focus = NULL;
1837         cluster->focus_index = 0;
1838         cparent->focus_index = 0;
1839 #endif
1840
1841         for (i = 0; i < cluster->nchains; ++i) {
1842                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1843                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1844                         continue;
1845                 }
1846                 chain = cluster->array[i].chain;
1847                 if (chain) {
1848                         if (bref) {
1849                                 xbref = chain->bref;
1850                                 xbref.key = bref->key;
1851                                 xbref.keybits = bref->keybits;
1852                                 hammer2_chain_rename(&xbref,
1853                                                      &cparent->array[i].chain,
1854                                                      chain, flags);
1855                         } else {
1856                                 hammer2_chain_rename(NULL,
1857                                                      &cparent->array[i].chain,
1858                                                      chain, flags);
1859                         }
1860                         if (cparent->focus_index == i)
1861                                 cparent->focus = cparent->array[i].chain;
1862                         KKASSERT(cluster->array[i].chain == chain); /*remove*/
1863                 }
1864         }
1865 }
1866
1867 /*
1868  * Mark a cluster deleted
1869  */
1870 void
1871 hammer2_cluster_delete(hammer2_cluster_t *cparent,
1872                        hammer2_cluster_t *cluster, int flags)
1873 {
1874         hammer2_chain_t *chain;
1875         hammer2_chain_t *parent;
1876         int i;
1877
1878         if (cparent == NULL) {
1879                 kprintf("cparent is NULL\n");
1880                 return;
1881         }
1882
1883         for (i = 0; i < cluster->nchains; ++i) {
1884                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1885                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1886                         continue;
1887                 }
1888                 parent = cparent->array[i].chain;
1889                 chain = cluster->array[i].chain;
1890                 if (chain == NULL)
1891                         continue;
1892                 if (chain->parent != parent) {
1893                         kprintf("hammer2_cluster_delete: parent "
1894                                 "mismatch chain=%p parent=%p against=%p\n",
1895                                 chain, chain->parent, parent);
1896                 } else {
1897                         hammer2_chain_delete(parent, chain, flags);
1898                 }
1899         }
1900 }
1901
1902 /*
1903  * Create a snapshot of the specified {parent, ochain} with the specified
1904  * label.  The originating hammer2_inode must be exclusively locked for
1905  * safety.
1906  *
1907  * The ioctl code has already synced the filesystem.
1908  */
1909 int
1910 hammer2_cluster_snapshot(hammer2_cluster_t *ocluster,
1911                        hammer2_ioc_pfs_t *pmp)
1912 {
1913         hammer2_dev_t *hmp;
1914         const hammer2_inode_data_t *ripdata;
1915         hammer2_inode_data_t *wipdata;
1916         hammer2_chain_t *nchain;
1917         hammer2_inode_t *nip;
1918         size_t name_len;
1919         hammer2_key_t lhc;
1920         struct vattr vat;
1921 #if 0
1922         uuid_t opfs_clid;
1923 #endif
1924         int error;
1925
1926         kprintf("snapshot %s\n", pmp->name);
1927
1928         name_len = strlen(pmp->name);
1929         lhc = hammer2_dirhash(pmp->name, name_len);
1930
1931         /*
1932          * Get the clid
1933          */
1934         ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1935 #if 0
1936         opfs_clid = ripdata->meta.pfs_clid;
1937 #endif
1938         hmp = ocluster->focus->hmp;     /* XXX find synchronized local disk */
1939
1940         /*
1941          * Create the snapshot directory under the super-root
1942          *
1943          * Set PFS type, generate a unique filesystem id, and generate
1944          * a cluster id.  Use the same clid when snapshotting a PFS root,
1945          * which theoretically allows the snapshot to be used as part of
1946          * the same cluster (perhaps as a cache).
1947          *
1948          * Copy the (flushed) blockref array.  Theoretically we could use
1949          * chain_duplicate() but it becomes difficult to disentangle
1950          * the shared core so for now just brute-force it.
1951          */
1952         VATTR_NULL(&vat);
1953         vat.va_type = VDIR;
1954         vat.va_mode = 0755;
1955         nip = hammer2_inode_create(hmp->spmp->iroot, &vat, proc0.p_ucred,
1956                                    pmp->name, name_len,
1957                                    1, 0, 0,
1958                                    HAMMER2_INSERT_PFSROOT, &error);
1959
1960         if (nip) {
1961                 hammer2_inode_modify(nip);
1962                 nchain = hammer2_inode_chain(nip, 0, HAMMER2_RESOLVE_ALWAYS);
1963                 hammer2_chain_modify(nchain, 0);
1964                 wipdata = &nchain->data->ipdata;
1965
1966                 nip->meta.pfs_type = HAMMER2_PFSTYPE_MASTER;
1967                 nip->meta.pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1968                 nip->meta.op_flags |= HAMMER2_OPFLAG_PFSROOT;
1969                 kern_uuidgen(&nip->meta.pfs_fsid, 1);
1970
1971                 /*
1972                  * Give the snapshot its own private cluster id.  As a
1973                  * snapshot no further synchronization with the original
1974                  * cluster will be done.
1975                  */
1976 #if 0
1977                 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1978                         nip->meta.pfs_clid = opfs_clid;
1979                 else
1980                         kern_uuidgen(&nip->meta.pfs_clid, 1);
1981 #endif
1982                 kern_uuidgen(&nip->meta.pfs_clid, 1);
1983                 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1984
1985                 /* XXX hack blockset copy */
1986                 /* XXX doesn't work with real cluster */
1987                 KKASSERT(ocluster->nchains == 1);
1988                 wipdata->meta = nip->meta;
1989                 wipdata->u.blockset = ripdata->u.blockset;
1990                 hammer2_flush(nchain, 1);
1991                 hammer2_chain_unlock(nchain);
1992                 hammer2_chain_drop(nchain);
1993                 hammer2_inode_unlock(nip, NULL);
1994         }
1995         return (error);
1996 }
1997
1998 /*
1999  * Return locked parent cluster given a locked child.  The child remains
2000  * locked on return.  The new parent's focus follows the child's focus
2001  * and the parent is always resolved.
2002  *
2003  * We must temporarily unlock the passed-in cluster to avoid a deadlock
2004  * between elements of the cluster.
2005  *
2006  * We must not try to hammer2_cluster_resolve() cparent.  The individual
2007  * parent chains for the nodes are the correct parents for the cluster but
2008  * do not necessarily match, so resolve would likely implode.
2009  */
2010 hammer2_cluster_t *
2011 hammer2_cluster_parent(hammer2_cluster_t *cluster)
2012 {
2013         hammer2_cluster_t *cparent;
2014         int i;
2015
2016         cparent = hammer2_cluster_copy(cluster);
2017         hammer2_cluster_unlock(cluster);
2018
2019         for (i = 0; i < cparent->nchains; ++i) {
2020                 hammer2_chain_t *chain;
2021                 hammer2_chain_t *rchain;
2022
2023                 /*
2024                  * Calculate parent for each element.  Old chain has an extra
2025                  * ref for cparent but the lock remains with cluster.
2026                  */
2027                 chain = cparent->array[i].chain;
2028                 if (chain == NULL)
2029                         continue;
2030                 while ((rchain = chain->parent) != NULL) {
2031                         hammer2_chain_ref(rchain);
2032                         hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
2033                         if (chain->parent == rchain)
2034                                 break;
2035                         hammer2_chain_unlock(rchain);
2036                         hammer2_chain_drop(rchain);
2037                 }
2038                 cparent->array[i].chain = rchain;
2039                 hammer2_chain_drop(chain);
2040         }
2041         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
2042         /* hammer2_cluster_resolve(cparent); */
2043         hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
2044
2045         return cparent;
2046 }
2047
2048 /************************************************************************
2049  *                              CLUSTER I/O                             *
2050  ************************************************************************
2051  *
2052  *
2053  * WARNING! blockref[] array data is not universal.  These functions should
2054  *          only be used to access universal data.
2055  *
2056  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
2057  *          complete if necessary.  The I/O's should have already been
2058  *          initiated by the cluster_lock/chain_lock operation.
2059  *
2060  *          The cluster must already be in a modified state before wdata
2061  *          is called.  The data will already be available for this case.
2062  */
2063 const hammer2_media_data_t *
2064 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
2065 {
2066         KKASSERT(cluster->focus != NULL);
2067         return(cluster->focus->data);
2068 }
2069
2070 const hammer2_media_data_t *
2071 hammer2_cluster_rdata_bytes(hammer2_cluster_t *cluster, size_t *bytesp)
2072 {
2073         KKASSERT(cluster->focus != NULL);
2074         *bytesp = cluster->focus->bytes;
2075         return(cluster->focus->data);
2076 }
2077
2078 hammer2_media_data_t *
2079 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
2080 {
2081         KKASSERT(cluster->focus != NULL);
2082         KKASSERT(hammer2_cluster_modified(cluster));
2083         return(cluster->focus->data);
2084 }
2085
2086 /*
2087  * Load cluster data asynchronously with callback.
2088  *
2089  * The callback is made for the first validated data found, or NULL
2090  * if no valid data is available.
2091  *
2092  * NOTE! The cluster structure is either unique or serialized (e.g. embedded
2093  *       in the inode with an exclusive lock held), the chain structure may be
2094  *       shared.
2095  */
2096 void
2097 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
2098                            void (*callback)(hammer2_iocb_t *iocb), void *ptr)
2099 {
2100         hammer2_chain_t *chain;
2101         hammer2_iocb_t *iocb;
2102         hammer2_dev_t *hmp;
2103         hammer2_blockref_t *bref;
2104         int i;
2105
2106         i = cluster->focus_index;
2107         chain = cluster->focus;
2108
2109         iocb = &cluster->iocb;
2110         iocb->callback = callback;
2111         iocb->dio = NULL;               /* for already-validated case */
2112         iocb->cluster = cluster;
2113         iocb->chain = chain;
2114         iocb->ptr = ptr;
2115         iocb->lbase = (off_t)i;
2116         iocb->flags = 0;
2117         iocb->error = 0;
2118
2119         /*
2120          * Data already validated
2121          */
2122         if (chain->data) {
2123                 callback(iocb);
2124                 return;
2125         }
2126
2127         /*
2128          * We must resolve to a device buffer, either by issuing I/O or
2129          * by creating a zero-fill element.  We do not mark the buffer
2130          * dirty when creating a zero-fill element (the hammer2_chain_modify()
2131          * API must still be used to do that).
2132          *
2133          * The device buffer is variable-sized in powers of 2 down
2134          * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
2135          * chunk always contains buffers of the same size. (XXX)
2136          *
2137          * The minimum physical IO size may be larger than the variable
2138          * block size.
2139          *
2140          * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
2141          *            matches hammer2_devblksize()?  Or does the freemap's
2142          *            pre-zeroing handle the case for us?
2143          */
2144         bref = &chain->bref;
2145         hmp = chain->hmp;
2146
2147 #if 0
2148         /* handled by callback? <- TODO XXX even needed for loads? */
2149         /*
2150          * The getblk() optimization for a 100% overwrite can only be used
2151          * if the physical block size matches the request.
2152          */
2153         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
2154             chain->bytes == hammer2_devblksize(chain->bytes)) {
2155                 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
2156                 KKASSERT(error == 0);
2157                 iocb->dio = dio;
2158                 callback(iocb);
2159                 return;
2160         }
2161 #endif
2162
2163         /*
2164          * Otherwise issue a read
2165          */
2166         hammer2_adjreadcounter(&chain->bref, chain->bytes);
2167         hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
2168 }