Merge branch 'vendor/OPENSSL'
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *          locks and I/O, do quorum and/or master-slave processing, and
50  *          it must operate properly even if some nodes are broken (which
51  *          can also mean indefinite locks).
52  *
53  *                              CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *              hammer2_cluster_lock()
59  *              hammer2_cluster_parent()
60  *
61  *      - Most complex functions, quorum management on transaction ids.
62  *
63  *      - Locking and data accesses must be internally asynchronous.
64  *
65  *      - Validate and manage cache coherency primitives (cache state
66  *        is stored in chain topologies but must be validated by these
67  *        functions).
68  *
69  * (2) Lookups and Scans
70  *              hammer2_cluster_lookup()
71  *              hammer2_cluster_next()
72  *
73  *      - Depend on locking & data retrieval functions, but still complex.
74  *
75  *      - Must do quorum management on transaction ids.
76  *
77  *      - Lookup and Iteration ops Must be internally asynchronous.
78  *
79  * (3) Modifying Operations
80  *              hammer2_cluster_create()
81  *              hammer2_cluster_rename()
82  *              hammer2_cluster_delete()
83  *              hammer2_cluster_modify()
84  *              hammer2_cluster_modsync()
85  *
86  *      - Can usually punt on failures, operation continues unless quorum
87  *        is lost.  If quorum is lost, must wait for resynchronization
88  *        (depending on the management mode).
89  *
90  *      - Must disconnect node on failures (also not flush), remount, and
91  *        resynchronize.
92  *
93  *      - Network links (via kdmsg) are relatively easy to issue as the
94  *        complex underworkings of hammer2_chain.c don't have to messed
95  *        with (the protocol is at a higher level than block-level).
96  *
97  *      - Multiple local disk nodes (i.e. block devices) are another matter.
98  *        Chain operations have to be dispatched to per-node threads (xN)
99  *        because we can't asynchronize potentially very complex chain
100  *        operations in hammer2_chain.c (it would be a huge mess).
101  *
102  *        (these threads are also used to terminate incoming kdmsg ops from
103  *        other machines).
104  *
105  *      - Single-node filesystems do not use threads and will simply call
106  *        hammer2_chain.c functions directly.  This short-cut is handled
107  *        at the base of each cluster function.
108  */
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
115
116 #include "hammer2.h"
117
118 /*
119  * Returns non-zero if any chain in the cluster needs to be resized.
120  * Errored elements are not used in the calculation.
121  */
122 int
123 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
124 {
125         hammer2_chain_t *chain;
126         int i;
127
128         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
129         for (i = 0; i < cluster->nchains; ++i) {
130                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
131                         continue;
132                 chain = cluster->array[i].chain;
133                 if (chain == NULL)
134                         continue;
135                 if (chain->error)
136                         continue;
137                 if (chain->bytes != bytes)
138                         return 1;
139         }
140         return 0;
141 }
142
143 /*
144  * Returns the bref type of the cluster's foucs.
145  *
146  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
147  * The cluster must be locked.
148  */
149 uint8_t
150 hammer2_cluster_type(hammer2_cluster_t *cluster)
151 {
152         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
153         if (cluster->error == 0)
154                 return(cluster->focus->bref.type);
155         return 0;
156 }
157
158 /*
159  * Returns non-zero if the cluster's focus is flagged as being modified.
160  *
161  * If the cluster is errored, returns 0.
162  */
163 int
164 hammer2_cluster_modified(hammer2_cluster_t *cluster)
165 {
166         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
167         if (cluster->error == 0)
168                 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
169         return 0;
170 }
171
172 /*
173  * Returns the bref of the cluster's focus, sans any data-offset information
174  * (since offset information is per-node and wouldn't be useful).
175  *
176  * Callers use this function to access modify_tid, mirror_tid, type,
177  * key, and keybits.
178  *
179  * If the cluster is errored, returns an empty bref.
180  * The cluster must be locked.
181  */
182 void
183 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
184 {
185         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
186         if (cluster->error == 0) {
187                 *bref = cluster->focus->bref;
188                 bref->data_off = 0;
189         } else {
190                 bzero(bref, sizeof(*bref));
191         }
192 }
193
194 /*
195  * Return non-zero if the chain representing an inode has been flagged
196  * as having been unlinked.  Allows the vnode reclaim to avoid loading
197  * the inode data from disk e.g. when unmount or recycling old, clean
198  * vnodes.
199  *
200  * The cluster does not need to be locked.
201  * The focus cannot be used since the cluster might not be locked.
202  */
203 int
204 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster)
205 {
206         hammer2_chain_t *chain;
207         int flags;
208         int i;
209
210         flags = 0;
211         for (i = 0; i < cluster->nchains; ++i) {
212                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
213                         continue;
214                 chain = cluster->array[i].chain;
215                 if (chain)
216                         flags |= chain->flags;
217         }
218         return (flags & HAMMER2_CHAIN_UNLINKED);
219 }
220
221 /*
222  * Set a bitmask of flags in all chains related to a cluster.
223  * The cluster should probably be locked.
224  *
225  * XXX Only operate on FEMOD elements?
226  */
227 void
228 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
229 {
230         hammer2_chain_t *chain;
231         int i;
232
233         for (i = 0; i < cluster->nchains; ++i) {
234                 chain = cluster->array[i].chain;
235                 if (chain)
236                         atomic_set_int(&chain->flags, flags);
237         }
238 }
239
240 /*
241  * Set a bitmask of flags in all chains related to a cluster.
242  * The cluster should probably be locked.
243  *
244  * XXX Only operate on FEMOD elements?
245  */
246 void
247 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
248 {
249         hammer2_chain_t *chain;
250         int i;
251
252         for (i = 0; i < cluster->nchains; ++i) {
253                 chain = cluster->array[i].chain;
254                 if (chain)
255                         atomic_clear_int(&chain->flags, flags);
256         }
257 }
258
259 /*
260  * Flag the cluster for flushing recursively up to the root.  Despite the
261  * work it does, this is relatively benign.  It just makes sure that the
262  * flusher has top-down visibility to this cluster.
263  *
264  * Errored chains are not flagged for flushing.
265  *
266  * The cluster should probably be locked.
267  */
268 void
269 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
270 {
271         hammer2_chain_t *chain;
272         int i;
273
274         for (i = 0; i < cluster->nchains; ++i) {
275                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
276                         continue;
277                 chain = cluster->array[i].chain;
278                 if (chain == NULL)
279                         continue;
280                 if (chain->error)
281                         continue;
282                 hammer2_chain_setflush(trans, chain);
283         }
284 }
285
286 /*
287  * Set the check mode for the cluster.
288  * Errored elements of the cluster are ignored.
289  *
290  * The cluster must be locked and modified.
291  */
292 void
293 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
294                                 hammer2_cluster_t *cluster,
295                                 int check_algo)
296 {
297         hammer2_chain_t *chain;
298         int i;
299
300         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
301         for (i = 0; i < cluster->nchains; ++i) {
302                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
303                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
304                         continue;
305                 }
306                 chain = cluster->array[i].chain;
307                 if (chain == NULL)
308                         continue;
309                 if (chain->error)
310                         continue;
311                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
312                 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
313                 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
314         }
315 }
316
317 /*
318  * Create a degenerate cluster with one ref from a single locked chain.
319  * The returned cluster will be focused on the chain and inherit its
320  * error state.
321  *
322  * The chain's lock and reference are transfered to the new cluster, so
323  * the caller should not try to unlock the chain separately.
324  *
325  * We fake the flags.
326  */
327 hammer2_cluster_t *
328 hammer2_cluster_from_chain(hammer2_chain_t *chain)
329 {
330         hammer2_cluster_t *cluster;
331
332         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
333         cluster->array[0].chain = chain;
334         cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
335         cluster->nchains = 1;
336         cluster->focus = chain;
337         cluster->focus_index = 0;
338         cluster->pmp = chain->pmp;
339         cluster->refs = 1;
340         cluster->error = chain->error;
341         cluster->flags = HAMMER2_CLUSTER_LOCKED |
342                          HAMMER2_CLUSTER_WRHARD |
343                          HAMMER2_CLUSTER_RDHARD |
344                          HAMMER2_CLUSTER_MSYNCED |
345                          HAMMER2_CLUSTER_SSYNCED;
346
347         return cluster;
348 }
349
350 /*
351  * Add a reference to a cluster and its underlying chains.
352  *
353  * We must also ref the underlying chains in order to allow ref/unlock
354  * sequences to later re-lock.
355  */
356 void
357 hammer2_cluster_ref(hammer2_cluster_t *cluster)
358 {
359         atomic_add_int(&cluster->refs, 1);
360 }
361
362 /*
363  * Drop the caller's reference to the cluster.  When the ref count drops to
364  * zero this function frees the cluster and drops all underlying chains.
365  *
366  * In-progress read I/Os are typically detached from the cluster once the
367  * first one returns (the remaining stay attached to the DIOs but are then
368  * ignored and drop naturally).
369  */
370 void
371 hammer2_cluster_drop(hammer2_cluster_t *cluster)
372 {
373         hammer2_chain_t *chain;
374         int i;
375
376         KKASSERT(cluster->refs > 0);
377         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
378                 cluster->focus = NULL;          /* safety XXX chg to assert */
379                 cluster->focus_index = 0;
380
381                 for (i = 0; i < cluster->nchains; ++i) {
382                         chain = cluster->array[i].chain;
383                         if (chain) {
384                                 hammer2_chain_drop(chain);
385                                 cluster->array[i].chain = NULL; /* safety */
386                         }
387                 }
388                 cluster->nchains = 0;                           /* safety */
389
390                 kfree(cluster, M_HAMMER2);
391                 /* cluster is invalid */
392         }
393 }
394
395 void
396 hammer2_cluster_wait(hammer2_cluster_t *cluster)
397 {
398         tsleep(cluster->focus, 0, "h2clcw", 1);
399 }
400
401 /*
402  * Lock a cluster.  Cluster must already be referenced.  Focus is maintained. 
403  *
404  * WARNING! This function expects the caller to handle resolution of the
405  *          cluster.  We never re-resolve the cluster in this function,
406  *          because it might be used to temporarily unlock/relock a cparent
407  *          in an iteration or recursrion, and the cparents elements do not
408  *          necessarily match.
409  */
410 void
411 hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how)
412 {
413         hammer2_chain_t *chain;
414         int i;
415
416         /* cannot be on inode-embedded cluster template, must be on copy */
417         KKASSERT(cluster->refs > 0);
418         KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
419         if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
420                 panic("hammer2_cluster_lock: cluster %p already locked!\n",
421                         cluster);
422         }
423         atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
424
425         /*
426          * Lock chains and resolve state.
427          */
428         for (i = 0; i < cluster->nchains; ++i) {
429                 if (i == idx)
430                         continue;
431                 chain = cluster->array[i].chain;
432                 if (chain == NULL)
433                         continue;
434                 hammer2_chain_lock(chain, how);
435         }
436 }
437
438 void
439 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
440 {
441         hammer2_cluster_lock_except(cluster, -1, how);
442 }
443
444 /*
445  * Calculate the clustering state for the cluster and set its focus.
446  * This routine must be called with care.  For example, it should not
447  * normally be called after relocking a non-leaf cluster because parent
448  * clusters help iterations and each element might be at a slightly different
449  * indirect node (each node's topology is independently indexed).
450  *
451  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
452  * operations.  Typically this is only set on a quorum of MASTERs or
453  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
454  * is present, this bit is *not* set on a quorum of MASTERs.  The
455  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
456  * that create/modify/delete elements use it.
457  *
458  * The chains making up the cluster may be narrowed down based on quorum
459  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
460  * narrowed down to a single chain as long as the entire subtopology is known
461  * to be intact.  So, for example, we can narrow a read-only op to a single
462  * fast SLAVE but if we focus a CACHE chain we must still retain at least
463  * a SLAVE to ensure that the subtopology can be accessed.
464  *
465  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
466  * to be maintained once the topology is validated as-of the top level of
467  * the operation.
468  *
469  * If a failure occurs the operation must be aborted by higher-level code and
470  * retried. XXX
471  */
472 void
473 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
474 {
475         hammer2_chain_t *chain;
476         hammer2_chain_t *focus;
477         hammer2_pfs_t *pmp;
478         hammer2_tid_t quorum_tid;
479         hammer2_tid_t last_best_quorum_tid;
480         int focus_pfs_type;
481         uint32_t nflags;
482         int ttlmasters;
483         int ttlslaves;
484         int nmasters;
485         int nslaves;
486         int nquorum;
487         int smpresent;
488         int i;
489
490         cluster->error = 0;
491         cluster->focus = NULL;
492
493         focus_pfs_type = 0;
494         nflags = 0;
495         ttlmasters = 0;
496         ttlslaves = 0;
497         nmasters = 0;
498         nslaves = 0;
499
500         /*
501          * Calculate quorum
502          */
503         pmp = cluster->pmp;
504         KKASSERT(pmp != NULL || cluster->nchains == 0);
505         nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
506         smpresent = 0;
507
508         /*
509          * Pass 1
510          *
511          * NOTE: A NULL chain is not necessarily an error, it could be
512          *       e.g. a lookup failure or the end of an iteration.
513          *       Process normally.
514          */
515         for (i = 0; i < cluster->nchains; ++i) {
516                 chain = cluster->array[i].chain;
517                 if (chain && chain->error) {
518                         if (cluster->focus == NULL || cluster->focus == chain) {
519                                 /* error will be overridden by valid focus */
520                                 cluster->error = chain->error;
521                         }
522
523                         /*
524                          * Must count total masters and slaves whether the
525                          * chain is errored or not.
526                          */
527                         switch (cluster->pmp->pfs_types[i]) {
528                         case HAMMER2_PFSTYPE_MASTER:
529                                 ++ttlmasters;
530                                 break;
531                         case HAMMER2_PFSTYPE_SLAVE:
532                                 ++ttlslaves;
533                                 break;
534                         }
535                         continue;
536                 }
537                 switch (cluster->pmp->pfs_types[i]) {
538                 case HAMMER2_PFSTYPE_MASTER:
539                         ++ttlmasters;
540                         break;
541                 case HAMMER2_PFSTYPE_SLAVE:
542                         ++ttlslaves;
543                         break;
544                 case HAMMER2_PFSTYPE_SOFT_MASTER:
545                         nflags |= HAMMER2_CLUSTER_WRSOFT;
546                         nflags |= HAMMER2_CLUSTER_RDSOFT;
547                         smpresent = 1;
548                         break;
549                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
550                         nflags |= HAMMER2_CLUSTER_RDSOFT;
551                         break;
552                 case HAMMER2_PFSTYPE_SUPROOT:
553                         /*
554                          * Degenerate cluster representing the super-root
555                          * topology on a single device.  Fake stuff so
556                          * cluster ops work as expected.
557                          */
558                         nflags |= HAMMER2_CLUSTER_WRHARD;
559                         nflags |= HAMMER2_CLUSTER_RDHARD;
560                         cluster->focus_index = i;
561                         cluster->focus = chain;
562                         cluster->error = chain ? chain->error : 0;
563                         break;
564                 default:
565                         break;
566                 }
567         }
568
569         /*
570          * Pass 2
571          *
572          * Resolve masters.  Calculate nmasters for the highest matching
573          * TID, if a quorum cannot be attained try the next lower matching
574          * TID until we exhaust TIDs.
575          *
576          * NOTE: A NULL chain is not necessarily an error, it could be
577          *       e.g. a lookup failure or the end of an iteration.
578          *       Process normally.
579          */
580         last_best_quorum_tid = HAMMER2_TID_MAX;
581         quorum_tid = 0;         /* fix gcc warning */
582
583         while (nmasters < nquorum && last_best_quorum_tid != 0) {
584                 nmasters = 0;
585                 quorum_tid = 0;
586
587                 for (i = 0; i < cluster->nchains; ++i) {
588                         if (cluster->pmp->pfs_types[i] !=
589                             HAMMER2_PFSTYPE_MASTER) {
590                                 continue;
591                         }
592                         chain = cluster->array[i].chain;
593
594                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
595                                 /*
596                                  * Invalid as in unsynchronized, cannot be
597                                  * used to calculate the quorum.
598                                  */
599                         } else if (chain == NULL && quorum_tid == 0) {
600                                 /*
601                                  * NULL chain on master matches NULL chains
602                                  * on other masters.
603                                  */
604                                 ++nmasters;
605                         } else if (quorum_tid < last_best_quorum_tid &&
606                                    chain != NULL &&
607                                    (quorum_tid < chain->bref.modify_tid ||
608                                     nmasters == 0)) {
609                                 /*
610                                  * Better TID located, reset nmasters count.
611                                  */
612                                 nmasters = 1;
613                                 quorum_tid = chain->bref.modify_tid;
614                         } else if (chain &&
615                                    quorum_tid == chain->bref.modify_tid) {
616                                 /*
617                                  * TID matches current collection.
618                                  */
619                                 ++nmasters;
620                         }
621                 }
622                 if (nmasters >= nquorum)
623                         break;
624                 last_best_quorum_tid = quorum_tid;
625         }
626
627         /*
628          * Pass 3
629          *
630          * NOTE: A NULL chain is not necessarily an error, it could be
631          *       e.g. a lookup failure or the end of an iteration.
632          *       Process normally.
633          */
634         for (i = 0; i < cluster->nchains; ++i) {
635                 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
636                 chain = cluster->array[i].chain;
637                 if (chain && chain->error) {
638                         if (cluster->focus == NULL || cluster->focus == chain) {
639                                 /* error will be overridden by valid focus */
640                                 cluster->error = chain->error;
641                         }
642                         continue;
643                 }
644
645                 switch (cluster->pmp->pfs_types[i]) {
646                 case HAMMER2_PFSTYPE_MASTER:
647                         /*
648                          * We must have enough up-to-date masters to reach
649                          * a quorum and the master modify_tid must match
650                          * the quorum's modify_tid.
651                          *
652                          * Do not select an errored or out-of-sync master.
653                          */
654                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
655                                 nflags |= HAMMER2_CLUSTER_UNHARD;
656                         } else if (nmasters >= nquorum &&
657                                    (chain == NULL || chain->error == 0) &&
658                                    ((chain == NULL && quorum_tid == 0) ||
659                                     (chain != NULL && quorum_tid ==
660                                                   chain->bref.modify_tid))) {
661                                 nflags |= HAMMER2_CLUSTER_WRHARD;
662                                 nflags |= HAMMER2_CLUSTER_RDHARD;
663                                 if (!smpresent) {
664                                         cluster->array[i].flags |=
665                                                         HAMMER2_CITEM_FEMOD;
666                                 }
667                                 if (cluster->focus == NULL ||
668                                     focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
669                                         focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
670                                         cluster->focus_index = i;
671                                         cluster->focus = chain; /* NULL ok */
672                                         cluster->error = chain ? chain->error :
673                                                                  0;
674                                 }
675                         } else if (chain == NULL || chain->error == 0) {
676                                 nflags |= HAMMER2_CLUSTER_UNHARD;
677                         }
678                         break;
679                 case HAMMER2_PFSTYPE_SLAVE:
680                         /*
681                          * We must have enough up-to-date masters to reach
682                          * a quorum and the slave modify_tid must match the
683                          * quorum's modify_tid.
684                          *
685                          * Do not select an errored slave.
686                          */
687                         if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
688                                 nflags |= HAMMER2_CLUSTER_UNHARD;
689                         } else if (nmasters >= nquorum &&
690                                    (chain == NULL || chain->error == 0) &&
691                                    ((chain == NULL && quorum_tid == 0) ||
692                                     (chain && quorum_tid ==
693                                               chain->bref.modify_tid))) {
694                                 ++nslaves;
695                                 nflags |= HAMMER2_CLUSTER_RDHARD;
696 #if 0
697                                 /* XXX optimize for RESOLVE_RDONLY */
698                                 if (cluster->focus == NULL) {
699                                         focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
700                                         cluster->focus_index = i;
701                                         cluster->focus = chain; /* NULL ok */
702                                         cluster->error = chain ? chain->error :
703                                                                  0;
704                                 }
705 #endif
706                         } else if (chain == NULL || chain->error == 0) {
707                                 nflags |= HAMMER2_CLUSTER_UNSOFT;
708                         }
709                         break;
710                 case HAMMER2_PFSTYPE_SOFT_MASTER:
711                         /*
712                          * Directly mounted soft master always wins.  There
713                          * should be only one.
714                          */
715                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
716                         cluster->focus_index = i;
717                         cluster->focus = chain;
718                         cluster->error = chain ? chain->error : 0;
719                         focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
720                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
721                         break;
722                 case HAMMER2_PFSTYPE_SOFT_SLAVE:
723                         /*
724                          * Directly mounted soft slave always wins.  There
725                          * should be only one.
726                          */
727                         KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
728                         if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
729                                 cluster->focus_index = i;
730                                 cluster->focus = chain;
731                                 cluster->error = chain ? chain->error : 0;
732                                 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
733                         }
734                         break;
735                 case HAMMER2_PFSTYPE_SUPROOT:
736                         /*
737                          * spmp (degenerate case)
738                          */
739                         KKASSERT(i == 0);
740                         cluster->focus_index = i;
741                         cluster->focus = chain;
742                         cluster->error = chain ? chain->error : 0;
743                         focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
744                         cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
745                         break;
746                 default:
747                         break;
748                 }
749         }
750
751         /*
752          * Focus now set, adjust ddflag.  Skip this pass if the focus
753          * is bad or if we are at the PFS root (the bref won't match at
754          * the PFS root, obviously).
755          */
756         focus = cluster->focus;
757         if (focus) {
758                 cluster->ddflag =
759                         (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
760         } else {
761                 cluster->ddflag = 0;
762                 goto skip4;
763         }
764         if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
765                 goto skip4;
766
767         /*
768          * Pass 4
769          *
770          * Validate the elements that were not marked invalid.  They should
771          * match.
772          */
773         for (i = 0; i < cluster->nchains; ++i) {
774                 int ddflag;
775
776                 chain = cluster->array[i].chain;
777
778                 if (chain == NULL)
779                         continue;
780                 if (chain == focus)
781                         continue;
782                 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
783                         continue;
784
785                 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
786                 if (chain->bref.type != focus->bref.type ||
787                     chain->bref.key != focus->bref.key ||
788                     chain->bref.keybits != focus->bref.keybits ||
789                     chain->bref.modify_tid != focus->bref.modify_tid ||
790                     chain->bytes != focus->bytes ||
791                     ddflag != cluster->ddflag) {
792                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
793                         if (hammer2_debug & 1)
794                         kprintf("cluster_resolve: matching modify_tid failed "
795                                 "bref test: idx=%d type=%02x/%02x "
796                                 "key=%016jx/%d-%016jx/%d "
797                                 "mod=%016jx/%016jx bytes=%u/%u\n",
798                                 i,
799                                 chain->bref.type, focus->bref.type,
800                                 chain->bref.key, chain->bref.keybits,
801                                 focus->bref.key, focus->bref.keybits,
802                                 chain->bref.modify_tid, focus->bref.modify_tid,
803                                 chain->bytes, focus->bytes);
804                         if (hammer2_debug & 0x4000)
805                                 panic("cluster_resolve");
806                         /* flag issue and force resync? */
807                 }
808         }
809 skip4:
810
811         if (ttlslaves == 0)
812                 nflags |= HAMMER2_CLUSTER_NOSOFT;
813         if (ttlmasters == 0)
814                 nflags |= HAMMER2_CLUSTER_NOHARD;
815
816         /*
817          * Set SSYNCED or MSYNCED for slaves and masters respectively if
818          * all available nodes (even if 0 are available) are fully
819          * synchronized.  This is used by the synchronization thread to
820          * determine if there is work it could potentially accomplish.
821          */
822         if (nslaves == ttlslaves)
823                 nflags |= HAMMER2_CLUSTER_SSYNCED;
824         if (nmasters == ttlmasters)
825                 nflags |= HAMMER2_CLUSTER_MSYNCED;
826
827         /*
828          * Determine if the cluster was successfully locked for the
829          * requested operation and generate an error code.  The cluster
830          * will not be locked (or ref'd) if an error is returned.
831          *
832          * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
833          * to determine if reading or writing is possible.  If writing, the
834          * cluster still requires a call to hammer2_cluster_modify() first.
835          */
836         atomic_set_int(&cluster->flags, nflags);
837         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
838 }
839
840 /*
841  * This is used by the sync thread to force non-NULL elements of a copy
842  * of the pmp->iroot cluster to be good which is required to prime the
843  * sync.
844  */
845 void
846 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
847 {
848         int i;
849
850         for (i = 0; i < cluster->nchains; ++i) {
851                 if (cluster->array[i].chain)
852                         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
853         }
854 }
855
856 /*
857  * Copy a cluster, returned a ref'd cluster.  All underlying chains
858  * are also ref'd, but not locked.  Focus state is also copied.
859  *
860  * Original cluster does not have to be locked but usually is.
861  * New cluster will not be flagged as locked.
862  *
863  * Callers using this function to initialize a new cluster from an inode
864  * generally lock and resolve the resulting cluster.
865  *
866  * Callers which use this function to save/restore a cluster structure
867  * generally retain the focus state and do not re-resolve it.  Caller should
868  * not try to re-resolve internal (cparent) node state during an iteration
869  * as the individual tracking elements of cparent in an iteration may not
870  * match even though they are correct.
871  */
872 hammer2_cluster_t *
873 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
874 {
875         hammer2_pfs_t *pmp = ocluster->pmp;
876         hammer2_cluster_t *ncluster;
877         hammer2_chain_t *chain;
878         int i;
879
880         ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
881         ncluster->pmp = pmp;
882         ncluster->nchains = ocluster->nchains;
883         ncluster->refs = 1;
884
885         for (i = 0; i < ocluster->nchains; ++i) {
886                 chain = ocluster->array[i].chain;
887                 ncluster->array[i].chain = chain;
888                 ncluster->array[i].flags = ocluster->array[i].flags;
889                 if (chain)
890                         hammer2_chain_ref(chain);
891         }
892         ncluster->focus_index = ocluster->focus_index;
893         ncluster->focus = ocluster->focus;
894         ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED |
895                                               HAMMER2_CLUSTER_INODE);
896
897         return (ncluster);
898 }
899
900 /*
901  * Unlock a cluster.  Refcount and focus is maintained.
902  */
903 void
904 hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx)
905 {
906         hammer2_chain_t *chain;
907         int i;
908
909         if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
910                 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
911                         cluster);
912         }
913         KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
914         KKASSERT(cluster->refs > 0);
915         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
916
917         for (i = 0; i < cluster->nchains; ++i) {
918                 if (i == idx)
919                         continue;
920                 chain = cluster->array[i].chain;
921                 if (chain)
922                         hammer2_chain_unlock(chain);
923         }
924 }
925
926 void
927 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
928 {
929         hammer2_cluster_unlock_except(cluster, -1);
930 }
931
932 /*
933  * Resize the cluster's physical storage allocation in-place.  This may
934  * replace the cluster's chains.
935  */
936 void
937 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
938                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
939                        int nradix, int flags)
940 {
941         hammer2_chain_t *chain;
942         int i;
943
944         KKASSERT(cparent->pmp == cluster->pmp);         /* can be NULL */
945         KKASSERT(cparent->nchains == cluster->nchains);
946
947         for (i = 0; i < cluster->nchains; ++i) {
948                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
949                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
950                         continue;
951                 }
952                 chain = cluster->array[i].chain;
953                 if (chain) {
954                         KKASSERT(cparent->array[i].chain);
955                         hammer2_chain_resize(trans, ip,
956                                              cparent->array[i].chain, chain,
957                                              nradix, flags);
958                 }
959         }
960 }
961
962 /*
963  * Set an inode's cluster modified, marking the related chains RW and
964  * duplicating them if necessary.
965  *
966  * The passed-in chain is a localized copy of the chain previously acquired
967  * when the inode was locked (and possilby replaced in the mean time), and
968  * must also be updated.  In fact, we update it first and then synchronize
969  * the inode's cluster cache.
970  */
971 hammer2_inode_data_t *
972 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
973                           hammer2_cluster_t *cluster, int flags)
974 {
975         atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
976         hammer2_cluster_modify(trans, cluster, flags);
977
978         hammer2_inode_repoint(ip, NULL, cluster);
979         if (ip->vp)
980                 vsetisdirty(ip->vp);
981         return (&hammer2_cluster_wdata(cluster)->ipdata);
982 }
983
984 /*
985  * Adjust the cluster's chains to allow modification and adjust the
986  * focus.  Data will be accessible on return.
987  *
988  * If our focused master errors on modify, re-resolve the cluster to
989  * try to select a different master.
990  */
991 void
992 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
993                        int flags)
994 {
995         hammer2_chain_t *chain;
996         int resolve_again;
997         int i;
998
999         resolve_again = 0;
1000         for (i = 0; i < cluster->nchains; ++i) {
1001                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1002                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1003                         continue;
1004                 }
1005                 chain = cluster->array[i].chain;
1006                 if (chain == NULL)
1007                         continue;
1008                 if (chain->error)
1009                         continue;
1010                 hammer2_chain_modify(trans, chain, flags);
1011                 if (cluster->focus == chain && chain->error) {
1012                         cluster->error = chain->error;
1013                         resolve_again = 1;
1014                 }
1015         }
1016         if (resolve_again)
1017                 hammer2_cluster_resolve(cluster);
1018 }
1019
1020 /*
1021  * Synchronize modifications from the focus to other chains in a cluster.
1022  * Convenient because nominal API users can just modify the contents of the
1023  * focus (at least for non-blockref data).
1024  *
1025  * Nominal front-end operations only edit non-block-table data in a single
1026  * chain.  This code copies such modifications to the other chains in the
1027  * cluster.  Blocktable modifications are handled on a chain-by-chain basis
1028  * by both the frontend and the backend and will explode in fireworks if
1029  * blindly copied.
1030  */
1031 void
1032 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
1033 {
1034         hammer2_chain_t *focus;
1035         hammer2_chain_t *scan;
1036         const hammer2_inode_data_t *ripdata;
1037         hammer2_inode_data_t *wipdata;
1038         int i;
1039
1040         focus = cluster->focus;
1041         KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
1042
1043         for (i = 0; i < cluster->nchains; ++i) {
1044                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1045                         continue;
1046                 scan = cluster->array[i].chain;
1047                 if (scan == NULL || scan == focus)
1048                         continue;
1049                 if (scan->error)
1050                         continue;
1051                 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
1052                 KKASSERT(focus->bytes == scan->bytes &&
1053                          focus->bref.type == scan->bref.type);
1054                 switch(focus->bref.type) {
1055                 case HAMMER2_BREF_TYPE_INODE:
1056                         ripdata = &focus->data->ipdata;
1057                         wipdata = &scan->data->ipdata;
1058                         if ((ripdata->op_flags &
1059                             HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1060                                 bcopy(ripdata, wipdata,
1061                                       offsetof(hammer2_inode_data_t, u));
1062                                 break;
1063                         }
1064                         /* fall through to full copy */
1065                 case HAMMER2_BREF_TYPE_DATA:
1066                         bcopy(focus->data, scan->data, focus->bytes);
1067                         break;
1068                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1069                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1070                 case HAMMER2_BREF_TYPE_FREEMAP:
1071                 case HAMMER2_BREF_TYPE_VOLUME:
1072                         panic("hammer2_cluster_modsync: illegal node type");
1073                         /* NOT REACHED */
1074                         break;
1075                 default:
1076                         panic("hammer2_cluster_modsync: unknown node type");
1077                         break;
1078                 }
1079         }
1080 }
1081
1082 /*
1083  * Lookup initialization/completion API.  Returns a locked, fully resolved
1084  * cluster with one ref.
1085  */
1086 hammer2_cluster_t *
1087 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
1088 {
1089         hammer2_cluster_t *cluster;
1090
1091         cluster = hammer2_cluster_copy(cparent);
1092         if (flags & HAMMER2_LOOKUP_SHARED) {
1093                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
1094                                               HAMMER2_RESOLVE_SHARED);
1095         } else {
1096                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1097         }
1098         hammer2_cluster_resolve(cluster);
1099
1100         return (cluster);
1101 }
1102
1103 void
1104 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
1105 {
1106         if (cparent) {
1107                 hammer2_cluster_unlock(cparent);
1108                 hammer2_cluster_drop(cparent);
1109         }
1110 }
1111
1112 /*
1113  * Locate first match or overlap under parent, return a new, locked, resolved
1114  * cluster with one ref.
1115  *
1116  * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1117  */
1118 hammer2_cluster_t *
1119 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
1120                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1121 {
1122         hammer2_pfs_t *pmp;
1123         hammer2_cluster_t *cluster;
1124         hammer2_chain_t *chain;
1125         hammer2_key_t key_accum;
1126         hammer2_key_t key_next;
1127         int null_count;
1128         int rflags;
1129         int i;
1130
1131         KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1132
1133         pmp = cparent->pmp;                             /* can be NULL */
1134         key_accum = *key_nextp;
1135         null_count = 0;
1136         if (flags & HAMMER2_LOOKUP_SHARED)
1137                 rflags = HAMMER2_RESOLVE_SHARED;
1138         else
1139                 rflags = 0;
1140
1141         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
1142         cluster->pmp = pmp;                             /* can be NULL */
1143         cluster->refs = 1;
1144         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1145                 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1146
1147         /*
1148          * Iterating earlier cluster elements with later elements still
1149          * locked is a problem, so we have to unlock the parent and then
1150          * re-lock as we go.
1151          */
1152         hammer2_cluster_unlock(cparent);
1153         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1154
1155         /*
1156          * Pass-1, issue lookups.
1157          */
1158         for (i = 0; i < cparent->nchains; ++i) {
1159                 cluster->array[i].flags = cparent->array[i].flags;
1160                 key_next = *key_nextp;
1161
1162                 /*
1163                  * Always relock the parent as we go.
1164                  */
1165                 if (cparent->array[i].chain) {
1166                         hammer2_chain_lock(cparent->array[i].chain, rflags);
1167                 }
1168
1169                 /*
1170                  * Nothing to base the lookup, or parent was not synchronized.
1171                  */
1172                 if (cparent->array[i].chain == NULL ||
1173                     (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
1174                         ++null_count;
1175                         continue;
1176                 }
1177
1178                 chain = hammer2_chain_lookup(&cparent->array[i].chain,
1179                                              &key_next,
1180                                              key_beg, key_end,
1181                                              &cparent->array[i].cache_index,
1182                                              flags);
1183                 cluster->array[i].chain = chain;
1184                 if (chain == NULL) {
1185                         ++null_count;
1186                 }
1187                 if (key_accum > key_next)
1188                         key_accum = key_next;
1189         }
1190
1191         /*
1192          * Cleanup
1193          */
1194         cluster->nchains = i;
1195         *key_nextp = key_accum;
1196
1197         /*
1198          * The cluster must be resolved, out of sync elements may be present.
1199          *
1200          * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1201          */
1202         if (null_count != i)
1203                 hammer2_cluster_resolve(cluster);
1204         if (null_count == i ||
1205             (cluster->focus == NULL &&
1206              (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1207                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1208                         hammer2_cluster_unlock(cluster);
1209                 hammer2_cluster_drop(cluster);
1210                 cluster = NULL;
1211         }
1212
1213         return (cluster);
1214 }
1215
1216 /*
1217  * Locate next match or overlap under parent, replace the passed-in cluster.
1218  * The returned cluster is a new, locked, resolved cluster with one ref.
1219  *
1220  * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1221  */
1222 hammer2_cluster_t *
1223 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1224                      hammer2_key_t *key_nextp,
1225                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1226 {
1227         hammer2_chain_t *ochain;
1228         hammer2_chain_t *nchain;
1229         hammer2_key_t key_accum;
1230         hammer2_key_t key_next;
1231         int parent_index;
1232         int cluster_index;
1233         int null_count;
1234         int rflags;
1235         int i;
1236
1237         KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1238
1239         key_accum = *key_nextp;
1240         null_count = 0;
1241         parent_index = cparent->focus_index;    /* save prior focus */
1242         cluster_index = cluster->focus_index;
1243         if (flags & HAMMER2_LOOKUP_SHARED)
1244                 rflags = HAMMER2_RESOLVE_SHARED;
1245         else
1246                 rflags = 0;
1247
1248         cluster->focus = NULL;          /* XXX needed any more? */
1249         /*cparent->focus = NULL;*/
1250         cluster->focus_index = 0;       /* XXX needed any more? */
1251         /*cparent->focus_index = 0;*/
1252
1253         cluster->ddflag = 0;
1254
1255         /*
1256          * The parent is always locked on entry, the iterator may be locked
1257          * depending on flags.
1258          *
1259          * We must temporarily unlock the passed-in clusters to avoid a
1260          * deadlock between elements of the cluster with other threads.
1261          * We will fixup the lock in the loop.
1262          *
1263          * Note that this will clear the focus.
1264          *
1265          * Reflag the clusters as locked, because we will relock them
1266          * as we go.
1267          */
1268         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1269                 hammer2_cluster_unlock(cluster);
1270                 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1271         }
1272         hammer2_cluster_unlock(cparent);
1273         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1274
1275         for (i = 0; i < cparent->nchains; ++i) {
1276                 key_next = *key_nextp;
1277                 ochain = cluster->array[i].chain;
1278
1279                 /*
1280                  * Always relock the parent as we go.
1281                  */
1282                 if (cparent->array[i].chain)
1283                         hammer2_chain_lock(cparent->array[i].chain, rflags);
1284
1285                 /*
1286                  * Nothing to iterate from.  These cases can occur under
1287                  * normal operations.  For example, during synchronization
1288                  * a slave might reach the end of its scan while records
1289                  * are still left on the master(s).
1290                  */
1291                 if (ochain == NULL) {
1292                         ++null_count;
1293                         continue;
1294                 }
1295                 if (cparent->array[i].chain == NULL ||
1296                     (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1297                     (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
1298                         /* ochain has not yet been relocked */
1299                         hammer2_chain_drop(ochain);
1300                         cluster->array[i].chain = NULL;
1301                         ++null_count;
1302                         continue;
1303                 }
1304
1305                 /*
1306                  * Relock the child if necessary.  Parent and child will then
1307                  * be locked as expected by hammer2_chain_next() and flags.
1308                  */
1309                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1310                         hammer2_chain_lock(ochain, rflags);
1311                 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1312                                             &key_next, key_beg, key_end,
1313                                             &cparent->array[i].cache_index,
1314                                             flags);
1315                 /* ochain now invalid but can still be used for focus check */
1316                 if (parent_index == i) {
1317                         cparent->focus_index = i;
1318                         cparent->focus = cparent->array[i].chain;
1319                 }
1320
1321                 cluster->array[i].chain = nchain;
1322                 if (nchain == NULL) {
1323                         ++null_count;
1324                 }
1325                 if (key_accum > key_next)
1326                         key_accum = key_next;
1327         }
1328
1329         /*
1330          * Cleanup
1331          */
1332         cluster->nchains = i;
1333         *key_nextp = key_accum;
1334
1335         /*
1336          * The cluster must be resolved, out of sync elements may be present.
1337          *
1338          * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1339          */
1340         if (null_count != i)
1341                 hammer2_cluster_resolve(cluster);
1342         if (null_count == i ||
1343             (cluster->focus == NULL &&
1344              (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1345                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1346                         hammer2_cluster_unlock(cluster);
1347                 hammer2_cluster_drop(cluster);
1348                 cluster = NULL;
1349         }
1350         return(cluster);
1351 }
1352
1353 /*
1354  * Advance just one chain in the cluster and recalculate the invalid bit.
1355  * The cluster index is allowed to be flagged invalid on input and is
1356  * recalculated on return.
1357  *
1358  * (used during synchronization to advance past a chain being deleted).
1359  *
1360  * The chain being advanced must not be the focus and the clusters in
1361  * question must have already passed normal cluster_lookup/cluster_next
1362  * checks.
1363  *
1364  * The cluster always remains intact on return, so void function.
1365  */
1366 void
1367 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1368                                   hammer2_cluster_t *cluster,
1369                                   hammer2_key_t *key_nextp,
1370                                   hammer2_key_t key_beg,
1371                                   hammer2_key_t key_end,
1372                                   int i, int flags)
1373 {
1374         hammer2_chain_t *ochain;
1375         hammer2_chain_t *nchain;
1376         hammer2_chain_t *focus;
1377         hammer2_key_t key_accum;
1378         hammer2_key_t key_next;
1379         int ddflag;
1380
1381         key_accum = *key_nextp;
1382         key_next = *key_nextp;
1383         ochain = cluster->array[i].chain;
1384         if (ochain == NULL)
1385                 goto done;
1386         KKASSERT(ochain != cluster->focus);
1387
1388         nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1389                                     &key_next, key_beg, key_end,
1390                                     &cparent->array[i].cache_index,
1391                                     flags);
1392         /* ochain now invalid */
1393         if (cparent->focus_index == i)
1394                 cparent->focus = cparent->array[i].chain;
1395
1396         /*
1397          * Install nchain.  Note that nchain can be NULL, and can also
1398          * be in an unlocked state depending on flags.
1399          */
1400         cluster->array[i].chain = nchain;
1401         cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1402
1403         if (key_accum > key_next)
1404                 key_accum = key_next;
1405
1406         focus = cluster->focus;
1407         if (focus == NULL)
1408                 goto done;
1409         if (nchain == NULL)
1410                 goto done;
1411 #if 0
1412         if (nchain == focus)    /* ASSERTED NOT TRUE */
1413                 ...
1414 #endif
1415         ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1416         if (nchain->bref.type != focus->bref.type ||
1417             nchain->bref.key != focus->bref.key ||
1418             nchain->bref.keybits != focus->bref.keybits ||
1419             nchain->bref.modify_tid != focus->bref.modify_tid ||
1420             nchain->bytes != focus->bytes ||
1421             ddflag != cluster->ddflag) {
1422                 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1423         }
1424
1425 done:
1426         *key_nextp = key_accum;
1427 #if 0
1428         /*
1429          * For now don't re-resolve cluster->flags.
1430          */
1431         hammer2_cluster_resolve(cluster);
1432 #endif
1433 }
1434
1435 /*
1436  * Create a new cluster using the specified key
1437  */
1438 int
1439 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1440                      hammer2_cluster_t **clusterp,
1441                      hammer2_key_t key, int keybits,
1442                      int type, size_t bytes, int flags)
1443 {
1444         hammer2_cluster_t *cluster;
1445         hammer2_pfs_t *pmp;
1446         int error;
1447         int i;
1448
1449         pmp = trans->pmp;                               /* can be NULL */
1450
1451         if ((cluster = *clusterp) == NULL) {
1452                 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1453                                   M_WAITOK | M_ZERO);
1454                 cluster->pmp = pmp;                     /* can be NULL */
1455                 cluster->refs = 1;
1456                 cluster->flags = HAMMER2_CLUSTER_LOCKED;
1457         }
1458         cluster->focus_index = 0;
1459         cluster->focus = NULL;
1460
1461         /*
1462          * NOTE: cluster->array[] entries can initially be NULL.  If
1463          *       *clusterp is supplied, skip NULL entries, otherwise
1464          *       create new chains.
1465          */
1466         for (i = 0; i < cparent->nchains; ++i) {
1467                 if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1468                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1469                         continue;
1470                 }
1471                 if (*clusterp) {
1472                         if ((cluster->array[i].flags &
1473                              HAMMER2_CITEM_FEMOD) == 0) {
1474                                 cluster->array[i].flags |=
1475                                                 HAMMER2_CITEM_INVALID;
1476                                 continue;
1477                         }
1478                         if (cluster->array[i].chain == NULL)
1479                                 continue;
1480                 }
1481                 error = hammer2_chain_create(trans, &cparent->array[i].chain,
1482                                              &cluster->array[i].chain, pmp,
1483                                              key, keybits,
1484                                              type, bytes, flags);
1485                 if (cparent->focus_index == i)
1486                         cparent->focus = cparent->array[i].chain;
1487                 KKASSERT(error == 0);
1488                 if (cluster->focus == NULL) {
1489                         cluster->focus_index = i;
1490                         cluster->focus = cluster->array[i].chain;
1491                 }
1492                 if (cparent->focus == cparent->array[i].chain) {
1493                         cluster->focus_index = i;
1494                         cluster->focus = cluster->array[i].chain;
1495                 }
1496         }
1497         cluster->nchains = i;
1498         *clusterp = cluster;
1499         hammer2_cluster_resolve(cluster);
1500
1501         return error;
1502 }
1503
1504 /*
1505  * Rename a cluster to a new parent.
1506  *
1507  * WARNING! Any passed-in bref is probaly from hammer2_cluster_bref(),
1508  *          So the data_off field is not relevant.  Only the key and
1509  *          keybits are used.
1510  */
1511 void
1512 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
1513                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1514                        int flags)
1515 {
1516         hammer2_chain_t *chain;
1517         hammer2_blockref_t xbref;
1518         int i;
1519
1520 #if 0
1521         cluster->focus = NULL;
1522         cparent->focus = NULL;
1523         cluster->focus_index = 0;
1524         cparent->focus_index = 0;
1525 #endif
1526
1527         for (i = 0; i < cluster->nchains; ++i) {
1528                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1529                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1530                         continue;
1531                 }
1532                 chain = cluster->array[i].chain;
1533                 if (chain) {
1534                         if (bref) {
1535                                 xbref = chain->bref;
1536                                 xbref.key = bref->key;
1537                                 xbref.keybits = bref->keybits;
1538                                 hammer2_chain_rename(trans, &xbref,
1539                                                      &cparent->array[i].chain,
1540                                                      chain, flags);
1541                         } else {
1542                                 hammer2_chain_rename(trans, NULL,
1543                                                      &cparent->array[i].chain,
1544                                                      chain, flags);
1545                         }
1546                         if (cparent->focus_index == i)
1547                                 cparent->focus = cparent->array[i].chain;
1548                         KKASSERT(cluster->array[i].chain == chain); /*remove*/
1549                 }
1550         }
1551 }
1552
1553 /*
1554  * Mark a cluster deleted
1555  */
1556 void
1557 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1558                        hammer2_cluster_t *cluster, int flags)
1559 {
1560         hammer2_chain_t *chain;
1561         hammer2_chain_t *parent;
1562         int i;
1563
1564         if (cparent == NULL) {
1565                 kprintf("cparent is NULL\n");
1566                 return;
1567         }
1568
1569         for (i = 0; i < cluster->nchains; ++i) {
1570                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1571                         cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1572                         continue;
1573                 }
1574                 parent = cparent->array[i].chain;
1575                 chain = cluster->array[i].chain;
1576                 if (chain == NULL)
1577                         continue;
1578                 if (chain->parent != parent) {
1579                         kprintf("hammer2_cluster_delete: parent "
1580                                 "mismatch chain=%p parent=%p against=%p\n",
1581                                 chain, chain->parent, parent);
1582                 } else {
1583                         hammer2_chain_delete(trans, parent, chain, flags);
1584                 }
1585         }
1586 }
1587
1588 /*
1589  * Create a snapshot of the specified {parent, ochain} with the specified
1590  * label.  The originating hammer2_inode must be exclusively locked for
1591  * safety.
1592  *
1593  * The ioctl code has already synced the filesystem.
1594  */
1595 int
1596 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1597                        hammer2_ioc_pfs_t *pfs)
1598 {
1599         hammer2_dev_t *hmp;
1600         hammer2_cluster_t *ncluster;
1601         const hammer2_inode_data_t *ripdata;
1602         hammer2_inode_data_t *wipdata;
1603         hammer2_chain_t *nchain;
1604         hammer2_inode_t *nip;
1605         size_t name_len;
1606         hammer2_key_t lhc;
1607         struct vattr vat;
1608 #if 0
1609         uuid_t opfs_clid;
1610 #endif
1611         int error;
1612         int i;
1613
1614         kprintf("snapshot %s\n", pfs->name);
1615
1616         name_len = strlen(pfs->name);
1617         lhc = hammer2_dirhash(pfs->name, name_len);
1618
1619         /*
1620          * Get the clid
1621          */
1622         ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1623 #if 0
1624         opfs_clid = ripdata->pfs_clid;
1625 #endif
1626         hmp = ocluster->focus->hmp;     /* XXX find synchronized local disk */
1627
1628         /*
1629          * Create the snapshot directory under the super-root
1630          *
1631          * Set PFS type, generate a unique filesystem id, and generate
1632          * a cluster id.  Use the same clid when snapshotting a PFS root,
1633          * which theoretically allows the snapshot to be used as part of
1634          * the same cluster (perhaps as a cache).
1635          *
1636          * Copy the (flushed) blockref array.  Theoretically we could use
1637          * chain_duplicate() but it becomes difficult to disentangle
1638          * the shared core so for now just brute-force it.
1639          */
1640         VATTR_NULL(&vat);
1641         vat.va_type = VDIR;
1642         vat.va_mode = 0755;
1643         ncluster = NULL;
1644         nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1645                                    proc0.p_ucred, pfs->name, name_len,
1646                                    &ncluster,
1647                                    HAMMER2_INSERT_PFSROOT, &error);
1648
1649         if (nip) {
1650                 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1651                 wipdata->pfs_type = HAMMER2_PFSTYPE_MASTER;
1652                 wipdata->pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1653                 wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT;
1654                 kern_uuidgen(&wipdata->pfs_fsid, 1);
1655
1656                 /*
1657                  * Give the snapshot its own private cluster.  As a snapshot
1658                  * no further synchronization with the original cluster will
1659                  * be done.
1660                  */
1661 #if 0
1662                 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1663                         wipdata->pfs_clid = opfs_clid;
1664                 else
1665                         kern_uuidgen(&wipdata->pfs_clid, 1);
1666 #endif
1667                 kern_uuidgen(&wipdata->pfs_clid, 1);
1668
1669                 for (i = 0; i < ncluster->nchains; ++i) {
1670                         if ((ncluster->array[i].flags &
1671                              HAMMER2_CITEM_FEMOD) == 0) {
1672                                 ncluster->array[i].flags |=
1673                                         HAMMER2_CITEM_INVALID;
1674                                 continue;
1675                         }
1676                         nchain = ncluster->array[i].chain;
1677                         if (nchain)
1678                                 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1679                 }
1680 #if 0
1681                 /* XXX can't set this unless we do an explicit flush, which
1682                    we also need a pmp assigned to do, else the flush code
1683                    won't flush ncluster because it thinks it is crossing a
1684                    flush boundary */
1685                 hammer2_cluster_set_chainflags(ncluster,
1686                                                HAMMER2_CHAIN_PFSBOUNDARY);
1687 #endif
1688
1689                 /* XXX hack blockset copy */
1690                 /* XXX doesn't work with real cluster */
1691                 KKASSERT(ocluster->nchains == 1);
1692                 wipdata->u.blockset = ripdata->u.blockset;
1693                 hammer2_cluster_modsync(ncluster);
1694                 for (i = 0; i < ncluster->nchains; ++i) {
1695                         nchain = ncluster->array[i].chain;
1696                         if (nchain)
1697                                 hammer2_flush(trans, nchain, 1);
1698                 }
1699                 hammer2_inode_unlock(nip, ncluster);
1700         }
1701         return (error);
1702 }
1703
1704 /*
1705  * Return locked parent cluster given a locked child.  The child remains
1706  * locked on return.  The new parent's focus follows the child's focus
1707  * and the parent is always resolved.
1708  *
1709  * We must temporarily unlock the passed-in cluster to avoid a deadlock
1710  * between elements of the cluster.
1711  *
1712  * We must not try to hammer2_cluster_resolve() cparent.  The individual
1713  * parent chains for the nodes are the correct parents for the cluster but
1714  * do not necessarily match, so resolve would likely implode.
1715  */
1716 hammer2_cluster_t *
1717 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1718 {
1719         hammer2_cluster_t *cparent;
1720         int i;
1721
1722         cparent = hammer2_cluster_copy(cluster);
1723         hammer2_cluster_unlock(cluster);
1724
1725         for (i = 0; i < cparent->nchains; ++i) {
1726                 hammer2_chain_t *chain;
1727                 hammer2_chain_t *rchain;
1728
1729                 /*
1730                  * Calculate parent for each element.  Old chain has an extra
1731                  * ref for cparent but the lock remains with cluster.
1732                  */
1733                 chain = cparent->array[i].chain;
1734                 if (chain == NULL)
1735                         continue;
1736                 while ((rchain = chain->parent) != NULL) {
1737                         hammer2_chain_ref(rchain);
1738                         hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1739                         if (chain->parent == rchain)
1740                                 break;
1741                         hammer2_chain_unlock(rchain);
1742                         hammer2_chain_drop(rchain);
1743                 }
1744                 cparent->array[i].chain = rchain;
1745                 hammer2_chain_drop(chain);
1746         }
1747         cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1748         /* hammer2_cluster_resolve(cparent); */
1749         hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1750
1751         return cparent;
1752 }
1753
1754 /************************************************************************
1755  *                              CLUSTER I/O                             *
1756  ************************************************************************
1757  *
1758  *
1759  * WARNING! blockref[] array data is not universal.  These functions should
1760  *          only be used to access universal data.
1761  *
1762  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1763  *          complete if necessary.  The I/O's should have already been
1764  *          initiated by the cluster_lock/chain_lock operation.
1765  *
1766  *          The cluster must already be in a modified state before wdata
1767  *          is called.  The data will already be available for this case.
1768  */
1769 const hammer2_media_data_t *
1770 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1771 {
1772         return(cluster->focus->data);
1773 }
1774
1775 hammer2_media_data_t *
1776 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1777 {
1778         KKASSERT(hammer2_cluster_modified(cluster));
1779         return(cluster->focus->data);
1780 }
1781
1782 /*
1783  * Load cluster data asynchronously with callback.
1784  *
1785  * The callback is made for the first validated data found, or NULL
1786  * if no valid data is available.
1787  *
1788  * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1789  *       in the inode with an exclusive lock held), the chain structure may be
1790  *       shared.
1791  */
1792 void
1793 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1794                            void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1795 {
1796         hammer2_chain_t *chain;
1797         hammer2_iocb_t *iocb;
1798         hammer2_dev_t *hmp;
1799         hammer2_blockref_t *bref;
1800         int i;
1801
1802         i = cluster->focus_index;
1803         chain = cluster->focus;
1804
1805         iocb = &cluster->iocb;
1806         iocb->callback = callback;
1807         iocb->dio = NULL;               /* for already-validated case */
1808         iocb->cluster = cluster;
1809         iocb->chain = chain;
1810         iocb->ptr = ptr;
1811         iocb->lbase = (off_t)i;
1812         iocb->flags = 0;
1813         iocb->error = 0;
1814
1815         /*
1816          * Data already validated
1817          */
1818         if (chain->data) {
1819                 callback(iocb);
1820                 return;
1821         }
1822
1823         /*
1824          * We must resolve to a device buffer, either by issuing I/O or
1825          * by creating a zero-fill element.  We do not mark the buffer
1826          * dirty when creating a zero-fill element (the hammer2_chain_modify()
1827          * API must still be used to do that).
1828          *
1829          * The device buffer is variable-sized in powers of 2 down
1830          * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
1831          * chunk always contains buffers of the same size. (XXX)
1832          *
1833          * The minimum physical IO size may be larger than the variable
1834          * block size.
1835          *
1836          * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
1837          *            matches hammer2_devblksize()?  Or does the freemap's
1838          *            pre-zeroing handle the case for us?
1839          */
1840         bref = &chain->bref;
1841         hmp = chain->hmp;
1842
1843 #if 0
1844         /* handled by callback? <- TODO XXX even needed for loads? */
1845         /*
1846          * The getblk() optimization for a 100% overwrite can only be used
1847          * if the physical block size matches the request.
1848          */
1849         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1850             chain->bytes == hammer2_devblksize(chain->bytes)) {
1851                 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1852                 KKASSERT(error == 0);
1853                 iocb->dio = dio;
1854                 callback(iocb);
1855                 return;
1856         }
1857 #endif
1858
1859         /*
1860          * Otherwise issue a read
1861          */
1862         hammer2_adjreadcounter(&chain->bref, chain->bytes);
1863         hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
1864 }