ffc1838eca843249a7c2affc1e201847151eea19
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *          locks and I/O, do quorum and/or master-slave processing, and
50  *          it must operate properly even if some nodes are broken (which
51  *          can also mean indefinite locks).
52  *
53  *                              CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *              hammer2_cluster_lock()
59  *              hammer2_cluster_parent()
60  *
61  *      - Most complex functions, quorum management on transaction ids.
62  *
63  *      - Locking and data accesses must be internally asynchronous.
64  *
65  *      - Validate and manage cache coherency primitives (cache state
66  *        is stored in chain topologies but must be validated by these
67  *        functions).
68  *
69  * (2) Lookups and Scans
70  *              hammer2_cluster_lookup()
71  *              hammer2_cluster_next()
72  *
73  *      - Depend on locking & data retrieval functions, but still complex.
74  *
75  *      - Must do quorum management on transaction ids.
76  *
77  *      - Lookup and Iteration ops must be internally asynchronous.
78  *
79  * (3) Modifying Operations
80  *              hammer2_cluster_create()
81  *              hammer2_cluster_rename()
82  *              hammer2_cluster_delete()
83  *              hammer2_cluster_modify()
84  *              hammer2_cluster_modsync()
85  *
86  *      - Can usually punt on failures, operation continues unless quorum
87  *        is lost.  If quorum is lost, must wait for resynchronization
88  *        (depending on the management mode).
89  *
90  *      - Must disconnect node on failures (also not flush), remount, and
91  *        resynchronize.
92  *
93  *      - Network links (via kdmsg) are relatively easy to issue as the
94  *        complex underworkings of hammer2_chain.c don't have to be messed
95  *        with (the protocol is at a higher level than block-level).
96  *
97  *      - Multiple local disk nodes (i.e. block devices) are another matter.
98  *        Chain operations have to be dispatched to per-node threads (xN)
99  *        because we can't asynchronize potentially very complex chain
100  *        operations in hammer2_chain.c (it would be a huge mess).
101  *
102  *        (these threads are also used to terminate incoming kdmsg ops from
103  *        other machines).
104  *
105  *      - Single-node filesystems do not use threads and will simply call
106  *        hammer2_chain.c functions directly.  This short-cut is handled
107  *        at the base of each cluster function.
108  */
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
115
116 #include "hammer2.h"
117
118 /*
119  * Returns TRUE if any chain in the cluster needs to be resized.
120  */
121 int
122 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
123 {
124         hammer2_chain_t *chain;
125         int i;
126
127         for (i = 0; i < cluster->nchains; ++i) {
128                 chain = cluster->array[i].chain;
129                 if (chain && chain->bytes != bytes)
130                         return 1;
131         }
132         return 0;
133 }
134
135 uint8_t
136 hammer2_cluster_type(hammer2_cluster_t *cluster)
137 {
138         return(cluster->focus->bref.type);
139 }
140
141 int
142 hammer2_cluster_modified(hammer2_cluster_t *cluster)
143 {
144         return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
145 }
146
147 /*
148  * Return a bref representative of the cluster.  Any data offset is removed
149  * (since it would only be applicable to a particular chain in the cluster).
150  *
151  * However, the radix portion of data_off is used for many purposes and will
152  * be retained.
153  */
154 void
155 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
156 {
157         *bref = cluster->focus->bref;
158         bref->data_off &= HAMMER2_OFF_MASK_RADIX;
159 }
160
161 /*
162  * Return non-zero if the chain representing an inode has been flagged
163  * as having been unlinked.  Allows the vnode reclaim to avoid loading
164  * the inode data from disk e.g. when unmount or recycling old, clean
165  * vnodes.
166  */
167 int
168 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster)
169 {
170         hammer2_chain_t *chain;
171         int flags;
172         int i;
173
174         flags = 0;
175         for (i = 0; i < cluster->nchains; ++i) {
176                 chain = cluster->array[i].chain;
177                 if (chain)
178                         flags |= chain->flags;
179         }
180         return (flags & HAMMER2_CHAIN_UNLINKED);
181 }
182
183 void
184 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
185 {
186         hammer2_chain_t *chain;
187         int i;
188
189         for (i = 0; i < cluster->nchains; ++i) {
190                 chain = cluster->array[i].chain;
191                 if (chain)
192                         atomic_set_int(&chain->flags, flags);
193         }
194 }
195
196 void
197 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
198 {
199         hammer2_chain_t *chain;
200         int i;
201
202         for (i = 0; i < cluster->nchains; ++i) {
203                 chain = cluster->array[i].chain;
204                 if (chain)
205                         atomic_clear_int(&chain->flags, flags);
206         }
207 }
208
209 void
210 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
211 {
212         hammer2_chain_t *chain;
213         int i;
214
215         for (i = 0; i < cluster->nchains; ++i) {
216                 chain = cluster->array[i].chain;
217                 if (chain)
218                         hammer2_chain_setflush(trans, chain);
219         }
220 }
221
222 void
223 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
224                                 hammer2_cluster_t *cluster,
225                                 int check_algo)
226 {
227         hammer2_chain_t *chain;
228         int i;
229
230         for (i = 0; i < cluster->nchains; ++i) {
231                 chain = cluster->array[i].chain;
232                 if (chain) {
233                         KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
234                         chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
235                         chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
236                 }
237         }
238 }
239
240 /*
241  * Create a cluster with one ref from the specified chain.  The chain
242  * is not further referenced.  The caller typically supplies a locked
243  * chain and transfers ownership to the cluster.
244  *
245  * The returned cluster will be focused on the chain (strictly speaking,
246  * the focus should be NULL if the chain is not locked but we do not check
247  * for this condition).
248  */
249 hammer2_cluster_t *
250 hammer2_cluster_from_chain(hammer2_chain_t *chain)
251 {
252         hammer2_cluster_t *cluster;
253
254         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
255         cluster->array[0].chain = chain;
256         cluster->nchains = 1;
257         cluster->focus = chain;
258         cluster->pmp = chain->pmp;
259         cluster->refs = 1;
260         cluster->flags = HAMMER2_CLUSTER_LOCKED;
261
262         return cluster;
263 }
264
265 #if 0
266 /*
267  * Allocates a cluster and its underlying chain structures.  The underlying
268  * chains will be locked.  The cluster and underlying chains will have one
269  * ref and will be focused on the first chain.
270  *
271  * XXX focus on first chain.
272  */
273 hammer2_cluster_t *
274 hammer2_cluster_alloc(hammer2_pfs_t *pmp,
275                       hammer2_trans_t *trans, hammer2_blockref_t *bref)
276 {
277         hammer2_cluster_t *cluster;
278         hammer2_cluster_t *rcluster;
279         hammer2_chain_t *chain;
280         hammer2_chain_t *rchain;
281 #if 0
282         u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
283 #endif
284         int i;
285
286         KKASSERT(pmp != NULL);
287
288         /*
289          * Construct the appropriate system structure.
290          */
291         switch(bref->type) {
292         case HAMMER2_BREF_TYPE_INODE:
293         case HAMMER2_BREF_TYPE_INDIRECT:
294         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
295         case HAMMER2_BREF_TYPE_DATA:
296         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
297                 /*
298                  * Chain's are really only associated with the hmp but we
299                  * maintain a pmp association for per-mount memory tracking
300                  * purposes.  The pmp can be NULL.
301                  */
302                 break;
303         case HAMMER2_BREF_TYPE_VOLUME:
304         case HAMMER2_BREF_TYPE_FREEMAP:
305                 chain = NULL;
306                 panic("hammer2_cluster_alloc volume type illegal for op");
307         default:
308                 chain = NULL;
309                 panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
310                       bref->type);
311         }
312
313         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
314         cluster->refs = 1;
315         cluster->flags = HAMMER2_CLUSTER_LOCKED;
316
317         rcluster = &pmp->iroot->cluster;
318         for (i = 0; i < rcluster->nchains; ++i) {
319                 rchain = rcluster->array[i].chain;
320                 chain = hammer2_chain_alloc(rchain->hmp, pmp, trans, bref);
321 #if 0
322                 chain->hmp = rchain->hmp;
323                 chain->bref = *bref;
324                 chain->bytes = bytes;
325                 chain->refs = 1;
326                 chain->flags |= HAMMER2_CHAIN_ALLOCATED;
327 #endif
328
329                 /*
330                  * NOTE: When loading a chain from backing store or creating a
331                  *       snapshot, trans will be NULL and the caller is
332                  *       responsible for setting these fields.
333                  */
334                 cluster->array[i].chain = chain;
335         }
336         cluster->nchains = i;
337         cluster->pmp = pmp;
338         cluster->focus = cluster->array[0].chain;
339
340         return (cluster);
341 }
342 #endif
343
344 /*
345  * Add a reference to a cluster.
346  *
347  * We must also ref the underlying chains in order to allow ref/unlock
348  * sequences to later re-lock.
349  */
350 void
351 hammer2_cluster_ref(hammer2_cluster_t *cluster)
352 {
353         hammer2_chain_t *chain;
354         int i;
355
356         atomic_add_int(&cluster->refs, 1);
357         for (i = 0; i < cluster->nchains; ++i) {
358                 chain = cluster->array[i].chain;
359                 if (chain)
360                         hammer2_chain_ref(chain);
361         }
362 }
363
364 /*
365  * Drop the caller's reference to the cluster.  When the ref count drops to
366  * zero this function frees the cluster and drops all underlying chains.
367  *
368  * In-progress read I/Os are typically detached from the cluster once the
369  * first one returns (the remaining stay attached to the DIOs but are then
370  * ignored and drop naturally).
371  */
372 void
373 hammer2_cluster_drop(hammer2_cluster_t *cluster)
374 {
375         hammer2_chain_t *chain;
376         int i;
377
378         KKASSERT(cluster->refs > 0);
379         for (i = 0; i < cluster->nchains; ++i) {
380                 chain = cluster->array[i].chain;
381                 if (chain) {
382                         hammer2_chain_drop(chain);
383                         if (cluster->refs == 1)
384                                 cluster->array[i].chain = NULL;
385                 }
386         }
387         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
388                 cluster->focus = NULL;          /* safety */
389                 kfree(cluster, M_HAMMER2);
390                 /* cluster is invalid */
391         }
392 }
393
394 void
395 hammer2_cluster_wait(hammer2_cluster_t *cluster)
396 {
397         tsleep(cluster->focus, 0, "h2clcw", 1);
398 }
399
400 /*
401  * Lock and ref a cluster.  This adds a ref to the cluster and its chains
402  * and then locks them.
403  *
404  * The act of locking a cluster sets its focus if not already set.
405  *
406  * The chains making up the cluster may be narrowed down based on quorum
407  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
408  * narrowed down to a single chain as long as the entire subtopology is known
409  * to be intact.  So, for example, we can narrow a read-only op to a single
410  * fast SLAVE but if we focus a CACHE chain we must still retain at least
411  * a SLAVE to ensure that the subtopology can be accessed.
412  *
413  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
414  * to be maintained once the topology is validated as-of the top level of
415  * the operation.
416  */
417 int
418 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
419 {
420         hammer2_chain_t *chain;
421         hammer2_chain_t *tmp;
422         int i;
423         int error;
424
425         /* cannot be on inode-embedded cluster template, must be on copy */
426         KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
427         if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
428                 kprintf("hammer2_cluster_lock: cluster %p already locked!\n",
429                         cluster);
430         }
431         atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
432
433         if ((how & HAMMER2_RESOLVE_NOREF) == 0)
434                 atomic_add_int(&cluster->refs, 1);
435
436         error = 0;
437
438         for (i = 0; i < cluster->nchains; ++i) {
439                 chain = cluster->array[i].chain;
440                 if (chain) {
441                         error = hammer2_chain_lock(chain, how);
442                         if (error) {
443                                 while (--i >= 0) {
444                                         tmp = cluster->array[i].chain;
445                                         hammer2_chain_unlock(tmp);
446                                 }
447                                 atomic_add_int(&cluster->refs, -1);
448                                 break;
449                         }
450                         if (cluster->focus == NULL)
451                                 cluster->focus = chain;
452                 }
453         }
454         return error;
455 }
456
457 #if 0
458 /*
459  * Replace the contents of dst with src, adding a reference to src's chains
460  * but not adding any additional locks.
461  *
462  * dst is assumed to already have a ref and any chains present in dst are
463  * assumed to be locked and will be unlocked.
464  *
465  * If the chains in src are locked, only one of (src) or (dst) should be
466  * considered locked by the caller after return, not both.
467  */
468 void
469 hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src)
470 {
471         hammer2_chain_t *chain;
472         hammer2_chain_t *tmp;
473         int i;
474
475         KKASSERT(dst->refs == 1);
476         dst->focus = NULL;
477
478         for (i = 0; i < src->nchains; ++i) {
479                 chain = src->array[i].chain;
480                 if (chain) {
481                         hammer2_chain_ref(chain);
482                         if (i < dst->nchains &&
483                             (tmp = dst->array[i].chain) != NULL) {
484                                 hammer2_chain_unlock(tmp);
485                         }
486                         dst->array[i].chain = chain;
487                         if (dst->focus == NULL)
488                                 dst->focus = chain;
489                 }
490         }
491         while (i < dst->nchains) {
492                 chain = dst->array[i].chain;
493                 if (chain) {
494                         hammer2_chain_unlock(chain);
495                         dst->array[i].chain = NULL;
496                 }
497                 ++i;
498         }
499         dst->nchains = src->nchains;
500 }
501
502 /*
503  * Replace the contents of the locked destination with the contents of the
504  * locked source.  The destination must have one ref.
505  *
506  * Returns with the destination still with one ref and the copied chains
507  * with an additional lock (representing their state on the destination).
508  * The original chains associated with the destination are unlocked.
509  *
510  * From the point of view of the caller, both src and dst are locked on
511  * call and remain locked on return.
512  *
513  * XXX adjust flag state
514  */
515 void
516 hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
517 {
518         hammer2_chain_t *chain;
519         hammer2_chain_t *tmp;
520         int i;
521
522         KKASSERT(dst->refs == 1);
523
524         dst->focus = NULL;
525         for (i = 0; i < src->nchains; ++i) {
526                 chain = src->array[i].chain;
527                 if (chain) {
528                         hammer2_chain_lock(chain, 0);
529                         if (i < dst->nchains &&
530                             (tmp = dst->array[i].chain) != NULL) {
531                                 hammer2_chain_unlock(tmp);
532                         }
533                         dst->array[i].chain = chain;
534                 }
535         }
536         while (i < dst->nchains) {
537                 chain = dst->array[i].chain;
538                 if (chain) {
539                         hammer2_chain_unlock(chain);
540                         dst->array[i].chain = NULL;
541                 }
542                 ++i;
543         }
544         dst->nchains = src->nchains;
545         dst->flags = src->flags;
546         dst->focus = src->focus;
547 }
548 #endif
549
550 /*
551  * Copy a cluster, returned a ref'd cluster.  All underlying chains
552  * are also ref'd, but not locked.  The cluster focus is not set because
553  * the cluster is not yet locked (and the originating cluster does not
554  * have to be locked either).
555  */
556 hammer2_cluster_t *
557 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
558 {
559         hammer2_pfs_t *pmp = ocluster->pmp;
560         hammer2_cluster_t *ncluster;
561         hammer2_chain_t *chain;
562         int i;
563
564         ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
565         ncluster->pmp = pmp;
566         ncluster->nchains = ocluster->nchains;
567         ncluster->refs = 1;
568         ncluster->flags = 0;    /* cluster not locked */
569
570         for (i = 0; i < ocluster->nchains; ++i) {
571                 chain = ocluster->array[i].chain;
572                 ncluster->array[i].chain = chain;
573                 if (chain)
574                         hammer2_chain_ref(chain);
575         }
576         return (ncluster);
577 }
578
579 /*
580  * Unlock and deref a cluster.  The cluster is destroyed if this is the
581  * last ref.
582  */
583 void
584 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
585 {
586         hammer2_chain_t *chain;
587         int i;
588
589         if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
590                 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
591                         cluster);
592         }
593         /* KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); */
594         KKASSERT(cluster->refs > 0);
595         atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
596
597         for (i = 0; i < cluster->nchains; ++i) {
598                 chain = cluster->array[i].chain;
599                 if (chain) {
600                         hammer2_chain_unlock(chain);
601                         if (cluster->refs == 1)
602                                 cluster->array[i].chain = NULL; /* safety */
603                 }
604         }
605         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
606                 cluster->focus = NULL;
607                 kfree(cluster, M_HAMMER2);
608                 /* cluster = NULL; safety */
609         }
610 }
611
612 /*
613  * Resize the cluster's physical storage allocation in-place.  This may
614  * replace the cluster's chains.
615  */
616 void
617 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
618                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
619                        int nradix, int flags)
620 {
621         hammer2_chain_t *chain;
622         int i;
623
624         KKASSERT(cparent->pmp == cluster->pmp);         /* can be NULL */
625         KKASSERT(cparent->nchains == cluster->nchains);
626
627         cluster->focus = NULL;
628         for (i = 0; i < cluster->nchains; ++i) {
629                 chain = cluster->array[i].chain;
630                 if (chain) {
631                         KKASSERT(cparent->array[i].chain);
632                         hammer2_chain_resize(trans, ip,
633                                              cparent->array[i].chain, chain,
634                                              nradix, flags);
635                         if (cluster->focus == NULL)
636                                 cluster->focus = chain;
637                 }
638         }
639 }
640
641 /*
642  * Set an inode's cluster modified, marking the related chains RW and
643  * duplicating them if necessary.
644  *
645  * The passed-in chain is a localized copy of the chain previously acquired
646  * when the inode was locked (and possilby replaced in the mean time), and
647  * must also be updated.  In fact, we update it first and then synchronize
648  * the inode's cluster cache.
649  */
650 hammer2_inode_data_t *
651 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
652                           hammer2_cluster_t *cluster, int flags)
653 {
654         atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
655         hammer2_cluster_modify(trans, cluster, flags);
656
657         hammer2_inode_repoint(ip, NULL, cluster);
658         if (ip->vp)
659                 vsetisdirty(ip->vp);
660         return (&hammer2_cluster_wdata(cluster)->ipdata);
661 }
662
663 /*
664  * Adjust the cluster's chains to allow modification and adjust the
665  * focus.  Data will be accessible on return.
666  */
667 void
668 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
669                        int flags)
670 {
671         hammer2_chain_t *chain;
672         int i;
673
674         cluster->focus = NULL;
675         for (i = 0; i < cluster->nchains; ++i) {
676                 chain = cluster->array[i].chain;
677                 if (chain) {
678                         hammer2_chain_modify(trans, chain, flags);
679                         if (cluster->focus == NULL)
680                                 cluster->focus = chain;
681                 }
682         }
683 }
684
685 /*
686  * Synchronize modifications from the focus to other chains in a cluster.
687  * Convenient because nominal API users can just modify the contents of the
688  * focus (at least for non-blockref data).
689  *
690  * Nominal front-end operations only edit non-block-table data in a single
691  * chain.  This code copies such modifications to the other chains in the
692  * cluster.  Blocktable modifications are handled on a chain-by-chain basis
693  * by both the frontend and the backend and will explode in fireworks if
694  * blindly copied.
695  */
696 void
697 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
698 {
699         hammer2_chain_t *focus;
700         hammer2_chain_t *scan;
701         const hammer2_inode_data_t *ripdata;
702         hammer2_inode_data_t *wipdata;
703         int i;
704
705         focus = cluster->focus;
706         KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
707
708         for (i = 0; i < cluster->nchains; ++i) {
709                 scan = cluster->array[i].chain;
710                 if (scan == NULL || scan == focus)
711                         continue;
712                 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
713                 KKASSERT(focus->bytes == scan->bytes &&
714                          focus->bref.type == scan->bref.type);
715                 switch(focus->bref.type) {
716                 case HAMMER2_BREF_TYPE_INODE:
717                         ripdata = &focus->data->ipdata;
718                         wipdata = &scan->data->ipdata;
719                         if ((ripdata->op_flags &
720                             HAMMER2_OPFLAG_DIRECTDATA) == 0) {
721                                 bcopy(ripdata, wipdata,
722                                       offsetof(hammer2_inode_data_t, u));
723                                 break;
724                         }
725                         /* fall through to full copy */
726                 case HAMMER2_BREF_TYPE_DATA:
727                         bcopy(focus->data, scan->data, focus->bytes);
728                         break;
729                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
730                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
731                 case HAMMER2_BREF_TYPE_FREEMAP:
732                 case HAMMER2_BREF_TYPE_VOLUME:
733                         panic("hammer2_cluster_modsync: illegal node type");
734                         /* NOT REACHED */
735                         break;
736                 default:
737                         panic("hammer2_cluster_modsync: unknown node type");
738                         break;
739                 }
740         }
741 }
742
743 /*
744  * Lookup initialization/completion API
745  */
746 hammer2_cluster_t *
747 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
748 {
749         hammer2_cluster_t *cluster;
750         int i;
751
752         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
753         cluster->pmp = cparent->pmp;                    /* can be NULL */
754         cluster->flags = 0;     /* cluster not locked (yet) */
755         /* cluster->focus = NULL; already null */
756
757         for (i = 0; i < cparent->nchains; ++i) {
758                 cluster->array[i].chain = cparent->array[i].chain;
759                 if (cluster->focus == NULL)
760                         cluster->focus = cluster->array[i].chain;
761         }
762         cluster->nchains = cparent->nchains;
763
764         /*
765          * Independently lock (this will also give cluster 1 ref)
766          */
767         if (flags & HAMMER2_LOOKUP_SHARED) {
768                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
769                                               HAMMER2_RESOLVE_SHARED);
770         } else {
771                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
772         }
773         return (cluster);
774 }
775
776 void
777 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
778 {
779         if (cparent)
780                 hammer2_cluster_unlock(cparent);
781 }
782
783 /*
784  * Locate first match or overlap under parent, return a new cluster
785  */
786 hammer2_cluster_t *
787 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
788                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
789 {
790         hammer2_pfs_t *pmp;
791         hammer2_cluster_t *cluster;
792         hammer2_chain_t *chain;
793         hammer2_key_t key_accum;
794         hammer2_key_t key_next;
795         hammer2_key_t bref_key;
796         int null_count;
797         int bref_keybits;
798         int i;
799         uint8_t bref_type;
800         u_int bytes;
801
802         pmp = cparent->pmp;                             /* can be NULL */
803         key_accum = *key_nextp;
804         null_count = 0;
805         bref_type = 0;
806         bref_key = 0;
807         bref_keybits = 0;
808         bytes = 0;
809
810         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
811         cluster->pmp = pmp;                             /* can be NULL */
812         cluster->refs = 1;
813         /* cluster->focus = NULL; already null */
814         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
815                 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
816
817         for (i = 0; i < cparent->nchains; ++i) {
818                 key_next = *key_nextp;
819                 if (cparent->array[i].chain == NULL) {
820                         ++null_count;
821                         continue;
822                 }
823                 chain = hammer2_chain_lookup(&cparent->array[i].chain,
824                                              &key_next,
825                                              key_beg, key_end,
826                                              &cparent->array[i].cache_index,
827                                              flags);
828                 cluster->array[i].chain = chain;
829                 if (chain == NULL) {
830                         ++null_count;
831                 } else {
832                         int ddflag = (chain->bref.type ==
833                                       HAMMER2_BREF_TYPE_INODE);
834
835                         /*
836                          * Set default focus.
837                          */
838                         if (cluster->focus == NULL) {
839                                 bref_type = chain->bref.type;
840                                 bref_key = chain->bref.key;
841                                 bref_keybits = chain->bref.keybits;
842                                 bytes = chain->bytes;
843                                 cluster->ddflag = ddflag;
844                                 cluster->focus = chain;
845                         }
846
847                         /*
848                          * Override default focus to follow the parent.
849                          */
850                         if (cparent->focus == cparent->array[i].chain)
851                                 cluster->focus = chain;
852
853                         KKASSERT(bref_type == chain->bref.type);
854                         KKASSERT(bref_key == chain->bref.key);
855                         KKASSERT(bref_keybits == chain->bref.keybits);
856                         KKASSERT(bytes == chain->bytes);
857                         KKASSERT(cluster->ddflag == ddflag);
858                 }
859                 if (key_accum > key_next)
860                         key_accum = key_next;
861         }
862         *key_nextp = key_accum;
863         cluster->nchains = i;
864
865         if (null_count == i) {
866                 hammer2_cluster_drop(cluster);
867                 cluster = NULL;
868         }
869
870         return (cluster);
871 }
872
/*
 * Locate next match or overlap under parent, replace cluster
 *
 * Iterates each element of the passed-in cluster to its next chain under
 * the corresponding parent element, accumulating the lowest key_next
 * across all elements into *key_nextp.  Elements whose parent slot has
 * gone NULL are released and counted as NULL.  Returns the (replaced)
 * cluster, or NULL (after dropping it) if every element ended up NULL.
 */
hammer2_cluster_t *
hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	hammer2_key_t bref_key;
	int null_count;
	int bref_keybits;
	int i;
	uint8_t bref_type;
	u_int bytes;

	/* accumulate the minimum key_next over all per-chain iterations */
	key_accum = *key_nextp;
	null_count = 0;
	cluster->focus = NULL;
	cparent->focus = NULL;

	/* sanity baseline captured from the first live chain found below */
	bref_type = 0;
	bref_key = 0;
	bref_keybits = 0;
	bytes = 0;
	cluster->ddflag = 0;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		chain = cluster->array[i].chain;
		if (chain == NULL) {
			++null_count;
			continue;
		}
		if (cparent->array[i].chain == NULL) {
			/*
			 * Parent slot disappeared; release our hold on the
			 * child.  NOLOCK iterations hold only a ref,
			 * otherwise a lock.
			 */
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}
		/* hammer2_chain_next() consumes the old chain's lock/ref */
		chain = hammer2_chain_next(&cparent->array[i].chain, chain,
					   &key_next, key_beg, key_end,
					   &cparent->array[i].cache_index,
					   flags);
		cluster->array[i].chain = chain;
		if (chain == NULL) {
			++null_count;
		} else {
			int ddflag = (chain->bref.type ==
				      HAMMER2_BREF_TYPE_INODE);
			/*
			 * First live chain establishes the default focus
			 * and the baseline for cross-node consistency
			 * assertions below.
			 */
			if (cluster->focus == NULL) {
				bref_type = chain->bref.type;
				bref_key = chain->bref.key;
				bref_keybits = chain->bref.keybits;
				bytes = chain->bytes;
				cluster->ddflag = ddflag;
				cluster->focus = chain;
			}

			/*
			 * Override default focus to follow the parent.
			 */
			if (cparent->focus == cparent->array[i].chain)
				cluster->focus = chain;

			/* all nodes must agree on the element's identity */
			KKASSERT(bref_type == chain->bref.type);
			KKASSERT(bref_key == chain->bref.key);
			KKASSERT(bref_keybits == chain->bref.keybits);
			KKASSERT(bytes == chain->bytes);
			KKASSERT(cluster->ddflag == ddflag);
		}
		if (key_accum > key_next)
			key_accum = key_next;
	}
	*key_nextp = key_accum;
	cluster->nchains = i;

	/* all elements NULL: the iteration is exhausted, drop the cluster */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}
959
#if 0
/*
 * XXX initial NULL cluster needs reworking (pass **clusterp ?)
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_chain = (parent, NULL) and
 * next_chain = (parent, chain).
 *
 * The passed-in parent must be locked and its data resolved.  The returned
 * chain will be locked.  Pass chain == NULL to acquire the first sub-chain
 * under parent and then iterate with the passed-in chain (which this
 * function will unlock).
 *
 * NOTE: This function is currently compiled out (#if 0) pending the
 *	 rework described above.
 */
hammer2_cluster_t *
hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     int flags)
{
	hammer2_chain_t *chain;
	int null_count;
	int i;

	null_count = 0;

	for (i = 0; i < cparent->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain == NULL) {
			++null_count;
			continue;
		}
		if (cparent->array[i].chain == NULL) {
			/* parent slot gone; release our hold on the child */
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}

		chain = hammer2_chain_scan(cparent->array[i].chain, chain,
					   &cparent->array[i].cache_index,
					   flags);
		cluster->array[i].chain = chain;
		if (chain == NULL)
			++null_count;
	}

	/* all elements NULL: scan exhausted, drop the cluster */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}

#endif
1014
1015 /*
1016  * Create a new cluster using the specified key
1017  */
1018 int
1019 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1020                      hammer2_cluster_t **clusterp,
1021                      hammer2_key_t key, int keybits,
1022                      int type, size_t bytes, int flags)
1023 {
1024         hammer2_cluster_t *cluster;
1025         hammer2_pfs_t *pmp;
1026         int error;
1027         int i;
1028
1029         pmp = trans->pmp;                               /* can be NULL */
1030
1031         if ((cluster = *clusterp) == NULL) {
1032                 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1033                                   M_WAITOK | M_ZERO);
1034                 cluster->pmp = pmp;                     /* can be NULL */
1035                 cluster->refs = 1;
1036                 cluster->flags = HAMMER2_CLUSTER_LOCKED;
1037         }
1038         cluster->focus = NULL;
1039
1040         /*
1041          * NOTE: cluster->array[] entries can initially be NULL.  If
1042          *       *clusterp is supplied, skip NULL entries, otherwise
1043          *       create new chains.
1044          */
1045         for (i = 0; i < cparent->nchains; ++i) {
1046                 if (*clusterp && cluster->array[i].chain == NULL) {
1047                         continue;
1048                 }
1049                 error = hammer2_chain_create(trans, &cparent->array[i].chain,
1050                                              &cluster->array[i].chain, pmp,
1051                                              key, keybits,
1052                                              type, bytes, flags);
1053                 KKASSERT(error == 0);
1054                 if (cluster->focus == NULL)
1055                         cluster->focus = cluster->array[i].chain;
1056                 if (cparent->focus == cparent->array[i].chain)
1057                         cluster->focus = cluster->array[i].chain;
1058         }
1059         cluster->nchains = i;
1060         *clusterp = cluster;
1061
1062         return error;
1063 }
1064
1065 /*
1066  * Rename a cluster to a new parent.
1067  *
1068  * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields
1069  *          are used from a passed-in non-NULL bref pointer.  All other fields
1070  *          are extracted from the original chain for each chain in the
1071  *          iteration.
1072  */
1073 void
1074 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
1075                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1076                        int flags)
1077 {
1078         hammer2_chain_t *chain;
1079         hammer2_blockref_t xbref;
1080         int i;
1081
1082         cluster->focus = NULL;
1083         cparent->focus = NULL;
1084
1085         for (i = 0; i < cluster->nchains; ++i) {
1086                 chain = cluster->array[i].chain;
1087                 if (chain) {
1088                         if (bref) {
1089                                 xbref = chain->bref;
1090                                 xbref.key = bref->key;
1091                                 xbref.keybits = bref->keybits;
1092                                 hammer2_chain_rename(trans, &xbref,
1093                                                      &cparent->array[i].chain,
1094                                                      chain, flags);
1095                         } else {
1096                                 hammer2_chain_rename(trans, NULL,
1097                                                      &cparent->array[i].chain,
1098                                                      chain, flags);
1099                         }
1100                         cluster->array[i].chain = chain;
1101                         if (cluster->focus == NULL)
1102                                 cluster->focus = chain;
1103                         if (cparent->focus == NULL)
1104                                 cparent->focus = cparent->array[i].chain;
1105                 } else {
1106                         if (cparent->focus == NULL)
1107                                 cparent->focus = cparent->array[i].chain;
1108                 }
1109         }
1110 }
1111
1112 /*
1113  * Mark a cluster deleted
1114  */
1115 void
1116 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1117                        hammer2_cluster_t *cluster, int flags)
1118 {
1119         hammer2_chain_t *chain;
1120         hammer2_chain_t *parent;
1121         int i;
1122
1123         if (cparent == NULL) {
1124                 kprintf("cparent is NULL\n");
1125                 return;
1126         }
1127
1128         for (i = 0; i < cluster->nchains; ++i) {
1129                 parent = (i < cparent->nchains) ?
1130                          cparent->array[i].chain : NULL;
1131                 chain = cluster->array[i].chain;
1132                 if (chain == NULL)
1133                         continue;
1134                 if (chain->parent != parent) {
1135                         kprintf("hammer2_cluster_delete: parent "
1136                                 "mismatch chain=%p parent=%p against=%p\n",
1137                                 chain, chain->parent, parent);
1138                 } else {
1139                         hammer2_chain_delete(trans, parent, chain, flags);
1140                 }
1141         }
1142 }
1143
/*
 * Create a snapshot of the specified {parent, ochain} with the specified
 * label.  The originating hammer2_inode must be exclusively locked for
 * safety.
 *
 * The ioctl code has already synced the filesystem.
 *
 * Returns 0 on success or the error reported by hammer2_inode_create().
 */
int
hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
		       hammer2_ioc_pfs_t *pfs)
{
	hammer2_dev_t *hmp;
	hammer2_cluster_t *ncluster;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	hammer2_chain_t *nchain;
	hammer2_inode_t *nip;
	size_t name_len;
	hammer2_key_t lhc;
	struct vattr vat;
#if 0
	uuid_t opfs_clid;
#endif
	int error;
	int i;

	kprintf("snapshot %s\n", pfs->name);

	/* directory hash for the snapshot's name under the super-root */
	name_len = strlen(pfs->name);
	lhc = hammer2_dirhash(pfs->name, name_len);

	/*
	 * Get the clid
	 */
	ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
#if 0
	opfs_clid = ripdata->pfs_clid;
#endif
	hmp = ocluster->focus->hmp;	/* XXX find synchronized local disk */

	/*
	 * Create the snapshot directory under the super-root
	 *
	 * Set PFS type, generate a unique filesystem id, and generate
	 * a cluster id.  Use the same clid when snapshotting a PFS root,
	 * which theoretically allows the snapshot to be used as part of
	 * the same cluster (perhaps as a cache).
	 *
	 * Copy the (flushed) blockref array.  Theoretically we could use
	 * chain_duplicate() but it becomes difficult to disentangle
	 * the shared core so for now just brute-force it.
	 */
	VATTR_NULL(&vat);
	vat.va_type = VDIR;
	vat.va_mode = 0755;
	ncluster = NULL;
	/* on failure nip is NULL and error carries the reason */
	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
				   proc0.p_ucred, pfs->name, name_len,
				   &ncluster,
				   HAMMER2_INSERT_PFSROOT, &error);

	if (nip) {
		/* stamp the new inode as a snapshot PFS root */
		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
		wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
		wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT;
		kern_uuidgen(&wipdata->pfs_fsid, 1);

		/*
		 * Give the snapshot its own private cluster.  As a snapshot
		 * no further synchronization with the original cluster will
		 * be done.
		 */
#if 0
		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
			wipdata->pfs_clid = opfs_clid;
		else
			kern_uuidgen(&wipdata->pfs_clid, 1);
#endif
		kern_uuidgen(&wipdata->pfs_clid, 1);

		/* mark every element's blockref as a PFS root */
		for (i = 0; i < ncluster->nchains; ++i) {
			nchain = ncluster->array[i].chain;
			if (nchain)
				nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
		}
#if 0
		/* XXX can't set this unless we do an explicit flush, which
		   we also need a pmp assigned to do, else the flush code
		   won't flush ncluster because it thinks it is crossing a
		   flush boundary */
		hammer2_cluster_set_chainflags(ncluster,
					       HAMMER2_CHAIN_PFSBOUNDARY);
#endif

		/* XXX hack blockset copy */
		/* XXX doesn't work with real cluster */
		KKASSERT(ocluster->nchains == 1);
		wipdata->u.blockset = ripdata->u.blockset;
		/* sync the modified inode data and flush every element */
		hammer2_cluster_modsync(ncluster);
		for (i = 0; i < ncluster->nchains; ++i) {
			nchain = ncluster->array[i].chain;
			if (nchain)
				hammer2_flush(trans, nchain);
		}
		hammer2_inode_unlock_ex(nip, ncluster);
	}
	return (error);
}
1252
/*
 * Return locked parent cluster given a locked child.  The child remains
 * locked on return.  The new parent's focus follows the child's focus
 * and the parent is always resolved.
 */
hammer2_cluster_t *
hammer2_cluster_parent(hammer2_cluster_t *cluster)
{
	hammer2_cluster_t *cparent;
	int i;

	/* start from a copy of the child; elements replaced in-place below */
	cparent = hammer2_cluster_copy(cluster);

	for (i = 0; i < cparent->nchains; ++i) {
		hammer2_chain_t *chain;
		hammer2_chain_t *rchain;

		/*
		 * Calculate parent for each element.  Old chain has an extra
		 * ref for cparent but the lock remains with cluster.
		 */
		chain = cparent->array[i].chain;
		if (chain == NULL)
			continue;
		while ((rchain = chain->parent) != NULL) {
			/*
			 * Lock-order dance: ref the parent, drop the child
			 * lock, lock parent then child (parent-before-child
			 * order), then re-check that the linkage did not
			 * race out from under us while the child was
			 * unlocked.  Retry if it did.
			 */
			hammer2_chain_ref(rchain);
			hammer2_chain_unlock(chain);
			hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
			hammer2_chain_drop(rchain);
			if (chain->parent == rchain)
				break;
			hammer2_chain_unlock(rchain);
		}
		/* parent focus follows the child's focus element */
		if (cluster->focus == chain)
			cparent->focus = rchain;
		/* rchain may be NULL here if the chain had no parent */
		cparent->array[i].chain = rchain;
		hammer2_chain_drop(chain);
	}
	cparent->flags |= HAMMER2_CLUSTER_LOCKED;

	return cparent;
}
1296
1297 /************************************************************************
1298  *                              CLUSTER I/O                             *
1299  ************************************************************************
1300  *
1301  *
1302  * WARNING! blockref[] array data is not universal.  These functions should
1303  *          only be used to access universal data.
1304  *
1305  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1306  *          complete if necessary.  The I/O's should have already been
1307  *          initiated by the cluster_lock/chain_lock operation.
1308  *
1309  *          The cluster must already be in a modified state before wdata
1310  *          is called.  The data will already be available for this case.
1311  */
1312 const hammer2_media_data_t *
1313 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1314 {
1315         return(cluster->focus->data);
1316 }
1317
1318 hammer2_media_data_t *
1319 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1320 {
1321         KKASSERT(hammer2_cluster_modified(cluster));
1322         return(cluster->focus->data);
1323 }
1324
1325 /*
1326  * Load cluster data asynchronously with callback.
1327  *
1328  * The callback is made for the first validated data found, or NULL
1329  * if no valid data is available.
1330  *
1331  * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1332  *       in the inode with an exclusive lock held), the chain structure may be
1333  *       shared.
1334  */
1335 void
1336 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1337                            void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1338 {
1339         hammer2_chain_t *chain;
1340         hammer2_iocb_t *iocb;
1341         hammer2_dev_t *hmp;
1342         hammer2_blockref_t *bref;
1343         int i;
1344
1345         /*
1346          * Try to find a chain whos data is already resolved.  If none can
1347          * be found, start with the first chain.
1348          */
1349         chain = NULL;
1350         for (i = 0; i < cluster->nchains; ++i) {
1351                 chain = cluster->array[i].chain;
1352                 if (chain && chain->data)
1353                         break;
1354         }
1355         if (i == cluster->nchains) {
1356                 chain = cluster->array[0].chain;
1357                 i = 0;
1358         }
1359
1360         iocb = &cluster->iocb;
1361         iocb->callback = callback;
1362         iocb->dio = NULL;               /* for already-validated case */
1363         iocb->cluster = cluster;
1364         iocb->chain = chain;
1365         iocb->ptr = ptr;
1366         iocb->lbase = (off_t)i;
1367         iocb->flags = 0;
1368         iocb->error = 0;
1369
1370         /*
1371          * Data already validated
1372          */
1373         if (chain->data) {
1374                 callback(iocb);
1375                 return;
1376         }
1377
1378         /*
1379          * We must resolve to a device buffer, either by issuing I/O or
1380          * by creating a zero-fill element.  We do not mark the buffer
1381          * dirty when creating a zero-fill element (the hammer2_chain_modify()
1382          * API must still be used to do that).
1383          *
1384          * The device buffer is variable-sized in powers of 2 down
1385          * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
1386          * chunk always contains buffers of the same size. (XXX)
1387          *
1388          * The minimum physical IO size may be larger than the variable
1389          * block size.
1390          *
1391          * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
1392          *            matches hammer2_devblksize()?  Or does the freemap's
1393          *            pre-zeroing handle the case for us?
1394          */
1395         bref = &chain->bref;
1396         hmp = chain->hmp;
1397
1398 #if 0
1399         /* handled by callback? <- TODO XXX even needed for loads? */
1400         /*
1401          * The getblk() optimization for a 100% overwrite can only be used
1402          * if the physical block size matches the request.
1403          */
1404         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1405             chain->bytes == hammer2_devblksize(chain->bytes)) {
1406                 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1407                 KKASSERT(error == 0);
1408                 iocb->dio = dio;
1409                 callback(iocb);
1410                 return;
1411         }
1412 #endif
1413
1414         /*
1415          * Otherwise issue a read
1416          */
1417         hammer2_adjreadcounter(&chain->bref, chain->bytes);
1418         hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
1419 }
1420
1421 /************************************************************************
1422  *                          NODE FAILURES                               *
1423  ************************************************************************
1424  *
1425  * A node failure can occur for numerous reasons.
1426  *
1427  *      - A read I/O may fail
1428  *      - A write I/O may fail
1429  *      - An unexpected chain might be found (or be missing)
1430  *      - A node might disconnect temporarily and reconnect later
1431  *        (for example, a USB stick could get pulled, or a node might
1432  *        be programmatically disconnected).
1433  *      - A node might run out of space during a modifying operation.
1434  *
1435  * When a read failure or an unexpected chain state is found, the chain and
1436  * parent chain at the failure point for the nodes involved (the nodes
1437  * which we determine to be in error) are flagged as failed and removed
1438  * from the cluster.  The node itself is allowed to remain active.  The
1439  * highest common point (usually a parent chain) is queued to the
1440  * resynchronization thread for action.
1441  *
1442  * When a write I/O fails or a node runs out of space, we first adjust
1443  * as if a read failure occurs but we further disable flushes on the
1444  * ENTIRE node.  Concurrent modifying transactions are allowed to complete
1445  * but any new modifying transactions will automatically remove the node
1446  * from consideration in all related cluster structures and not generate
1447  * any new modified chains.  The ROOT chain for the failed node(s) is queued
1448  * to the resynchronization thread for action.
1449  *
1450  * A temporary disconnect is handled as if a write failure occurred.
1451  *
1452  * Any of these failures might or might not stall related high level VNOPS,
1453  * depending on what has failed, what nodes remain, the type of cluster,
1454  * and the operating state of the cluster.
1455  *
1456  *                          FLUSH ON WRITE-DISABLED NODES
1457  *
1458  * A flush on a write-disabled node is not allowed to write anything because
1459  * we cannot safely update the mirror_tid anywhere on the failed node.  The
1460  * synchronization thread uses mirror_tid to calculate incremental resyncs.
1461  * Dirty meta-data related to the failed node is thrown away.
1462  *
1463  * Dirty buffer cache buffers and inodes are only thrown away if they can be
1464  * retired... that is, if the filesystem still has enough nodes to complete
1465  * the operation.
1466  */
1467
1468 /************************************************************************
1469  *                      SYNCHRONIZATION THREAD                          *
1470  ************************************************************************
1471  *
1472  * This thread is responsible for [re]synchronizing the cluster representing
1473  * a PFS.  Any out-of-sync or failed node starts this thread on a
1474  * node-by-node basis when the failure is detected.
1475  *
1476  * Clusters needing resynchronization are queued at the highest point
1477  * where the parent on the failed node is still valid, or a special
1478  * incremental scan from the ROOT is queued if no parent exists.  This
1479  * thread is also responsible for waiting for reconnections of the failed
1480  * node if the cause was due to a disconnect, and waiting for space to be
1481  * freed up if the cause was due to running out of space.
1482  *
1483  * If the cause is due to a node running out of space, this thread will also
1484  * remove older (unlocked) snapshots to make new space, recover space, and
1485  * then start resynchronization.
1486  *
1487  * Each resynchronization pass virtually snapshots the PFS on the good nodes
1488  * and synchronizes using that snapshot against the target node.  This
1489  * ensures a consistent chain topology and also avoids interference between
1490  * the resynchronization thread and frontend operations.
1491  *
1492  * Since these are per-node threads it is possible to resynchronize several
1493  * nodes at once.
1494  */