hammer2 - major simplification 1/many (stabilization)
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
1 /*
2  * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information into a single entity.  It allows direct access to media
37  * data as long as it is not blockref array data.  Meaning, basically,
38  * just inode and file data.
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  */
48 #include <sys/cdefs.h>
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/types.h>
52 #include <sys/lock.h>
53 #include <sys/uuid.h>
54
55 #include "hammer2.h"
56
57 u_int
58 hammer2_cluster_bytes(hammer2_cluster_t *cluster)
59 {
60         return(cluster->focus->bytes);
61 }
62
63 uint8_t
64 hammer2_cluster_type(hammer2_cluster_t *cluster)
65 {
66         return(cluster->focus->bref.type);
67 }
68
69 /*
70  * NOTE: When modifying a cluster object via hammer2_cluster_wdata()
71  *       and hammer2_cluster_modsync(), remember that block array
72  *       entries are not copied to the elements of the cluster.
73  */
74 const hammer2_media_data_t *
75 hammer2_cluster_data(hammer2_cluster_t *cluster)
76 {
77         return(cluster->focus->data);
78 }
79
80 hammer2_media_data_t *
81 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
82 {
83         return(cluster->focus->data);
84 }
85
86 int
87 hammer2_cluster_modified(hammer2_cluster_t *cluster)
88 {
89         return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
90 }
91
92 /*
93  * Return a bref representative of the cluster.  Any data offset is removed
94  * (since it would only be applicable to a particular chain in the cluster).
95  *
96  * However, the radix portion of data_off is used for many purposes and will
97  * be retained.
98  */
99 void
100 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
101 {
102         *bref = cluster->focus->bref;
103         bref->data_off &= HAMMER2_OFF_MASK_RADIX;
104 }
105
106 void
107 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
108 {
109         hammer2_chain_t *chain;
110         int i;
111
112         for (i = 0; i < cluster->nchains; ++i) {
113                 chain = cluster->array[i];
114                 if (chain)
115                         atomic_set_int(&chain->flags, flags);
116         }
117 }
118
119 void
120 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
121 {
122         hammer2_chain_t *chain;
123         int i;
124
125         for (i = 0; i < cluster->nchains; ++i) {
126                 chain = cluster->array[i];
127                 if (chain)
128                         hammer2_chain_setflush(trans, chain);
129         }
130 }
131
132 /*
133  * Create a cluster with one ref from the specified chain.  The chain
134  * is not further referenced.  The caller typically supplies a locked
135  * chain and transfers ownership to the cluster.
136  */
137 hammer2_cluster_t *
138 hammer2_cluster_from_chain(hammer2_chain_t *chain)
139 {
140         hammer2_cluster_t *cluster;
141
142         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
143         cluster->array[0] = chain;
144         cluster->nchains = 1;
145         cluster->focus = chain;
146         cluster->pmp = chain->pmp;
147         cluster->refs = 1;
148
149         return cluster;
150 }
151
152 /*
153  * Allocates a cluster and its underlying chain structures.  The underlying
154  * chains will be locked.  The cluster and underlying chains will have one
155  * ref.
156  */
157 hammer2_cluster_t *
158 hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
159                       hammer2_trans_t *trans, hammer2_blockref_t *bref)
160 {
161         hammer2_cluster_t *cluster;
162         hammer2_cluster_t *rcluster;
163         hammer2_chain_t *chain;
164 #if 0
165         u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
166 #endif
167         int i;
168
169         KKASSERT(pmp != NULL);
170
171         /*
172          * Construct the appropriate system structure.
173          */
174         switch(bref->type) {
175         case HAMMER2_BREF_TYPE_INODE:
176         case HAMMER2_BREF_TYPE_INDIRECT:
177         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
178         case HAMMER2_BREF_TYPE_DATA:
179         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
180                 /*
181                  * Chain's are really only associated with the hmp but we
182                  * maintain a pmp association for per-mount memory tracking
183                  * purposes.  The pmp can be NULL.
184                  */
185                 break;
186         case HAMMER2_BREF_TYPE_VOLUME:
187         case HAMMER2_BREF_TYPE_FREEMAP:
188                 chain = NULL;
189                 panic("hammer2_cluster_alloc volume type illegal for op");
190         default:
191                 chain = NULL;
192                 panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
193                       bref->type);
194         }
195
196         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
197         cluster->refs = 1;
198
199         rcluster = &pmp->iroot->cluster;
200         for (i = 0; i < rcluster->nchains; ++i) {
201                 chain = hammer2_chain_alloc(rcluster->array[i]->hmp,
202                                             pmp, trans, bref);
203 #if 0
204                 chain->hmp = rcluster->array[i]->hmp;
205                 chain->bref = *bref;
206                 chain->bytes = bytes;
207                 chain->refs = 1;
208                 chain->flags = HAMMER2_CHAIN_ALLOCATED;
209 #endif
210
211                 /*
212                  * NOTE: When loading a chain from backing store or creating a
213                  *       snapshot, trans will be NULL and the caller is
214                  *       responsible for setting these fields.
215                  */
216                 cluster->array[i] = chain;
217         }
218         cluster->nchains = i;
219         cluster->pmp = pmp;
220         cluster->focus = cluster->array[0];
221
222         return (cluster);
223 }
224
225 /*
226  * Add a reference to a cluster.
227  *
228  * We must also ref the underlying chains in order to allow ref/unlock
229  * sequences to later re-lock.
230  */
231 void
232 hammer2_cluster_ref(hammer2_cluster_t *cluster)
233 {
234         hammer2_chain_t *chain;
235         int i;
236
237         atomic_add_int(&cluster->refs, 1);
238         for (i = 0; i < cluster->nchains; ++i) {
239                 chain = cluster->array[i];
240                 if (chain)
241                         hammer2_chain_ref(chain);
242         }
243 }
244
245 /*
246  * Drop the caller's reference to the cluster.  When the ref count drops to
247  * zero this function frees the cluster and drops all underlying chains.
248  */
249 void
250 hammer2_cluster_drop(hammer2_cluster_t *cluster)
251 {
252         hammer2_chain_t *chain;
253         int i;
254
255         KKASSERT(cluster->refs > 0);
256         for (i = 0; i < cluster->nchains; ++i) {
257                 chain = cluster->array[i];
258                 if (chain) {
259                         hammer2_chain_drop(chain);
260                         if (cluster->refs == 1)
261                                 cluster->array[i] = NULL;
262                 }
263         }
264         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
265                 cluster->focus = NULL;
266                 kfree(cluster, M_HAMMER2);
267                 /* cluster = NULL; safety */
268         }
269 }
270
271 void
272 hammer2_cluster_wait(hammer2_cluster_t *cluster)
273 {
274         tsleep(cluster->focus, 0, "h2clcw", 1);
275 }
276
277 /*
278  * Lock and ref a cluster.  This adds a ref to the cluster and its chains
279  * and then locks them.
280  */
281 int
282 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
283 {
284         hammer2_chain_t *chain;
285         int i;
286         int error;
287
288         error = 0;
289         atomic_add_int(&cluster->refs, 1);
290         for (i = 0; i < cluster->nchains; ++i) {
291                 chain = cluster->array[i];
292                 if (chain) {
293                         error = hammer2_chain_lock(chain, how);
294                         if (error) {
295                                 while (--i >= 0)
296                                         hammer2_chain_unlock(cluster->array[i]);
297                                 atomic_add_int(&cluster->refs, -1);
298                                 break;
299                         }
300                 }
301         }
302         return error;
303 }
304
305 /*
306  * Replace the contents of dst with src, adding a reference to src's chains.
307  * dst is assumed to already have a ref and any chains present in dst are
308  * assumed to be locked and will be unlocked.
309  *
310  * If the chains in src are locked, only one of (src) or (dst) should be
311  * considered locked by the caller after return, not both.
312  */
313 void
314 hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src)
315 {
316         hammer2_chain_t *chain;
317         int i;
318
319         KKASSERT(dst->refs == 1);
320         dst->focus = NULL;
321
322         for (i = 0; i < src->nchains; ++i) {
323                 chain = src->array[i];
324                 if (chain) {
325                         hammer2_chain_ref(chain);
326                         if (i < dst->nchains && dst->array[i])
327                                 hammer2_chain_unlock(dst->array[i]);
328                         dst->array[i] = chain;
329                         if (dst->focus == NULL)
330                                 dst->focus = chain;
331                 }
332         }
333         while (i < dst->nchains) {
334                 chain = dst->array[i];
335                 if (chain) {
336                         hammer2_chain_unlock(chain);
337                         dst->array[i] = NULL;
338                 }
339                 ++i;
340         }
341         dst->nchains = src->nchains;
342 }
343
344 /*
345  * Replace the contents of the locked destination with the contents of the
346  * locked source.  Destination must have one ref.
347  *
348  * Returns with the destination still with one ref and the copied chains
349  * with an additional lock (representing their state on the destination).
350  * The original chains associated with the destination are unlocked.
351  */
352 void
353 hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
354 {
355         hammer2_chain_t *chain;
356         int i;
357
358         KKASSERT(dst->refs == 1);
359
360         dst->focus = NULL;
361         for (i = 0; i < src->nchains; ++i) {
362                 chain = src->array[i];
363                 if (chain) {
364                         hammer2_chain_lock(chain, 0);
365                         if (i < dst->nchains && dst->array[i])
366                                 hammer2_chain_unlock(dst->array[i]);
367                         dst->array[i] = src->array[i];
368                         if (dst->focus == NULL)
369                                 dst->focus = chain;
370                 }
371         }
372         while (i < dst->nchains) {
373                 chain = dst->array[i];
374                 if (chain) {
375                         hammer2_chain_unlock(chain);
376                         dst->array[i] = NULL;
377                 }
378                 ++i;
379         }
380         dst->nchains = src->nchains;
381 }
382
383 /*
384  * Copy a cluster, returned a ref'd cluster.  All underlying chains
385  * are also ref'd, but not locked.
386  *
387  * If HAMMER2_CLUSTER_COPY_CHAINS is specified, the chains are copied
388  * to the new cluster and a reference is nominally added to them and to
389  * the cluster.  The cluster will have 1 ref.
390  *
391  * If HAMMER2_CLUSTER_COPY_NOREF is specified along with CHAINS, the chains
392  * are copied but no additional references are made and the cluster will have
393  * 0 refs.  Callers must ref the cluster and the chains before dropping it
394  * (typically by locking it).
395  *
396  * If flags are passed as 0 the copy is setup as if it contained the chains
397  * but the chains will not be copied over, and the cluster will have 0 refs.
398  * Callers must ref the cluster before dropping it (typically by locking it).
399  */
400 hammer2_cluster_t *
401 hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags)
402 {
403         hammer2_pfsmount_t *pmp = ocluster->pmp;
404         hammer2_cluster_t *ncluster;
405         hammer2_chain_t *chain;
406         int i;
407
408         ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
409         ncluster->pmp = pmp;
410         ncluster->nchains = ocluster->nchains;
411         ncluster->refs = (copy_flags & HAMMER2_CLUSTER_COPY_NOREF) ? 0 : 1;
412         if ((copy_flags & HAMMER2_CLUSTER_COPY_NOCHAINS) == 0) {
413                 ncluster->focus = ocluster->focus;
414                 for (i = 0; i < ocluster->nchains; ++i) {
415                         chain = ocluster->array[i];
416                         ncluster->array[i] = chain;
417                         if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0 &&
418                             chain) {
419                                 hammer2_chain_ref(chain);
420                         }
421                 }
422         }
423         return (ncluster);
424 }
425
426 /*
427  * Unlock and deref a cluster.  The cluster is destroyed if this is the
428  * last ref.
429  */
430 void
431 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
432 {
433         hammer2_chain_t *chain;
434         int i;
435
436         KKASSERT(cluster->refs > 0);
437         for (i = 0; i < cluster->nchains; ++i) {
438                 chain = cluster->array[i];
439                 if (chain) {
440                         hammer2_chain_unlock(chain);
441                         if (cluster->refs == 1)
442                                 cluster->array[i] = NULL;       /* safety */
443                 }
444         }
445         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
446                 cluster->focus = NULL;
447                 kfree(cluster, M_HAMMER2);
448                 /* cluster = NULL; safety */
449         }
450 }
451
452 /*
453  * Resize the cluster's physical storage allocation in-place.  This may
454  * replace the cluster's chains.
455  */
456 void
457 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
458                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
459                        int nradix, int flags)
460 {
461         int i;
462
463         KKASSERT(cparent->pmp == cluster->pmp);         /* can be NULL */
464         KKASSERT(cparent->nchains == cluster->nchains);
465
466         cluster->focus = NULL;
467         for (i = 0; i < cluster->nchains; ++i) {
468                 if (cluster->array[i]) {
469                         KKASSERT(cparent->array[i]);
470                         hammer2_chain_resize(trans, ip,
471                                              cparent->array[i],
472                                              cluster->array[i],
473                                              nradix, flags);
474                         if (cluster->focus == NULL)
475                                 cluster->focus = cluster->array[i];
476                 }
477         }
478 }
479
480 /*
481  * Set an inode's cluster modified, marking the related chains RW and
482  * duplicating them if necessary.
483  *
484  * The passed-in chain is a localized copy of the chain previously acquired
485  * when the inode was locked (and possilby replaced in the mean time), and
486  * must also be updated.  In fact, we update it first and then synchronize
487  * the inode's cluster cache.
488  */
489 hammer2_inode_data_t *
490 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
491                           hammer2_cluster_t *cluster, int flags)
492 {
493         atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
494         hammer2_cluster_modify(trans, cluster, flags);
495
496         hammer2_inode_repoint(ip, NULL, cluster);
497         if (ip->vp)
498                 vsetisdirty(ip->vp);
499         return (&hammer2_cluster_wdata(cluster)->ipdata);
500 }
501
502 /*
503  * Adjust the cluster's chains to allow modification.
504  */
505 void
506 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
507                        int flags)
508 {
509         int i;
510
511         cluster->focus = NULL;
512         for (i = 0; i < cluster->nchains; ++i) {
513                 if (cluster->array[i]) {
514                         hammer2_chain_modify(trans, cluster->array[i], flags);
515                         if (cluster->focus == NULL)
516                                 cluster->focus = cluster->array[i];
517                 }
518         }
519 }
520
521 /*
522  * Synchronize modifications with other chains in a cluster.
523  *
524  * Nominal front-end operations only edit non-block-table data in a single
525  * chain.  This code copies such modifications to the other chains in the
526  * cluster.  Blocktable modifications are handled on a chain-by-chain basis
527  * by both the frontend and the backend and will explode in fireworks if
528  * blindly copied.
529  */
530 void
531 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
532 {
533         hammer2_chain_t *focus;
534         hammer2_chain_t *scan;
535         const hammer2_inode_data_t *ripdata;
536         hammer2_inode_data_t *wipdata;
537         int i;
538
539         focus = cluster->focus;
540         KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
541
542         for (i = 0; i < cluster->nchains; ++i) {
543                 scan = cluster->array[i];
544                 if (scan == NULL || scan == focus)
545                         continue;
546                 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
547                 KKASSERT(focus->bytes == scan->bytes &&
548                          focus->bref.type == scan->bref.type);
549                 switch(focus->bref.type) {
550                 case HAMMER2_BREF_TYPE_INODE:
551                         ripdata = &focus->data->ipdata;
552                         wipdata = &scan->data->ipdata;
553                         if ((ripdata->op_flags &
554                             HAMMER2_OPFLAG_DIRECTDATA) == 0) {
555                                 bcopy(ripdata, wipdata,
556                                       offsetof(hammer2_inode_data_t, u));
557                                 break;
558                         }
559                         /* fall through */
560                 case HAMMER2_BREF_TYPE_DATA:
561                         bcopy(focus->data, scan->data, focus->bytes);
562                         break;
563                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
564                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
565                 case HAMMER2_BREF_TYPE_FREEMAP:
566                 case HAMMER2_BREF_TYPE_VOLUME:
567                         panic("hammer2_cluster_modsync: illegal node type");
568                         /* NOT REACHED */
569                         break;
570                 default:
571                         panic("hammer2_cluster_modsync: unknown node type");
572                         break;
573                 }
574         }
575 }
576
577 /*
578  * Lookup initialization/completion API
579  */
580 hammer2_cluster_t *
581 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
582 {
583         hammer2_cluster_t *cluster;
584         int i;
585
586         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
587         cluster->pmp = cparent->pmp;                    /* can be NULL */
588         /* cluster->focus = NULL; already null */
589
590         for (i = 0; i < cparent->nchains; ++i) {
591                 cluster->array[i] = cparent->array[i];
592                 if (cluster->focus == NULL)
593                         cluster->focus = cluster->array[i];
594         }
595         cluster->nchains = cparent->nchains;
596
597         /*
598          * Independently lock (this will also give cluster 1 ref)
599          */
600         if (flags & HAMMER2_LOOKUP_SHARED) {
601                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
602                                               HAMMER2_RESOLVE_SHARED);
603         } else {
604                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
605         }
606         return (cluster);
607 }
608
609 void
610 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
611 {
612         if (cparent)
613                 hammer2_cluster_unlock(cparent);
614 }
615
616 /*
617  * Locate first match or overlap under parent, return a new cluster
618  */
619 hammer2_cluster_t *
620 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
621                      hammer2_key_t key_beg, hammer2_key_t key_end,
622                      int flags, int *ddflagp)
623 {
624         hammer2_pfsmount_t *pmp;
625         hammer2_cluster_t *cluster;
626         hammer2_chain_t *chain;
627         hammer2_key_t key_accum;
628         hammer2_key_t key_next;
629         hammer2_key_t bref_key;
630         int bref_keybits;
631         int null_count;
632         int ddflag;
633         int i;
634         uint8_t bref_type;
635         u_int bytes;
636
637         pmp = cparent->pmp;                             /* can be NULL */
638         key_accum = *key_nextp;
639         null_count = 0;
640         bref_type = 0;
641         bref_key = 0;
642         bref_keybits = 0;
643         bytes = 0;
644
645         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
646         cluster->pmp = pmp;                             /* can be NULL */
647         cluster->refs = 1;
648         /* cluster->focus = NULL; already null */
649         cparent->focus = NULL;
650         *ddflagp = 0;
651
652         for (i = 0; i < cparent->nchains; ++i) {
653                 key_next = *key_nextp;
654                 if (cparent->array[i] == NULL) {
655                         ++null_count;
656                         continue;
657                 }
658                 chain = hammer2_chain_lookup(&cparent->array[i], &key_next,
659                                              key_beg, key_end,
660                                              &cparent->cache_index[i],
661                                              flags, &ddflag);
662                 if (cparent->focus == NULL)
663                         cparent->focus = cparent->array[i];
664                 cluster->array[i] = chain;
665                 if (chain == NULL) {
666                         ++null_count;
667                 } else {
668                         if (cluster->focus == NULL) {
669                                 bref_type = chain->bref.type;
670                                 bref_key = chain->bref.key;
671                                 bref_keybits = chain->bref.keybits;
672                                 bytes = chain->bytes;
673                                 *ddflagp = ddflag;
674                                 cluster->focus = chain;
675                         }
676                         KKASSERT(bref_type == chain->bref.type);
677                         KKASSERT(bref_key == chain->bref.key);
678                         KKASSERT(bref_keybits == chain->bref.keybits);
679                         KKASSERT(bytes == chain->bytes);
680                         KKASSERT(*ddflagp == ddflag);
681                 }
682                 if (key_accum > key_next)
683                         key_accum = key_next;
684         }
685         *key_nextp = key_accum;
686         cluster->nchains = i;
687
688         if (null_count == i) {
689                 hammer2_cluster_drop(cluster);
690                 cluster = NULL;
691         }
692
693         return (cluster);
694 }
695
696 /*
697  * Locate next match or overlap under parent, replace cluster
698  */
699 hammer2_cluster_t *
700 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
701                      hammer2_key_t *key_nextp,
702                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
703 {
704         hammer2_chain_t *chain;
705         hammer2_key_t key_accum;
706         hammer2_key_t key_next;
707         int null_count;
708         int i;
709
710         key_accum = *key_nextp;
711         null_count = 0;
712         cluster->focus = NULL;
713         cparent->focus = NULL;
714
715         for (i = 0; i < cparent->nchains; ++i) {
716                 key_next = *key_nextp;
717                 chain = cluster->array[i];
718                 if (chain == NULL) {
719                         if (cparent->focus == NULL)
720                                 cparent->focus = cparent->array[i];
721                         ++null_count;
722                         continue;
723                 }
724                 if (cparent->array[i] == NULL) {
725                         if (flags & HAMMER2_LOOKUP_NOLOCK)
726                                 hammer2_chain_drop(chain);
727                         else
728                                 hammer2_chain_unlock(chain);
729                         ++null_count;
730                         continue;
731                 }
732                 chain = hammer2_chain_next(&cparent->array[i], chain,
733                                            &key_next, key_beg, key_end,
734                                            &cparent->cache_index[i], flags);
735                 if (cparent->focus == NULL)
736                         cparent->focus = cparent->array[i];
737                 cluster->array[i] = chain;
738                 if (chain == NULL) {
739                         ++null_count;
740                 } else if (cluster->focus == NULL) {
741                         cluster->focus = chain;
742                 }
743                 if (key_accum > key_next)
744                         key_accum = key_next;
745         }
746
747         if (null_count == i) {
748                 hammer2_cluster_drop(cluster);
749                 cluster = NULL;
750         }
751         return(cluster);
752 }
753
754 #if 0
755 /*
756  * XXX initial NULL cluster needs reworking (pass **clusterp ?)
757  *
758  * The raw scan function is similar to lookup/next but does not seek to a key.
759  * Blockrefs are iterated via first_chain = (parent, NULL) and
760  * next_chain = (parent, chain).
761  *
762  * The passed-in parent must be locked and its data resolved.  The returned
763  * chain will be locked.  Pass chain == NULL to acquire the first sub-chain
764  * under parent and then iterate with the passed-in chain (which this
765  * function will unlock).
766  */
767 hammer2_cluster_t *
768 hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
769                      int flags)
770 {
771         hammer2_chain_t *chain;
772         int null_count;
773         int i;
774
775         null_count = 0;
776
777         for (i = 0; i < cparent->nchains; ++i) {
778                 chain = cluster->array[i];
779                 if (chain == NULL) {
780                         ++null_count;
781                         continue;
782                 }
783                 if (cparent->array[i] == NULL) {
784                         if (flags & HAMMER2_LOOKUP_NOLOCK)
785                                 hammer2_chain_drop(chain);
786                         else
787                                 hammer2_chain_unlock(chain);
788                         ++null_count;
789                         continue;
790                 }
791
792                 chain = hammer2_chain_scan(cparent->array[i], chain,
793                                            &cparent->cache_index[i], flags);
794                 cluster->array[i] = chain;
795                 if (chain == NULL)
796                         ++null_count;
797         }
798
799         if (null_count == i) {
800                 hammer2_cluster_drop(cluster);
801                 cluster = NULL;
802         }
803         return(cluster);
804 }
805
806 #endif
807
808 /*
809  * Create a new cluster using the specified key
810  */
811 int
812 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
813                      hammer2_cluster_t **clusterp,
814                      hammer2_key_t key, int keybits, int type, size_t bytes)
815 {
816         hammer2_cluster_t *cluster;
817         hammer2_pfsmount_t *pmp;
818         int error;
819         int i;
820
821         pmp = trans->pmp;                               /* can be NULL */
822
823         if ((cluster = *clusterp) == NULL) {
824                 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
825                                   M_WAITOK | M_ZERO);
826                 cluster->pmp = pmp;                     /* can be NULL */
827                 cluster->refs = 1;
828         }
829         cluster->focus = NULL;
830         cparent->focus = NULL;
831
832         /*
833          * NOTE: cluster->array[] entries can initially be NULL.  If
834          *       *clusterp is supplied, skip NULL entries, otherwise
835          *       create new chains.
836          */
837         for (i = 0; i < cparent->nchains; ++i) {
838                 if (*clusterp && cluster->array[i] == NULL) {
839                         if (cparent->focus == NULL)
840                                 cparent->focus = cparent->array[i];
841                         continue;
842                 }
843                 error = hammer2_chain_create(trans, &cparent->array[i],
844                                              &cluster->array[i], pmp,
845                                              key, keybits, type, bytes);
846                 KKASSERT(error == 0);
847                 if (cparent->focus == NULL)
848                         cparent->focus = cparent->array[i];
849                 if (cluster->focus == NULL)
850                         cluster->focus = cluster->array[i];
851         }
852         cluster->nchains = i;
853         *clusterp = cluster;
854
855         return error;
856 }
857
858 /*
859  * Rename a cluster to a new parent.
860  *
861  * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields
862  *          are used from a passed-in non-NULL bref pointer.  All other fields
863  *          are extracted from the original chain for each chain in the
864  *          iteration.
865  */
866 void
867 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
868                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster)
869 {
870         hammer2_chain_t *chain;
871         hammer2_blockref_t xbref;
872         int i;
873
874         cluster->focus = NULL;
875         cparent->focus = NULL;
876
877         for (i = 0; i < cluster->nchains; ++i) {
878                 chain = cluster->array[i];
879                 if (chain) {
880                         if (bref) {
881                                 xbref = chain->bref;
882                                 xbref.key = bref->key;
883                                 xbref.keybits = bref->keybits;
884                                 hammer2_chain_rename(trans, &xbref,
885                                                      &cparent->array[i],
886                                                      chain);
887                         } else {
888                                 hammer2_chain_rename(trans, NULL,
889                                                      &cparent->array[i],
890                                                      chain);
891                         }
892                         cluster->array[i] = chain;
893                         if (cluster->focus == NULL)
894                                 cluster->focus = chain;
895                         if (cparent->focus == NULL)
896                                 cparent->focus = cparent->array[i];
897                 } else {
898                         if (cparent->focus == NULL)
899                                 cparent->focus = cparent->array[i];
900                 }
901         }
902 }
903
904 /*
905  * Mark a cluster deleted
906  */
907 void
908 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
909                        hammer2_cluster_t *cluster, int flags)
910 {
911         hammer2_chain_t *chain;
912         hammer2_chain_t *parent;
913         int i;
914
915         if (cparent == NULL) {
916                 kprintf("cparent is NULL\n");
917                 return;
918         }
919
920         for (i = 0; i < cluster->nchains; ++i) {
921                 parent = (i < cparent->nchains) ? cparent->array[i] : NULL;
922                 chain = cluster->array[i];
923                 if (chain == NULL)
924                         continue;
925                 if (chain->parent != parent) {
926                         kprintf("hammer2_cluster_delete: parent "
927                                 "mismatch chain=%p parent=%p against=%p\n",
928                                 chain, chain->parent, parent);
929                 } else {
930                         hammer2_chain_delete(trans, parent, chain, flags);
931                 }
932         }
933 }
934
935 /*
936  * Create a snapshot of the specified {parent, ochain} with the specified
937  * label.  The originating hammer2_inode must be exclusively locked for
938  * safety.
939  *
940  * The ioctl code has already synced the filesystem.
941  */
942 int
943 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
944                        hammer2_ioc_pfs_t *pfs)
945 {
946         hammer2_mount_t *hmp;
947         hammer2_cluster_t *ncluster;
948         const hammer2_inode_data_t *ipdata;
949         hammer2_inode_data_t *wipdata;
950         hammer2_inode_t *nip;
951         size_t name_len;
952         hammer2_key_t lhc;
953         struct vattr vat;
954         uuid_t opfs_clid;
955         int error;
956
957         kprintf("snapshot %s\n", pfs->name);
958
959         name_len = strlen(pfs->name);
960         lhc = hammer2_dirhash(pfs->name, name_len);
961
962         ipdata = &hammer2_cluster_data(ocluster)->ipdata;
963         opfs_clid = ipdata->pfs_clid;
964         hmp = ocluster->focus->hmp;
965
966         /*
967          * Create the snapshot directory under the super-root
968          *
969          * Set PFS type, generate a unique filesystem id, and generate
970          * a cluster id.  Use the same clid when snapshotting a PFS root,
971          * which theoretically allows the snapshot to be used as part of
972          * the same cluster (perhaps as a cache).
973          *
974          * Copy the (flushed) blockref array.  Theoretically we could use
975          * chain_duplicate() but it becomes difficult to disentangle
976          * the shared core so for now just brute-force it.
977          */
978         VATTR_NULL(&vat);
979         vat.va_type = VDIR;
980         vat.va_mode = 0755;
981         ncluster = NULL;
982         nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
983                                    proc0.p_ucred, pfs->name, name_len,
984                                    &ncluster, &error);
985
986         if (nip) {
987                 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
988                 wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
989                 kern_uuidgen(&wipdata->pfs_fsid, 1);
990                 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSROOT)
991                         wipdata->pfs_clid = opfs_clid;
992                 else
993                         kern_uuidgen(&wipdata->pfs_clid, 1);
994                 hammer2_cluster_set_chainflags(ncluster, HAMMER2_CHAIN_PFSROOT);
995
996                 /* XXX hack blockset copy */
997                 /* XXX doesn't work with real cluster */
998                 KKASSERT(ocluster->nchains == 1);
999                 wipdata->u.blockset = ocluster->focus->data->ipdata.u.blockset;
1000                 hammer2_cluster_modsync(ncluster);
1001                 hammer2_inode_unlock_ex(nip, ncluster);
1002         }
1003         return (error);
1004 }
1005
1006 /*
1007  * Return locked parent cluster given a locked child.  The child remains
1008  * locked on return.
1009  */
1010 hammer2_cluster_t *
1011 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1012 {
1013         hammer2_cluster_t *cparent;
1014         int i;
1015
1016         cparent = hammer2_cluster_copy(cluster, HAMMER2_CLUSTER_COPY_NOCHAINS);
1017         for (i = 0; i < cluster->nchains; ++i) {
1018                 hammer2_chain_t *chain;
1019                 hammer2_chain_t *rchain;
1020
1021                 chain = cluster->array[i];
1022                 if (chain == NULL)
1023                         continue;
1024                 hammer2_chain_ref(chain);
1025                 while ((rchain = chain->parent) != NULL) {
1026                         hammer2_chain_ref(rchain);
1027                         hammer2_chain_unlock(chain);
1028                         hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1029                         hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1030                         hammer2_chain_drop(rchain);
1031                         if (chain->parent == rchain)
1032                                 break;
1033                         hammer2_chain_unlock(rchain);
1034                 }
1035                 hammer2_chain_drop(chain);
1036                 cparent->array[i] = rchain;
1037         }
1038         return cparent;
1039 }
1040
1041 /************************************************************************
1042  *                          NODE FAILURES                               *
1043  ************************************************************************
1044  *
1045  * A node failure can occur for numerous reasons.
1046  *
1047  *      - A read I/O may fail
1048  *      - A write I/O may fail
1049  *      - An unexpected chain might be found (or be missing)
1050  *      - A node might disconnect temporarily and reconnect later
1051  *        (for example, a USB stick could get pulled, or a node might
1052  *        be programmatically disconnected).
1053  *      - A node might run out of space during a modifying operation.
1054  *
1055  * When a read failure or an unexpected chain state is found, the chain and
1056  * parent chain at the failure point for the nodes involved (the nodes
1057  * which we determine to be in error) are flagged as failed and removed
1058  * from the cluster.  The node itself is allowed to remain active.  The
1059  * highest common point (usually a parent chain) is queued to the
1060  * resynchronization thread for action.
1061  *
1062  * When a write I/O fails or a node runs out of space, we first adjust
1063  * as if a read failure occurs but we further disable flushes on the
1064  * ENTIRE node.  Concurrent modifying transactions are allowed to complete
1065  * but any new modifying transactions will automatically remove the node
1066  * from consideration in all related cluster structures and not generate
1067  * any new modified chains.  The ROOT chain for the failed node(s) is queued
1068  * to the resynchronization thread for action.
1069  *
1070  * A temporary disconnect is handled as if a write failure occurred.
1071  *
1072  * Any of these failures might or might not stall related high level VNOPS,
1073  * depending on what has failed, what nodes remain, the type of cluster,
1074  * and the operating state of the cluster.
1075  *
1076  *                          FLUSH ON WRITE-DISABLED NODES
1077  *
1078  * A flush on a write-disabled node is not allowed to write anything because
1079  * we cannot safely update the mirror_tid anywhere on the failed node.  The
1080  * synchronization thread uses mirror_tid to calculate incremental resyncs.
1081  * Dirty meta-data related to the failed node is thrown away.
1082  *
1083  * Dirty buffer cache buffers and inodes are only thrown away if they can be
1084  * retired... that is, if the filesystem still has enough nodes to complete
1085  * the operation.
1086  */
1087
1088 /************************************************************************
1089  *                      SYNCHRONIZATION THREAD                          *
1090  ************************************************************************
1091  *
1092  * This thread is responsible for [re]synchronizing the cluster representing
1093  * a PFS.  Any out-of-sync or failed node starts this thread on a
1094  * node-by-node basis when the failure is detected.
1095  *
1096  * Clusters needing resynchronization are queued at the highest point
1097  * where the parent on the failed node is still valid, or a special
1098  * incremental scan from the ROOT is queued if no parent exists.  This
1099  * thread is also responsible for waiting for reconnections of the failed
1100  * node if the cause was due to a disconnect, and waiting for space to be
1101  * freed up if the cause was due to running out of space.
1102  *
1103  * If the cause is due to a node running out of space, this thread will also
1104  * remove older (unlocked) snapshots to make new space, recover space, and
1105  * then start resynchronization.
1106  *
1107  * Each resynchronization pass virtually snapshots the PFS on the good nodes
1108  * and synchronizes using that snapshot against the target node.  This
1109  * ensures a consistent chain topology and also avoid interference between
1110  * the resynchronization thread and frontend operations.
1111  *
1112  * Since these are per-node threads it is possible to resynchronize several
1113  * nodes at once.
1114  */