Merge branch 'vendor/WPA_SUPPLICANT'
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
1 /*
2  * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information into a single entity.  It allows direct access to media
37  * data as long as it is not blockref array data.  Meaning, basically,
38  * just inode and file data.
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  */
48 #include <sys/cdefs.h>
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/types.h>
52 #include <sys/lock.h>
53 #include <sys/uuid.h>
54
55 #include "hammer2.h"
56
57 u_int
58 hammer2_cluster_bytes(hammer2_cluster_t *cluster)
59 {
60         return(cluster->focus->bytes);
61 }
62
63 uint8_t
64 hammer2_cluster_type(hammer2_cluster_t *cluster)
65 {
66         return(cluster->focus->bref.type);
67 }
68
69 /*
70  * NOTE: When modifying a cluster object via hammer2_cluster_wdata()
71  *       and hammer2_cluster_modsync(), remember that block array
72  *       entries are not copied to the elements of the cluster.
73  */
74 const hammer2_media_data_t *
75 hammer2_cluster_data(hammer2_cluster_t *cluster)
76 {
77         return(cluster->focus->data);
78 }
79
80 hammer2_media_data_t *
81 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
82 {
83         return(cluster->focus->data);
84 }
85
86 int
87 hammer2_cluster_modified(hammer2_cluster_t *cluster)
88 {
89         return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
90 }
91
92 int
93 hammer2_cluster_unlinked(hammer2_cluster_t *cluster)
94 {
95         return((cluster->focus->flags & HAMMER2_CHAIN_UNLINKED) != 0);
96 }
97
98 /*
99  * Return a bref representative of the cluster.  Any data offset is removed
100  * (since it would only be applicable to a particular chain in the cluster).
101  *
102  * However, the radix portion of data_off is used for many purposes and will
103  * be retained.
104  */
105 void
106 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
107 {
108         *bref = cluster->focus->bref;
109         bref->data_off &= HAMMER2_OFF_MASK_RADIX;
110 }
111
112 void
113 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
114 {
115         hammer2_chain_t *chain;
116         int i;
117
118         for (i = 0; i < cluster->nchains; ++i) {
119                 chain = cluster->array[i];
120                 if (chain)
121                         atomic_set_int(&chain->flags, flags);
122         }
123 }
124
125 void
126 hammer2_cluster_setsubmod(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
127 {
128         hammer2_chain_t *chain;
129         int i;
130
131         for (i = 0; i < cluster->nchains; ++i) {
132                 chain = cluster->array[i];
133                 if (chain)
134                         hammer2_chain_setsubmod(trans, chain);
135         }
136 }
137
138 /*
139  * Create a cluster with one ref from the specified chain.  The chain
140  * is not further referenced.  The caller typically supplies a locked
141  * chain and transfers ownership to the cluster.
142  */
143 hammer2_cluster_t *
144 hammer2_cluster_from_chain(hammer2_chain_t *chain)
145 {
146         hammer2_cluster_t *cluster;
147
148         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
149         cluster->array[0] = chain;
150         cluster->nchains = 1;
151         cluster->focus = chain;
152         cluster->pmp = chain->pmp;
153         cluster->refs = 1;
154
155         return cluster;
156 }
157
158 /*
159  * Allocates a cluster and its underlying chain structures.  The underlying
160  * chains will be locked.  The cluster and underlying chains will have one
161  * ref.
162  */
163 hammer2_cluster_t *
164 hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
165                       hammer2_trans_t *trans, hammer2_blockref_t *bref)
166 {
167         hammer2_cluster_t *cluster;
168         hammer2_cluster_t *rcluster;
169         hammer2_chain_t *chain;
170         u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
171         int i;
172
173         KKASSERT(pmp != NULL);
174
175         /*
176          * Construct the appropriate system structure.
177          */
178         switch(bref->type) {
179         case HAMMER2_BREF_TYPE_INODE:
180         case HAMMER2_BREF_TYPE_INDIRECT:
181         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
182         case HAMMER2_BREF_TYPE_DATA:
183         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
184                 /*
185                  * Chain's are really only associated with the hmp but we
186                  * maintain a pmp association for per-mount memory tracking
187                  * purposes.  The pmp can be NULL.
188                  */
189                 break;
190         case HAMMER2_BREF_TYPE_VOLUME:
191         case HAMMER2_BREF_TYPE_FREEMAP:
192                 chain = NULL;
193                 panic("hammer2_cluster_alloc volume type illegal for op");
194         default:
195                 chain = NULL;
196                 panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
197                       bref->type);
198         }
199
200         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
201         cluster->refs = 1;
202
203         rcluster = &pmp->iroot->cluster;
204         for (i = 0; i < rcluster->nchains; ++i) {
205                 chain = hammer2_chain_alloc(rcluster->array[i]->hmp,
206                                             pmp, trans, bref);
207                 chain->hmp = rcluster->array[i]->hmp;
208                 chain->bref = *bref;
209                 chain->bytes = bytes;
210                 chain->refs = 1;
211                 chain->flags = HAMMER2_CHAIN_ALLOCATED;
212                 chain->delete_xid = HAMMER2_XID_MAX;
213
214                 /*
215                  * Set modify_tid if a transaction is creating the inode.
216                  * Enforce update_xlo = 0 so nearby transactions do not think
217                  * it has been flushed when it hasn't.
218                  *
219                  * NOTE: When loading a chain from backing store or creating a
220                  *       snapshot, trans will be NULL and the caller is
221                  *       responsible for setting these fields.
222                  */
223                 if (trans) {
224                         chain->modify_xid = trans->sync_xid;
225                         chain->update_xlo = 0;
226                 }
227                 cluster->array[i] = chain;
228         }
229         cluster->nchains = i;
230         cluster->pmp = pmp;
231         cluster->focus = cluster->array[0];
232
233         return (cluster);
234 }
235
236 /*
237  * Associate an existing core with the chain or allocate a new core.
238  *
239  * The core is not locked.  No additional refs on the chain are made.
240  * (trans) must not be NULL if (core) is not NULL.
241  *
242  * When chains are delete-duplicated during flushes we insert nchain on
243  * the ownerq after ochain instead of at the end in order to give the
244  * drop code visibility in the correct order, otherwise drops can be missed.
245  */
246 void
247 hammer2_cluster_core_alloc(hammer2_trans_t *trans,
248                            hammer2_cluster_t *ncluster,
249                            hammer2_cluster_t *ocluster)
250 {
251         int i;
252
253         for (i = 0; i < ocluster->nchains; ++i) {
254                 if (ncluster->array[i]) {
255                         hammer2_chain_core_alloc(trans,
256                                                  ncluster->array[i],
257                                                  ocluster->array[i]);
258                 }
259         }
260 }
261
262 /*
263  * Add a reference to a cluster.
264  *
265  * We must also ref the underlying chains in order to allow ref/unlock
266  * sequences to later re-lock.
267  */
268 void
269 hammer2_cluster_ref(hammer2_cluster_t *cluster)
270 {
271         hammer2_chain_t *chain;
272         int i;
273
274         atomic_add_int(&cluster->refs, 1);
275         for (i = 0; i < cluster->nchains; ++i) {
276                 chain = cluster->array[i];
277                 if (chain)
278                         hammer2_chain_ref(chain);
279         }
280 }
281
282 /*
283  * Drop the caller's reference to the cluster.  When the ref count drops to
284  * zero this function frees the cluster and drops all underlying chains.
285  */
286 void
287 hammer2_cluster_drop(hammer2_cluster_t *cluster)
288 {
289         hammer2_chain_t *chain;
290         int i;
291
292         KKASSERT(cluster->refs > 0);
293         for (i = 0; i < cluster->nchains; ++i) {
294                 chain = cluster->array[i];
295                 if (chain) {
296                         hammer2_chain_drop(chain);
297                         if (cluster->refs == 1)
298                                 cluster->array[i] = NULL;
299                 }
300         }
301         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
302                 cluster->focus = NULL;
303                 kfree(cluster, M_HAMMER2);
304                 /* cluster = NULL; safety */
305         }
306 }
307
308 void
309 hammer2_cluster_wait(hammer2_cluster_t *cluster)
310 {
311         tsleep(cluster->focus, 0, "h2clcw", 1);
312 }
313
314 /*
315  * Lock and ref a cluster.  This adds a ref to the cluster and its chains
316  * and then locks them.
317  */
318 int
319 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
320 {
321         hammer2_chain_t *chain;
322         int i;
323         int error;
324
325         error = 0;
326         atomic_add_int(&cluster->refs, 1);
327         for (i = 0; i < cluster->nchains; ++i) {
328                 chain = cluster->array[i];
329                 if (chain) {
330                         error = hammer2_chain_lock(chain, how);
331                         if (error) {
332                                 while (--i >= 0)
333                                         hammer2_chain_unlock(cluster->array[i]);
334                                 atomic_add_int(&cluster->refs, -1);
335                                 break;
336                         }
337                 }
338         }
339         return error;
340 }
341
342 /*
343  * Replace the contents of dst with src, adding a reference to src's chains.
344  * dst is assumed to already have a ref and any chains present in dst are
345  * assumed to be locked and will be unlocked.
346  *
347  * If the chains in src are locked, only one of (src) or (dst) should be
348  * considered locked by the caller after return, not both.
349  */
350 void
351 hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src)
352 {
353         hammer2_chain_t *chain;
354         int i;
355
356         KKASSERT(dst->refs == 1);
357         dst->focus = NULL;
358
359         for (i = 0; i < src->nchains; ++i) {
360                 chain = src->array[i];
361                 if (chain) {
362                         hammer2_chain_ref(chain);
363                         if (i < dst->nchains && dst->array[i])
364                                 hammer2_chain_unlock(dst->array[i]);
365                         dst->array[i] = chain;
366                         if (dst->focus == NULL)
367                                 dst->focus = chain;
368                 }
369         }
370         while (i < dst->nchains) {
371                 chain = dst->array[i];
372                 if (chain) {
373                         hammer2_chain_unlock(chain);
374                         dst->array[i] = NULL;
375                 }
376                 ++i;
377         }
378         dst->nchains = src->nchains;
379 }
380
381 /*
382  * Replace the contents of the locked destination with the contents of the
383  * locked source.  Destination must have one ref.
384  *
385  * Returns with the destination still with one ref and the copied chains
386  * with an additional lock (representing their state on the destination).
387  * The original chains associated with the destination are unlocked.
388  */
389 void
390 hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
391 {
392         hammer2_chain_t *chain;
393         int i;
394
395         KKASSERT(dst->refs == 1);
396
397         dst->focus = NULL;
398         for (i = 0; i < src->nchains; ++i) {
399                 chain = src->array[i];
400                 if (chain) {
401                         hammer2_chain_lock(chain, 0);
402                         if (i < dst->nchains && dst->array[i])
403                                 hammer2_chain_unlock(dst->array[i]);
404                         dst->array[i] = src->array[i];
405                         if (dst->focus == NULL)
406                                 dst->focus = chain;
407                 }
408         }
409         while (i < dst->nchains) {
410                 chain = dst->array[i];
411                 if (chain) {
412                         hammer2_chain_unlock(chain);
413                         dst->array[i] = NULL;
414                 }
415                 ++i;
416         }
417         dst->nchains = src->nchains;
418 }
419
420 /*
421  * Copy a cluster, returned a ref'd cluster.  All underlying chains
422  * are also ref'd, but not locked.
423  *
424  * If HAMMER2_CLUSTER_COPY_CHAINS is specified, the chains are copied
425  * to the new cluster and a reference is nominally added to them and to
426  * the cluster.  The cluster will have 1 ref.
427  *
428  * If HAMMER2_CLUSTER_COPY_NOREF is specified along with CHAINS, the chains
429  * are copied but no additional references are made and the cluster will have
430  * 0 refs.  Callers must ref the cluster and the chains before dropping it
431  * (typically by locking it).
432  *
433  * If flags are passed as 0 the copy is setup as if it contained the chains
434  * but the chains will not be copied over, and the cluster will have 0 refs.
435  * Callers must ref the cluster before dropping it (typically by locking it).
436  */
437 hammer2_cluster_t *
438 hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags)
439 {
440         hammer2_pfsmount_t *pmp = ocluster->pmp;
441         hammer2_cluster_t *ncluster;
442         hammer2_chain_t *chain;
443         int i;
444
445         ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
446         ncluster->pmp = pmp;
447         ncluster->nchains = ocluster->nchains;
448         ncluster->focus = ocluster->focus;
449         ncluster->refs = (copy_flags & HAMMER2_CLUSTER_COPY_NOREF) ? 0 : 1;
450         if ((copy_flags & HAMMER2_CLUSTER_COPY_NOCHAINS) == 0) {
451                 for (i = 0; i < ocluster->nchains; ++i) {
452                         chain = ocluster->array[i];
453                         ncluster->array[i] = chain;
454                         if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0 &&
455                             chain) {
456                                 hammer2_chain_ref(chain);
457                         }
458                 }
459         }
460         return (ncluster);
461 }
462
463 /*
464  * Unlock and deref a cluster.  The cluster is destroyed if this is the
465  * last ref.
466  */
467 void
468 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
469 {
470         hammer2_chain_t *chain;
471         int i;
472
473         KKASSERT(cluster->refs > 0);
474         for (i = 0; i < cluster->nchains; ++i) {
475                 chain = cluster->array[i];
476                 if (chain) {
477                         hammer2_chain_unlock(chain);
478                         if (cluster->refs == 1)
479                                 cluster->array[i] = NULL;       /* safety */
480                 }
481         }
482         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
483                 cluster->focus = NULL;
484                 kfree(cluster, M_HAMMER2);
485                 /* cluster = NULL; safety */
486         }
487 }
488
489 /*
490  * Refactor the locked chains of a cluster.
491  */
492 void
493 hammer2_cluster_refactor(hammer2_cluster_t *cluster)
494 {
495         int i;
496
497         cluster->focus = NULL;
498         for (i = 0; i < cluster->nchains; ++i) {
499                 if (cluster->array[i]) {
500                         hammer2_chain_refactor(&cluster->array[i]);
501                         if (cluster->focus == NULL)
502                                 cluster->focus = cluster->array[i];
503                 }
504         }
505 }
506
507 /*
508  * Resize the cluster's physical storage allocation in-place.  This may
509  * replace the cluster's chains.
510  */
511 void
512 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
513                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
514                        int nradix, int flags)
515 {
516         int i;
517
518         KKASSERT(cparent->pmp == cluster->pmp);         /* can be NULL */
519         KKASSERT(cparent->nchains == cluster->nchains);
520
521         cluster->focus = NULL;
522         for (i = 0; i < cluster->nchains; ++i) {
523                 if (cluster->array[i]) {
524                         KKASSERT(cparent->array[i]);
525                         hammer2_chain_resize(trans, ip,
526                                              cparent->array[i],
527                                              &cluster->array[i],
528                                              nradix, flags);
529                         if (cluster->focus == NULL)
530                                 cluster->focus = cluster->array[i];
531                 }
532         }
533 }
534
535 /*
536  * Set an inode's cluster modified, marking the related chains RW and
537  * duplicating them if necessary.
538  *
539  * The passed-in chain is a localized copy of the chain previously acquired
540  * when the inode was locked (and possilby replaced in the mean time), and
541  * must also be updated.  In fact, we update it first and then synchronize
542  * the inode's cluster cache.
543  */
544 hammer2_inode_data_t *
545 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
546                           hammer2_cluster_t *cluster, int flags)
547 {
548         atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
549         hammer2_cluster_modify(trans, cluster, flags);
550
551         hammer2_inode_repoint(ip, NULL, cluster);
552         if (ip->vp)
553                 vsetisdirty(ip->vp);
554         return (&hammer2_cluster_wdata(cluster)->ipdata);
555 }
556
557 /*
558  * Adjust the cluster's chains to allow modification.
559  */
560 void
561 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
562                        int flags)
563 {
564         int i;
565
566         cluster->focus = NULL;
567         for (i = 0; i < cluster->nchains; ++i) {
568                 if (cluster->array[i]) {
569                         hammer2_chain_modify(trans, &cluster->array[i], flags);
570                         if (cluster->focus == NULL)
571                                 cluster->focus = cluster->array[i];
572                 }
573         }
574 }
575
576 /*
577  * Synchronize modifications with other chains in a cluster.
578  *
579  * Nominal front-end operations only edit non-block-table data in a single
580  * chain.  This code copies such modifications to the other chains in the
581  * cluster.
582  */
583 /* hammer2_cluster_modsync() */
584
585 void
586 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
587 {
588         hammer2_chain_t *focus;
589         hammer2_chain_t *scan;
590         const hammer2_inode_data_t *ripdata;
591         hammer2_inode_data_t *wipdata;
592         int i;
593
594         focus = cluster->focus;
595         KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
596
597         for (i = 0; i < cluster->nchains; ++i) {
598                 scan = cluster->array[i];
599                 if (scan == NULL || scan == focus)
600                         continue;
601                 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
602                 KKASSERT(focus->bytes == scan->bytes &&
603                          focus->bref.type == scan->bref.type);
604                 switch(focus->bref.type) {
605                 case HAMMER2_BREF_TYPE_INODE:
606                         ripdata = &focus->data->ipdata;
607                         wipdata = &scan->data->ipdata;
608                         if ((ripdata->op_flags &
609                             HAMMER2_OPFLAG_DIRECTDATA) == 0) {
610                                 bcopy(ripdata, wipdata,
611                                       offsetof(hammer2_inode_data_t, u));
612                                 break;
613                         }
614                         /* fall through */
615                 case HAMMER2_BREF_TYPE_DATA:
616                         bcopy(focus->data, scan->data, focus->bytes);
617                         break;
618                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
619                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
620                 case HAMMER2_BREF_TYPE_FREEMAP:
621                 case HAMMER2_BREF_TYPE_VOLUME:
622                         panic("hammer2_cluster_modsync: illegal node type");
623                         /* NOT REACHED */
624                         break;
625                 default:
626                         panic("hammer2_cluster_modsync: unknown node type");
627                         break;
628                 }
629         }
630 }
631
632 /*
633  * Lookup initialization/completion API
634  */
635 hammer2_cluster_t *
636 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
637 {
638         hammer2_cluster_t *cluster;
639         int i;
640
641         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
642         cluster->pmp = cparent->pmp;                    /* can be NULL */
643         /* cluster->focus = NULL; already null */
644
645         for (i = 0; i < cparent->nchains; ++i) {
646                 cluster->array[i] = cparent->array[i];
647                 if (cluster->focus == NULL)
648                         cluster->focus = cluster->array[i];
649         }
650         cluster->nchains = cparent->nchains;
651
652         /*
653          * Independently lock (this will also give cluster 1 ref)
654          */
655         if (flags & HAMMER2_LOOKUP_SHARED) {
656                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
657                                               HAMMER2_RESOLVE_SHARED);
658         } else {
659                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
660         }
661         return (cluster);
662 }
663
664 void
665 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
666 {
667         if (cparent)
668                 hammer2_cluster_unlock(cparent);
669 }
670
671 /*
672  * Locate first match or overlap under parent, return a new cluster
673  */
674 hammer2_cluster_t *
675 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
676                      hammer2_key_t key_beg, hammer2_key_t key_end,
677                      int flags, int *ddflagp)
678 {
679         hammer2_pfsmount_t *pmp;
680         hammer2_cluster_t *cluster;
681         hammer2_chain_t *chain;
682         hammer2_key_t key_accum;
683         hammer2_key_t key_next;
684         hammer2_key_t bref_key;
685         int bref_keybits;
686         int null_count;
687         int ddflag;
688         int i;
689         uint8_t bref_type;
690         u_int bytes;
691
692         pmp = cparent->pmp;                             /* can be NULL */
693         key_accum = *key_nextp;
694         null_count = 0;
695         bref_type = 0;
696         bref_key = 0;
697         bref_keybits = 0;
698         bytes = 0;
699
700         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
701         cluster->pmp = pmp;                             /* can be NULL */
702         cluster->refs = 1;
703         /* cluster->focus = NULL; already null */
704         cparent->focus = NULL;
705         *ddflagp = 0;
706
707         for (i = 0; i < cparent->nchains; ++i) {
708                 key_next = *key_nextp;
709                 if (cparent->array[i] == NULL) {
710                         ++null_count;
711                         continue;
712                 }
713                 chain = hammer2_chain_lookup(&cparent->array[i], &key_next,
714                                              key_beg, key_end,
715                                              &cparent->cache_index[i],
716                                              flags, &ddflag);
717                 if (cparent->focus == NULL)
718                         cparent->focus = cparent->array[i];
719                 cluster->array[i] = chain;
720                 if (chain == NULL) {
721                         ++null_count;
722                 } else {
723                         if (cluster->focus == NULL) {
724                                 bref_type = chain->bref.type;
725                                 bref_key = chain->bref.key;
726                                 bref_keybits = chain->bref.keybits;
727                                 bytes = chain->bytes;
728                                 *ddflagp = ddflag;
729                                 cluster->focus = chain;
730                         }
731                         KKASSERT(bref_type == chain->bref.type);
732                         KKASSERT(bref_key == chain->bref.key);
733                         KKASSERT(bref_keybits == chain->bref.keybits);
734                         KKASSERT(bytes == chain->bytes);
735                         KKASSERT(*ddflagp == ddflag);
736                 }
737                 if (key_accum > key_next)
738                         key_accum = key_next;
739         }
740         *key_nextp = key_accum;
741         cluster->nchains = i;
742
743         if (null_count == i) {
744                 hammer2_cluster_drop(cluster);
745                 cluster = NULL;
746         }
747
748         return (cluster);
749 }
750
751 /*
752  * Locate next match or overlap under parent, replace cluster
753  */
754 hammer2_cluster_t *
755 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
756                      hammer2_key_t *key_nextp,
757                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
758 {
759         hammer2_chain_t *chain;
760         hammer2_key_t key_accum;
761         hammer2_key_t key_next;
762         int null_count;
763         int i;
764
765         key_accum = *key_nextp;
766         null_count = 0;
767         cluster->focus = NULL;
768         cparent->focus = NULL;
769
770         for (i = 0; i < cparent->nchains; ++i) {
771                 key_next = *key_nextp;
772                 chain = cluster->array[i];
773                 if (chain == NULL) {
774                         if (cparent->focus == NULL)
775                                 cparent->focus = cparent->array[i];
776                         ++null_count;
777                         continue;
778                 }
779                 if (cparent->array[i] == NULL) {
780                         if (flags & HAMMER2_LOOKUP_NOLOCK)
781                                 hammer2_chain_drop(chain);
782                         else
783                                 hammer2_chain_unlock(chain);
784                         ++null_count;
785                         continue;
786                 }
787                 chain = hammer2_chain_next(&cparent->array[i], chain,
788                                            &key_next, key_beg, key_end,
789                                            &cparent->cache_index[i], flags);
790                 if (cparent->focus == NULL)
791                         cparent->focus = cparent->array[i];
792                 cluster->array[i] = chain;
793                 if (chain == NULL) {
794                         ++null_count;
795                 } else if (cluster->focus == NULL) {
796                         cluster->focus = chain;
797                 }
798                 if (key_accum > key_next)
799                         key_accum = key_next;
800         }
801
802         if (null_count == i) {
803                 hammer2_cluster_drop(cluster);
804                 cluster = NULL;
805         }
806         return(cluster);
807 }
808
809 #if 0
810 /*
811  * XXX initial NULL cluster needs reworking (pass **clusterp ?)
812  *
813  * The raw scan function is similar to lookup/next but does not seek to a key.
814  * Blockrefs are iterated via first_chain = (parent, NULL) and
815  * next_chain = (parent, chain).
816  *
817  * The passed-in parent must be locked and its data resolved.  The returned
818  * chain will be locked.  Pass chain == NULL to acquire the first sub-chain
819  * under parent and then iterate with the passed-in chain (which this
820  * function will unlock).
821  */
822 hammer2_cluster_t *
823 hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
824                      int flags)
825 {
826         hammer2_chain_t *chain;
827         int null_count;
828         int i;
829
830         null_count = 0;
831
832         for (i = 0; i < cparent->nchains; ++i) {
833                 chain = cluster->array[i];
834                 if (chain == NULL) {
835                         ++null_count;
836                         continue;
837                 }
838                 if (cparent->array[i] == NULL) {
839                         if (flags & HAMMER2_LOOKUP_NOLOCK)
840                                 hammer2_chain_drop(chain);
841                         else
842                                 hammer2_chain_unlock(chain);
843                         ++null_count;
844                         continue;
845                 }
846
847                 chain = hammer2_chain_scan(cparent->array[i], chain,
848                                            &cparent->cache_index[i], flags);
849                 cluster->array[i] = chain;
850                 if (chain == NULL)
851                         ++null_count;
852         }
853
854         if (null_count == i) {
855                 hammer2_cluster_drop(cluster);
856                 cluster = NULL;
857         }
858         return(cluster);
859 }
860
861 #endif
862
863 /*
864  * Create a new cluster using the specified key
865  */
866 int
867 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
868                      hammer2_cluster_t **clusterp,
869                      hammer2_key_t key, int keybits, int type, size_t bytes)
870 {
871         hammer2_cluster_t *cluster;
872         hammer2_pfsmount_t *pmp;
873         int error;
874         int i;
875
876         pmp = trans->pmp;                               /* can be NULL */
877
878         if ((cluster = *clusterp) == NULL) {
879                 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
880                                   M_WAITOK | M_ZERO);
881                 cluster->pmp = pmp;                     /* can be NULL */
882                 cluster->refs = 1;
883         }
884         cluster->focus = NULL;
885         cparent->focus = NULL;
886
887         /*
888          * NOTE: cluster->array[] entries can initially be NULL.  If
889          *       *clusterp is supplied, skip NULL entries, otherwise
890          *       create new chains.
891          */
892         for (i = 0; i < cparent->nchains; ++i) {
893                 if (*clusterp && cluster->array[i] == NULL) {
894                         if (cparent->focus == NULL)
895                                 cparent->focus = cparent->array[i];
896                         continue;
897                 }
898                 error = hammer2_chain_create(trans, &cparent->array[i],
899                                              &cluster->array[i], pmp,
900                                              key, keybits, type, bytes);
901                 KKASSERT(error == 0);
902                 if (cparent->focus == NULL)
903                         cparent->focus = cparent->array[i];
904                 if (cluster->focus == NULL)
905                         cluster->focus = cluster->array[i];
906         }
907         cluster->nchains = i;
908         *clusterp = cluster;
909
910         return error;
911 }
912
913 /*
914  * Duplicate a cluster under a new parent.
915  *
916  * WARNING! Unlike hammer2_chain_duplicate(), only the key and keybits fields
917  *          are used from a passed-in non-NULL bref pointer.  All other fields
918  *          are extracted from the original chain for each chain in the
919  *          iteration.
920  */
921 void
922 hammer2_cluster_duplicate(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
923                           hammer2_cluster_t *cluster, hammer2_blockref_t *bref,
924                           int snapshot, int duplicate_reason)
925 {
926         hammer2_chain_t *chain;
927         hammer2_blockref_t xbref;
928         int i;
929
930         cluster->focus = NULL;
931         cparent->focus = NULL;
932
933         for (i = 0; i < cluster->nchains; ++i) {
934                 chain = cluster->array[i];
935                 if (chain) {
936                         if (bref) {
937                                 xbref = chain->bref;
938                                 xbref.key = bref->key;
939                                 xbref.keybits = bref->keybits;
940                                 hammer2_chain_duplicate(trans,
941                                                         &cparent->array[i],
942                                                         &chain, &xbref,
943                                                         snapshot,
944                                                         duplicate_reason);
945                         } else {
946                                 hammer2_chain_duplicate(trans,
947                                                         &cparent->array[i],
948                                                         &chain, NULL,
949                                                         snapshot,
950                                                         duplicate_reason);
951                         }
952                         cluster->array[i] = chain;
953                         if (cluster->focus == NULL)
954                                 cluster->focus = chain;
955                         if (cparent->focus == NULL)
956                                 cparent->focus = cparent->array[i];
957                 } else {
958                         if (cparent->focus == NULL)
959                                 cparent->focus = cparent->array[i];
960                 }
961         }
962 }
963
964 /*
965  * Delete-duplicate a cluster in-place.
966  */
967 void
968 hammer2_cluster_delete_duplicate(hammer2_trans_t *trans,
969                                  hammer2_cluster_t *cluster, int flags)
970 {
971         hammer2_chain_t *chain;
972         int i;
973
974         cluster->focus = NULL;
975         for (i = 0; i < cluster->nchains; ++i) {
976                 chain = cluster->array[i];
977                 if (chain) {
978                         hammer2_chain_delete_duplicate(trans, &chain, flags);
979                         cluster->array[i] = chain;
980                         if (cluster->focus == NULL)
981                                 cluster->focus = chain;
982                 }
983         }
984 }
985
986 /*
987  * Mark a cluster deleted
988  */
989 void
990 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
991                        int flags)
992 {
993         hammer2_chain_t *chain;
994         int i;
995
996         for (i = 0; i < cluster->nchains; ++i) {
997                 chain = cluster->array[i];
998                 if (chain)
999                         hammer2_chain_delete(trans, chain, flags);
1000         }
1001 }
1002
1003 /*
1004  * Create a snapshot of the specified {parent, ochain} with the specified
1005  * label.  The originating hammer2_inode must be exclusively locked for
1006  * safety.
1007  *
1008  * The ioctl code has already synced the filesystem.
1009  */
1010 int
1011 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1012                        hammer2_ioc_pfs_t *pfs)
1013 {
1014         hammer2_mount_t *hmp;
1015         hammer2_cluster_t *ncluster;
1016         const hammer2_inode_data_t *ipdata;
1017         hammer2_inode_data_t *wipdata;
1018         hammer2_inode_t *nip;
1019         size_t name_len;
1020         hammer2_key_t lhc;
1021         struct vattr vat;
1022         uuid_t opfs_clid;
1023         int error;
1024
1025         kprintf("snapshot %s\n", pfs->name);
1026
1027         name_len = strlen(pfs->name);
1028         lhc = hammer2_dirhash(pfs->name, name_len);
1029
1030         ipdata = &hammer2_cluster_data(ocluster)->ipdata;
1031         opfs_clid = ipdata->pfs_clid;
1032         hmp = ocluster->focus->hmp;
1033
1034         /*
1035          * Create the snapshot directory under the super-root
1036          *
1037          * Set PFS type, generate a unique filesystem id, and generate
1038          * a cluster id.  Use the same clid when snapshotting a PFS root,
1039          * which theoretically allows the snapshot to be used as part of
1040          * the same cluster (perhaps as a cache).
1041          *
1042          * Copy the (flushed) blockref array.  Theoretically we could use
1043          * chain_duplicate() but it becomes difficult to disentangle
1044          * the shared core so for now just brute-force it.
1045          */
1046         VATTR_NULL(&vat);
1047         vat.va_type = VDIR;
1048         vat.va_mode = 0755;
1049         ncluster = NULL;
1050         nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1051                                    proc0.p_ucred, pfs->name, name_len,
1052                                    &ncluster, &error);
1053
1054         if (nip) {
1055                 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1056                 wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
1057                 kern_uuidgen(&wipdata->pfs_fsid, 1);
1058                 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSROOT)
1059                         wipdata->pfs_clid = opfs_clid;
1060                 else
1061                         kern_uuidgen(&wipdata->pfs_clid, 1);
1062                 hammer2_cluster_set_chainflags(ncluster, HAMMER2_CHAIN_PFSROOT);
1063
1064                 /* XXX hack blockset copy */
1065                 /* XXX doesn't work with real cluster */
1066                 KKASSERT(ocluster->nchains == 1);
1067                 wipdata->u.blockset = ocluster->focus->data->ipdata.u.blockset;
1068                 hammer2_cluster_modsync(ncluster);
1069                 hammer2_inode_unlock_ex(nip, ncluster);
1070         }
1071         return (error);
1072 }
1073
1074 /************************************************************************
1075  *                          NODE FAILURES                               *
1076  ************************************************************************
1077  *
1078  * A node failure can occur for numerous reasons.
1079  *
1080  *      - A read I/O may fail
1081  *      - A write I/O may fail
1082  *      - An unexpected chain might be found (or be missing)
1083  *      - A node might disconnect temporarily and reconnect later
1084  *        (for example, a USB stick could get pulled, or a node might
1085  *        be programmatically disconnected).
1086  *      - A node might run out of space during a modifying operation.
1087  *
1088  * When a read failure or an unexpected chain state is found, the chain and
1089  * parent chain at the failure point for the nodes involved (the nodes
1090  * which we determine to be in error) are flagged as failed and removed
1091  * from the cluster.  The node itself is allowed to remain active.  The
1092  * highest common point (usually a parent chain) is queued to the
1093  * resynchronization thread for action.
1094  *
1095  * When a write I/O fails or a node runs out of space, we first adjust
1096  * as if a read failure occurs but we further disable flushes on the
1097  * ENTIRE node.  Concurrent modifying transactions are allowed to complete
1098  * but any new modifying transactions will automatically remove the node
1099  * from consideration in all related cluster structures and not generate
1100  * any new modified chains.  The ROOT chain for the failed node(s) is queued
1101  * to the resynchronization thread for action.
1102  *
1103  * A temporary disconnect is handled as if a write failure occurred.
1104  *
1105  * Any of these failures might or might not stall related high level VNOPS,
1106  * depending on what has failed, what nodes remain, the type of cluster,
1107  * and the operating state of the cluster.
1108  *
1109  *                          FLUSH ON WRITE-DISABLED NODES
1110  *
1111  * A flush on a write-disabled node is not allowed to write anything because
1112  * we cannot safely update the mirror_tid anywhere on the failed node.  The
1113  * synchronization thread uses mirror_tid to calculate incremental resyncs.
1114  * Dirty meta-data related to the failed node is thrown away.
1115  *
1116  * Dirty buffer cache buffers and inodes are only thrown away if they can be
1117  * retired... that is, if the filesystem still has enough nodes to complete
1118  * the operation.
1119  */
1120
1121 /************************************************************************
1122  *                      SYNCHRONIZATION THREAD                          *
1123  ************************************************************************
1124  *
1125  * This thread is responsible for [re]synchronizing the cluster representing
1126  * a PFS.  Any out-of-sync or failed node starts this thread on a
1127  * node-by-node basis when the failure is detected.
1128  *
1129  * Clusters needing resynchronization are queued at the highest point
1130  * where the parent on the failed node is still valid, or a special
1131  * incremental scan from the ROOT is queued if no parent exists.  This
1132  * thread is also responsible for waiting for reconnections of the failed
1133  * node if the cause was due to a disconnect, and waiting for space to be
1134  * freed up if the cause was due to running out of space.
1135  *
1136  * If the cause is due to a node running out of space, this thread will also
1137  * remove older (unlocked) snapshots to make new space, recover space, and
1138  * then start resynchronization.
1139  *
1140  * Each resynchronization pass virtually snapshots the PFS on the good nodes
1141  * and synchronizes using that snapshot against the target node.  This
1142  * ensures a consistent chain topology and also avoid interference between
1143  * the resynchronization thread and frontend operations.
1144  *
1145  * Since these are per-node threads it is possible to resynchronize several
1146  * nodes at once.
1147  */