hammer2 - cluster API, preliminary implementation of multi-chain updates
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
1 /*
2  * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information into a single entity.  It allows direct access to media
37  * data as long as it is not blockref array data.  Meaning, basically,
38  * just inode and file data.
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  */
48 #include <sys/cdefs.h>
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/types.h>
52 #include <sys/lock.h>
53 #include <sys/uuid.h>
54
55 #include "hammer2.h"
56
57 u_int
58 hammer2_cluster_bytes(hammer2_cluster_t *cluster)
59 {
60         return(cluster->focus->bytes);
61 }
62
63 uint8_t
64 hammer2_cluster_type(hammer2_cluster_t *cluster)
65 {
66         return(cluster->focus->bref.type);
67 }
68
69 /*
70  * NOTE: When modifying a cluster object via hammer2_cluster_wdata()
71  *       and hammer2_cluster_modsync(), remember that block array
72  *       entries are not copied to the elements of the cluster.
73  */
74 const hammer2_media_data_t *
75 hammer2_cluster_data(hammer2_cluster_t *cluster)
76 {
77         return(cluster->focus->data);
78 }
79
80 hammer2_media_data_t *
81 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
82 {
83         return(cluster->focus->data);
84 }
85
86 int
87 hammer2_cluster_modified(hammer2_cluster_t *cluster)
88 {
89         return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
90 }
91
92 int
93 hammer2_cluster_unlinked(hammer2_cluster_t *cluster)
94 {
95         return((cluster->focus->flags & HAMMER2_CHAIN_UNLINKED) != 0);
96 }
97
98 /*
99  * Return a bref representative of the cluster.  Any data offset is removed
100  * (since it would only be applicable to a particular chain in the cluster).
101  *
102  * However, the radix portion of data_off is used for many purposes and will
103  * be retained.
104  */
105 void
106 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
107 {
108         *bref = cluster->focus->bref;
109         bref->data_off &= HAMMER2_OFF_MASK_RADIX;
110 }
111
112 void
113 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
114 {
115         hammer2_chain_t *chain;
116         int i;
117
118         for (i = 0; i < cluster->nchains; ++i) {
119                 chain = cluster->array[i];
120                 if (chain)
121                         atomic_set_int(&chain->flags, flags);
122         }
123 }
124
125 void
126 hammer2_cluster_setsubmod(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
127 {
128         hammer2_chain_t *chain;
129         int i;
130
131         for (i = 0; i < cluster->nchains; ++i) {
132                 chain = cluster->array[i];
133                 if (chain)
134                         hammer2_chain_setsubmod(trans, chain);
135         }
136 }
137
138 /*
139  * Create a cluster with one ref from the specified chain.  The chain
140  * is not further referenced.  The caller typically supplies a locked
141  * chain and transfers ownership to the cluster.
142  */
143 hammer2_cluster_t *
144 hammer2_cluster_from_chain(hammer2_chain_t *chain)
145 {
146         hammer2_cluster_t *cluster;
147
148         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
149         cluster->array[0] = chain;
150         cluster->nchains = 1;
151         cluster->focus = chain;
152         cluster->pmp = chain->pmp;              /* can be NULL */
153         cluster->refs = 1;
154
155         return cluster;
156 }
157
158 /*
159  * Allocates a cluster and its underlying chain structures.  The underlying
160  * chains will be locked.  The cluster and underlying chains will have one
161  * ref.
162  */
163 hammer2_cluster_t *
164 hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
165                       hammer2_trans_t *trans, hammer2_blockref_t *bref)
166 {
167         hammer2_cluster_t *cluster;
168         hammer2_chain_t *chain;
169         u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
170         int i;
171
172         KKASSERT(pmp != NULL);
173
174         /*
175          * Construct the appropriate system structure.
176          */
177         switch(bref->type) {
178         case HAMMER2_BREF_TYPE_INODE:
179         case HAMMER2_BREF_TYPE_INDIRECT:
180         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
181         case HAMMER2_BREF_TYPE_DATA:
182         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
183                 /*
184                  * Chain's are really only associated with the hmp but we
185                  * maintain a pmp association for per-mount memory tracking
186                  * purposes.  The pmp can be NULL.
187                  */
188                 break;
189         case HAMMER2_BREF_TYPE_VOLUME:
190         case HAMMER2_BREF_TYPE_FREEMAP:
191                 chain = NULL;
192                 panic("hammer2_cluster_alloc volume type illegal for op");
193         default:
194                 chain = NULL;
195                 panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
196                       bref->type);
197         }
198
199         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
200         cluster->refs = 1;
201
202         for (i = 0; i < pmp->cluster.nchains; ++i) {
203                 chain = hammer2_chain_alloc(pmp->cluster.array[i]->hmp, pmp,
204                                             trans, bref);
205                 chain->pmp = pmp;
206                 chain->hmp = pmp->cluster.array[i]->hmp;
207                 chain->bref = *bref;
208                 chain->bytes = bytes;
209                 chain->refs = 1;
210                 chain->flags = HAMMER2_CHAIN_ALLOCATED;
211                 chain->delete_tid = HAMMER2_MAX_TID;
212
213                 /*
214                  * Set modify_tid if a transaction is creating the inode.
215                  * Enforce update_lo = 0 so nearby transactions do not think
216                  * it has been flushed when it hasn't.
217                  *
218                  * NOTE: When loading a chain from backing store or creating a
219                  *       snapshot, trans will be NULL and the caller is
220                  *       responsible for setting these fields.
221                  */
222                 if (trans) {
223                         chain->modify_tid = trans->sync_tid;
224                         chain->update_lo = 0;
225                 }
226                 cluster->array[i] = chain;
227         }
228         cluster->nchains = i;
229         cluster->pmp = pmp;
230         cluster->focus = cluster->array[0];
231
232         return (cluster);
233 }
234
235 /*
236  * Associate an existing core with the chain or allocate a new core.
237  *
238  * The core is not locked.  No additional refs on the chain are made.
239  * (trans) must not be NULL if (core) is not NULL.
240  *
241  * When chains are delete-duplicated during flushes we insert nchain on
242  * the ownerq after ochain instead of at the end in order to give the
243  * drop code visibility in the correct order, otherwise drops can be missed.
244  */
245 void
246 hammer2_cluster_core_alloc(hammer2_trans_t *trans,
247                            hammer2_cluster_t *ncluster,
248                            hammer2_cluster_t *ocluster)
249 {
250         int i;
251
252         for (i = 0; i < ocluster->nchains; ++i) {
253                 if (ncluster->array[i]) {
254                         hammer2_chain_core_alloc(trans,
255                                                  ncluster->array[i],
256                                                  ocluster->array[i]);
257                 }
258         }
259 }
260
261 /*
262  * Add a reference to a cluster.
263  *
264  * We must also ref the underlying chains in order to allow ref/unlock
265  * sequences to later re-lock.
266  */
267 void
268 hammer2_cluster_ref(hammer2_cluster_t *cluster)
269 {
270         hammer2_chain_t *chain;
271         int i;
272
273         atomic_add_int(&cluster->refs, 1);
274         for (i = 0; i < cluster->nchains; ++i) {
275                 chain = cluster->array[i];
276                 if (chain)
277                         hammer2_chain_ref(chain);
278         }
279 }
280
281 /*
282  * Drop the caller's reference to the cluster.  When the ref count drops to
283  * zero this function frees the cluster and drops all underlying chains.
284  */
285 void
286 hammer2_cluster_drop(hammer2_cluster_t *cluster)
287 {
288         hammer2_chain_t *chain;
289         int i;
290
291         KKASSERT(cluster->refs > 0);
292         for (i = 0; i < cluster->nchains; ++i) {
293                 chain = cluster->array[i];
294                 if (chain) {
295                         hammer2_chain_drop(chain);
296                         if (cluster->refs == 1)
297                                 cluster->array[i] = NULL;
298                 }
299         }
300         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
301                 cluster->focus = NULL;
302                 kfree(cluster, M_HAMMER2);
303                 /* cluster = NULL; safety */
304         }
305 }
306
307 void
308 hammer2_cluster_wait(hammer2_cluster_t *cluster)
309 {
310         tsleep(cluster->focus, 0, "h2clcw", 1);
311 }
312
313 /*
314  * Lock and ref a cluster.  This adds a ref to the cluster and its chains
315  * and then locks them.
316  */
317 int
318 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
319 {
320         hammer2_chain_t *chain;
321         int i;
322         int error;
323
324         error = 0;
325         atomic_add_int(&cluster->refs, 1);
326         for (i = 0; i < cluster->nchains; ++i) {
327                 chain = cluster->array[i];
328                 if (chain) {
329                         error = hammer2_chain_lock(chain, how);
330                         if (error) {
331                                 while (--i >= 0)
332                                         hammer2_chain_unlock(cluster->array[i]);
333                                 atomic_add_int(&cluster->refs, -1);
334                                 break;
335                         }
336                 }
337         }
338         return error;
339 }
340
341 /*
342  * Replace the contents of dst with src, adding a reference to src's chains.
343  * dst is assumed to already have a ref and any chains present in dst are
344  * assumed to be locked and will be unlocked.
345  *
346  * If the chains in src are locked, only one of (src) or (dst) should be
347  * considered locked by the caller after return, not both.
348  */
349 void
350 hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src)
351 {
352         hammer2_chain_t *chain;
353         int i;
354
355         KKASSERT(dst->refs == 1);
356         dst->focus = NULL;
357
358         for (i = 0; i < src->nchains; ++i) {
359                 chain = src->array[i];
360                 if (chain) {
361                         hammer2_chain_ref(chain);
362                         if (i < dst->nchains && dst->array[i])
363                                 hammer2_chain_unlock(dst->array[i]);
364                         dst->array[i] = chain;
365                         if (dst->focus == NULL)
366                                 dst->focus = chain;
367                 }
368         }
369         while (i < dst->nchains) {
370                 chain = dst->array[i];
371                 if (chain) {
372                         hammer2_chain_unlock(chain);
373                         dst->array[i] = NULL;
374                 }
375                 ++i;
376         }
377         dst->nchains = src->nchains;
378 }
379
380 /*
381  * Replace the contents of the locked destination with the contents of the
382  * locked source.  Destination must have one ref.
383  *
384  * Returns with the destination still with one ref and the copied chains
385  * with an additional lock (representing their state on the destination).
386  * The original chains associated with the destination are unlocked.
387  */
388 void
389 hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
390 {
391         hammer2_chain_t *chain;
392         int i;
393
394         KKASSERT(dst->refs == 1);
395
396         dst->focus = NULL;
397         for (i = 0; i < src->nchains; ++i) {
398                 chain = src->array[i];
399                 if (chain) {
400                         hammer2_chain_lock(chain, 0);
401                         if (i < dst->nchains && dst->array[i])
402                                 hammer2_chain_unlock(dst->array[i]);
403                         dst->array[i] = src->array[i];
404                         if (dst->focus == NULL)
405                                 dst->focus = chain;
406                 }
407         }
408         while (i < dst->nchains) {
409                 chain = dst->array[i];
410                 if (chain) {
411                         hammer2_chain_unlock(chain);
412                         dst->array[i] = NULL;
413                 }
414                 ++i;
415         }
416         dst->nchains = src->nchains;
417 }
418
419 /*
420  * Copy a cluster, returned a ref'd cluster.  All underlying chains
421  * are also ref'd, but not locked.
422  *
423  * If HAMMER2_CLUSTER_COPY_CHAINS is specified, the chains are copied
424  * to the new cluster and a reference is nominally added to them and to
425  * the cluster.  The cluster will have 1 ref.
426  *
427  * If HAMMER2_CLUSTER_COPY_NOREF is specified along with CHAINS, the chains
428  * are copied but no additional references are made and the cluster will have
429  * 0 refs.  Callers must ref the cluster and the chains before dropping it
430  * (typically by locking it).
431  *
432  * If flags are passed as 0 the copy is setup as if it contained the chains
433  * but the chains will not be copied over, and the cluster will have 0 refs.
434  * Callers must ref the cluster before dropping it (typically by locking it).
435  */
436 hammer2_cluster_t *
437 hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags)
438 {
439         hammer2_pfsmount_t *pmp = ocluster->pmp;
440         hammer2_cluster_t *ncluster;
441         int i;
442
443         ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
444         ncluster->pmp = pmp;
445         ncluster->nchains = ocluster->nchains;
446         ncluster->focus = ocluster->focus;
447         ncluster->refs = 1;
448         if (copy_flags & HAMMER2_CLUSTER_COPY_CHAINS) {
449                 if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0)
450                         ncluster->refs = 1;
451                 for (i = 0; i < ocluster->nchains; ++i) {
452                         ncluster->array[i] = ocluster->array[i];
453                         if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0 &&
454                             ncluster->array[i]) {
455                                 hammer2_chain_ref(ncluster->array[i]);
456                         }
457                 }
458         }
459         return (ncluster);
460 }
461
462 /*
463  * Unlock and deref a cluster.  The cluster is destroyed if this is the
464  * last ref.
465  */
466 void
467 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
468 {
469         hammer2_chain_t *chain;
470         int i;
471
472         KKASSERT(cluster->refs > 0);
473         for (i = 0; i < cluster->nchains; ++i) {
474                 chain = cluster->array[i];
475                 if (chain) {
476                         hammer2_chain_unlock(chain);
477                         if (cluster->refs == 1)
478                                 cluster->array[i] = NULL;       /* safety */
479                 }
480         }
481         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
482                 cluster->focus = NULL;
483                 kfree(cluster, M_HAMMER2);
484                 /* cluster = NULL; safety */
485         }
486 }
487
488 /*
489  * Refactor the locked chains of a cluster.
490  */
491 void
492 hammer2_cluster_refactor(hammer2_cluster_t *cluster)
493 {
494         int i;
495
496         cluster->focus = NULL;
497         for (i = 0; i < cluster->nchains; ++i) {
498                 if (cluster->array[i]) {
499                         hammer2_chain_refactor(&cluster->array[i]);
500                         if (cluster->focus == NULL)
501                                 cluster->focus = cluster->array[i];
502                 }
503         }
504 }
505
506 /*
507  * Resize the cluster's physical storage allocation in-place.  This may
508  * replace the cluster's chains.
509  */
510 void
511 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
512                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
513                        int nradix, int flags)
514 {
515         int i;
516
517         KKASSERT(cparent->pmp == cluster->pmp);         /* can be NULL */
518         KKASSERT(cparent->nchains == cluster->nchains);
519
520         cluster->focus = NULL;
521         for (i = 0; i < cluster->nchains; ++i) {
522                 if (cluster->array[i]) {
523                         KKASSERT(cparent->array[i]);
524                         hammer2_chain_resize(trans, ip,
525                                              cparent->array[i],
526                                              &cluster->array[i],
527                                              nradix, flags);
528                         if (cluster->focus == NULL)
529                                 cluster->focus = cluster->array[i];
530                 }
531         }
532 }
533
534 /*
535  * Set an inode's cluster modified, marking the related chains RW and
536  * duplicating them if necessary.
537  *
538  * The passed-in chain is a localized copy of the chain previously acquired
539  * when the inode was locked (and possilby replaced in the mean time), and
540  * must also be updated.  In fact, we update it first and then synchronize
541  * the inode's cluster cache.
542  */
543 hammer2_inode_data_t *
544 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
545                           hammer2_cluster_t *cluster, int flags)
546 {
547         atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
548         hammer2_cluster_modify(trans, cluster, flags);
549
550         hammer2_inode_repoint(ip, NULL, cluster);
551         if (ip->vp)
552                 vsetisdirty(ip->vp);
553         return (&hammer2_cluster_wdata(cluster)->ipdata);
554 }
555
556 /*
557  * Adjust the cluster's chains to allow modification.
558  */
559 void
560 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
561                        int flags)
562 {
563         int i;
564
565         cluster->focus = NULL;
566         for (i = 0; i < cluster->nchains; ++i) {
567                 if (cluster->array[i]) {
568                         hammer2_chain_modify(trans, &cluster->array[i], flags);
569                         if (cluster->focus == NULL)
570                                 cluster->focus = cluster->array[i];
571                 }
572         }
573 }
574
575 /*
576  * Synchronize modifications with other chains in a cluster.
577  *
578  * Nominal front-end operations only edit non-block-table data in a single
579  * chain.  This code copies such modifications to the other chains in the
580  * cluster.
581  */
582 /* hammer2_cluster_modsync() */
583
584 void
585 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
586 {
587         hammer2_chain_t *focus;
588         hammer2_chain_t *scan;
589         const hammer2_inode_data_t *ripdata;
590         hammer2_inode_data_t *wipdata;
591         int i;
592
593         focus = cluster->focus;
594         KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
595
596         for (i = 0; i < cluster->nchains; ++i) {
597                 scan = cluster->array[i];
598                 if (scan == NULL || scan == focus)
599                         continue;
600                 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
601                 KKASSERT(focus->bytes == scan->bytes &&
602                          focus->bref.type == scan->bref.type);
603                 switch(focus->bref.type) {
604                 case HAMMER2_BREF_TYPE_INODE:
605                         ripdata = &focus->data->ipdata;
606                         wipdata = &scan->data->ipdata;
607                         if ((ripdata->op_flags &
608                             HAMMER2_OPFLAG_DIRECTDATA) == 0) {
609                                 bcopy(ripdata, wipdata,
610                                       offsetof(hammer2_inode_data_t, u));
611                                 break;
612                         }
613                         /* fall through */
614                 case HAMMER2_BREF_TYPE_DATA:
615                         bcopy(focus->data, scan->data, focus->bytes);
616                         break;
617                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
618                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
619                 case HAMMER2_BREF_TYPE_FREEMAP:
620                 case HAMMER2_BREF_TYPE_VOLUME:
621                         panic("hammer2_cluster_modsync: illegal node type");
622                         /* NOT REACHED */
623                         break;
624                 default:
625                         panic("hammer2_cluster_modsync: unknown node type");
626                         break;
627                 }
628         }
629 }
630
631 /*
632  * Lookup initialization/completion API
633  */
634 hammer2_cluster_t *
635 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
636 {
637         hammer2_cluster_t *cluster;
638         int i;
639
640         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
641         cluster->pmp = cparent->pmp;                    /* can be NULL */
642         /* cluster->focus = NULL; already null */
643
644         for (i = 0; i < cparent->nchains; ++i) {
645                 cluster->array[i] = cparent->array[i];
646                 if (cluster->focus == NULL)
647                         cluster->focus = cluster->array[i];
648         }
649         cluster->nchains = cparent->nchains;
650
651         /*
652          * Independently lock (this will also give cluster 1 ref)
653          */
654         if (flags & HAMMER2_LOOKUP_SHARED) {
655                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
656                                               HAMMER2_RESOLVE_SHARED);
657         } else {
658                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
659         }
660         return (cluster);
661 }
662
663 void
664 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
665 {
666         if (cparent)
667                 hammer2_cluster_unlock(cparent);
668 }
669
670 /*
671  * Locate first match or overlap under parent, return a new cluster
672  */
673 hammer2_cluster_t *
674 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
675                      hammer2_key_t key_beg, hammer2_key_t key_end,
676                      int flags, int *ddflagp)
677 {
678         hammer2_pfsmount_t *pmp;
679         hammer2_cluster_t *cluster;
680         hammer2_chain_t *chain;
681         hammer2_key_t key_accum;
682         hammer2_key_t key_next;
683         int null_count;
684         int ddflag;
685         int i;
686         uint8_t bref_type;
687         u_int bytes;
688
689         pmp = cparent->pmp;                             /* can be NULL */
690         key_accum = *key_nextp;
691         null_count = 0;
692         bref_type = 0;
693         bytes = 0;
694
695         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
696         cluster->pmp = pmp;                             /* can be NULL */
697         cluster->refs = 1;
698         /* cluster->focus = NULL; already null */
699         cparent->focus = NULL;
700         *ddflagp = 0;
701
702         for (i = 0; i < cparent->nchains; ++i) {
703                 key_next = *key_nextp;
704                 if (cparent->array[i] == NULL) {
705                         ++null_count;
706                         continue;
707                 }
708                 chain = hammer2_chain_lookup(&cparent->array[i], &key_next,
709                                              key_beg, key_end,
710                                              &cparent->cache_index[i],
711                                              flags, &ddflag);
712                 if (cparent->focus == NULL)
713                         cparent->focus = cparent->array[i];
714                 cluster->array[i] = chain;
715                 if (chain == NULL) {
716                         ++null_count;
717                 } else {
718                         if (cluster->focus == NULL) {
719                                 bref_type = chain->bref.type;
720                                 bytes = chain->bytes;
721                                 *ddflagp = ddflag;
722                                 cluster->focus = chain;
723                         }
724                         KKASSERT(bref_type == chain->bref.type);
725                         KKASSERT(bytes == chain->bytes);
726                         KKASSERT(*ddflagp == ddflag);
727                 }
728                 if (key_accum > key_next)
729                         key_accum = key_next;
730         }
731         *key_nextp = key_accum;
732         cluster->nchains = i;
733
734         if (null_count == i) {
735                 hammer2_cluster_drop(cluster);
736                 cluster = NULL;
737         }
738
739         return (cluster);
740 }
741
742 /*
743  * Locate next match or overlap under parent, replace cluster
744  */
745 hammer2_cluster_t *
746 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
747                      hammer2_key_t *key_nextp,
748                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
749 {
750         hammer2_chain_t *chain;
751         hammer2_key_t key_accum;
752         hammer2_key_t key_next;
753         int null_count;
754         int i;
755
756         key_accum = *key_nextp;
757         null_count = 0;
758         cluster->focus = NULL;
759         cparent->focus = NULL;
760
761         for (i = 0; i < cparent->nchains; ++i) {
762                 key_next = *key_nextp;
763                 chain = cluster->array[i];
764                 if (chain == NULL) {
765                         if (cparent->focus == NULL)
766                                 cparent->focus = cparent->array[i];
767                         ++null_count;
768                         continue;
769                 }
770                 if (cparent->array[i] == NULL) {
771                         if (flags & HAMMER2_LOOKUP_NOLOCK)
772                                 hammer2_chain_drop(chain);
773                         else
774                                 hammer2_chain_unlock(chain);
775                         ++null_count;
776                         continue;
777                 }
778                 chain = hammer2_chain_next(&cparent->array[i], chain,
779                                            &key_next, key_beg, key_end,
780                                            &cparent->cache_index[i], flags);
781                 if (cparent->focus == NULL)
782                         cparent->focus = cparent->array[i];
783                 cluster->array[i] = chain;
784                 if (chain == NULL) {
785                         ++null_count;
786                 } else if (cluster->focus == NULL) {
787                         cluster->focus = chain;
788                 }
789                 if (key_accum > key_next)
790                         key_accum = key_next;
791         }
792
793         if (null_count == i) {
794                 hammer2_cluster_drop(cluster);
795                 cluster = NULL;
796         }
797         return(cluster);
798 }
799
800 #if 0
801 /*
802  * XXX initial NULL cluster needs reworking (pass **clusterp ?)
803  *
804  * The raw scan function is similar to lookup/next but does not seek to a key.
805  * Blockrefs are iterated via first_chain = (parent, NULL) and
806  * next_chain = (parent, chain).
807  *
808  * The passed-in parent must be locked and its data resolved.  The returned
809  * chain will be locked.  Pass chain == NULL to acquire the first sub-chain
810  * under parent and then iterate with the passed-in chain (which this
811  * function will unlock).
812  */
813 hammer2_cluster_t *
814 hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
815                      int flags)
816 {
817         hammer2_chain_t *chain;
818         int null_count;
819         int i;
820
821         null_count = 0;
822
823         for (i = 0; i < cparent->nchains; ++i) {
824                 chain = cluster->array[i];
825                 if (chain == NULL) {
826                         ++null_count;
827                         continue;
828                 }
829                 if (cparent->array[i] == NULL) {
830                         if (flags & HAMMER2_LOOKUP_NOLOCK)
831                                 hammer2_chain_drop(chain);
832                         else
833                                 hammer2_chain_unlock(chain);
834                         ++null_count;
835                         continue;
836                 }
837
838                 chain = hammer2_chain_scan(cparent->array[i], chain,
839                                            &cparent->cache_index[i], flags);
840                 cluster->array[i] = chain;
841                 if (chain == NULL)
842                         ++null_count;
843         }
844
845         if (null_count == i) {
846                 hammer2_cluster_drop(cluster);
847                 cluster = NULL;
848         }
849         return(cluster);
850 }
851
852 #endif
853
854 /*
855  * Create a new cluster using the specified key
856  */
857 int
858 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
859                      hammer2_cluster_t **clusterp,
860                      hammer2_key_t key, int keybits, int type, size_t bytes)
861 {
862         hammer2_cluster_t *cluster;
863         hammer2_pfsmount_t *pmp;
864         int error;
865         int i;
866
867         pmp = trans->pmp;                               /* can be NULL */
868
869         if ((cluster = *clusterp) == NULL) {
870                 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
871                                   M_WAITOK | M_ZERO);
872                 cluster->pmp = pmp;                     /* can be NULL */
873                 cluster->refs = 1;
874         }
875         cluster->focus = NULL;
876         cparent->focus = NULL;
877
878         /*
879          * NOTE: cluster->array[] entries can initially be NULL.  If
880          *       *clusterp is supplied, skip NULL entries, otherwise
881          *       create new chains.
882          */
883         for (i = 0; i < cparent->nchains; ++i) {
884                 if (*clusterp && cluster->array[i] == NULL) {
885                         if (cparent->focus == NULL)
886                                 cparent->focus = cparent->array[i];
887                         continue;
888                 }
889                 error = hammer2_chain_create(trans,
890                                              &cparent->array[i],
891                                              &cluster->array[i],
892                                              key, keybits, type, bytes);
893                 KKASSERT(error == 0);
894                 if (cparent->focus == NULL)
895                         cparent->focus = cparent->array[i];
896                 if (cluster->focus == NULL)
897                         cluster->focus = cluster->array[i];
898         }
899         cluster->nchains = i;
900         *clusterp = cluster;
901
902         return error;
903 }
904
905 /*
906  * Duplicate a cluster under a new parent.
907  *
908  * WARNING! Unlike hammer2_chain_duplicate(), only the key and keybits fields
909  *          are used from a passed-in non-NULL bref pointer.  All other fields
910  *          are extracted from the original chain for each chain in the
911  *          iteration.
912  */
913 void
914 hammer2_cluster_duplicate(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
915                           hammer2_cluster_t *cluster, hammer2_blockref_t *bref,
916                           int snapshot, int duplicate_reason)
917 {
918         hammer2_chain_t *chain;
919         hammer2_blockref_t xbref;
920         int i;
921
922         cluster->focus = NULL;
923         cparent->focus = NULL;
924
925         for (i = 0; i < cluster->nchains; ++i) {
926                 chain = cluster->array[i];
927                 if (chain) {
928                         if (bref) {
929                                 xbref = chain->bref;
930                                 xbref.key = bref->key;
931                                 xbref.keybits = bref->keybits;
932                                 hammer2_chain_duplicate(trans,
933                                                         &cparent->array[i],
934                                                         &chain, &xbref,
935                                                         snapshot,
936                                                         duplicate_reason);
937                         } else {
938                                 hammer2_chain_duplicate(trans,
939                                                         &cparent->array[i],
940                                                         &chain, NULL,
941                                                         snapshot,
942                                                         duplicate_reason);
943                         }
944                         cluster->array[i] = chain;
945                         if (cluster->focus == NULL)
946                                 cluster->focus = chain;
947                         if (cparent->focus == NULL)
948                                 cparent->focus = cparent->array[i];
949                 } else {
950                         if (cparent->focus == NULL)
951                                 cparent->focus = cparent->array[i];
952                 }
953         }
954 }
955
956 /*
957  * Delete-duplicate a cluster in-place.
958  */
959 void
960 hammer2_cluster_delete_duplicate(hammer2_trans_t *trans,
961                                  hammer2_cluster_t *cluster, int flags)
962 {
963         hammer2_chain_t *chain;
964         int i;
965
966         cluster->focus = NULL;
967         for (i = 0; i < cluster->nchains; ++i) {
968                 chain = cluster->array[i];
969                 if (chain) {
970                         hammer2_chain_delete_duplicate(trans, &chain, flags);
971                         cluster->array[i] = chain;
972                         if (cluster->focus == NULL)
973                                 cluster->focus = chain;
974                 }
975         }
976 }
977
978 /*
979  * Mark a cluster deleted
980  */
981 void
982 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
983                        int flags)
984 {
985         hammer2_chain_t *chain;
986         int i;
987
988         for (i = 0; i < cluster->nchains; ++i) {
989                 chain = cluster->array[i];
990                 if (chain)
991                         hammer2_chain_delete(trans, chain, flags);
992         }
993 }
994
995 /*
996  * Create a snapshot of the specified {parent, ochain} with the specified
997  * label.  The originating hammer2_inode must be exclusively locked for
998  * safety.
999  *
1000  * The ioctl code has already synced the filesystem.
1001  */
1002 int
1003 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1004                        hammer2_ioc_pfs_t *pfs)
1005 {
1006         hammer2_mount_t *hmp;
1007         hammer2_cluster_t *ncluster;
1008         const hammer2_inode_data_t *ipdata;
1009         hammer2_inode_data_t *wipdata;
1010         hammer2_inode_t *nip;
1011         size_t name_len;
1012         hammer2_key_t lhc;
1013         struct vattr vat;
1014         uuid_t opfs_clid;
1015         int error;
1016
1017         kprintf("snapshot %s\n", pfs->name);
1018
1019         name_len = strlen(pfs->name);
1020         lhc = hammer2_dirhash(pfs->name, name_len);
1021
1022         ipdata = &hammer2_cluster_data(ocluster)->ipdata;
1023         opfs_clid = ipdata->pfs_clid;
1024         hmp = ocluster->focus->hmp;
1025
1026         /*
1027          * Create the snapshot directory under the super-root
1028          *
1029          * Set PFS type, generate a unique filesystem id, and generate
1030          * a cluster id.  Use the same clid when snapshotting a PFS root,
1031          * which theoretically allows the snapshot to be used as part of
1032          * the same cluster (perhaps as a cache).
1033          *
1034          * Copy the (flushed) blockref array.  Theoretically we could use
1035          * chain_duplicate() but it becomes difficult to disentangle
1036          * the shared core so for now just brute-force it.
1037          */
1038         VATTR_NULL(&vat);
1039         vat.va_type = VDIR;
1040         vat.va_mode = 0755;
1041         ncluster = NULL;
1042         nip = hammer2_inode_create(trans, hmp->sroot, &vat, proc0.p_ucred,
1043                                    pfs->name, name_len, &ncluster, &error);
1044
1045         if (nip) {
1046                 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1047                 wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
1048                 kern_uuidgen(&wipdata->pfs_fsid, 1);
1049                 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSROOT)
1050                         wipdata->pfs_clid = opfs_clid;
1051                 else
1052                         kern_uuidgen(&wipdata->pfs_clid, 1);
1053                 hammer2_cluster_set_chainflags(ncluster, HAMMER2_CHAIN_PFSROOT);
1054
1055                 /* XXX hack blockset copy */
1056                 /* XXX doesn't work with real cluster */
1057                 KKASSERT(ocluster->nchains == 1);
1058                 wipdata->u.blockset = ocluster->focus->data->ipdata.u.blockset;
1059
1060                 hammer2_inode_unlock_ex(nip, ncluster);
1061         }
1062         return (error);
1063 }