102bd706866ad7ac26cac0f0bf0b9dd9f7a8d427
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
1 /*
2  * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information into a single entity.  It allows direct access to media
37  * data as long as it is not blockref array data.  Meaning, basically,
38  * just inode and file data.
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  */
48 #include <sys/cdefs.h>
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/types.h>
52 #include <sys/lock.h>
53 #include <sys/uuid.h>
54
55 #include "hammer2.h"
56
57 u_int
58 hammer2_cluster_bytes(hammer2_cluster_t *cluster)
59 {
60         return(cluster->focus->bytes);
61 }
62
63 uint8_t
64 hammer2_cluster_type(hammer2_cluster_t *cluster)
65 {
66         return(cluster->focus->bref.type);
67 }
68
69 /*
70  * NOTE: When modifying a cluster object via hammer2_cluster_wdata()
71  *       and hammer2_cluster_modsync(), remember that block array
72  *       entries are not copied to the elements of the cluster.
73  */
74 const hammer2_media_data_t *
75 hammer2_cluster_data(hammer2_cluster_t *cluster)
76 {
77         return(cluster->focus->data);
78 }
79
80 hammer2_media_data_t *
81 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
82 {
83         return(cluster->focus->data);
84 }
85
86 int
87 hammer2_cluster_modified(hammer2_cluster_t *cluster)
88 {
89         return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
90 }
91
92 int
93 hammer2_cluster_unlinked(hammer2_cluster_t *cluster)
94 {
95         return((cluster->focus->flags & HAMMER2_CHAIN_UNLINKED) != 0);
96 }
97
98 /*
99  * Return a bref representative of the cluster.  Any data offset is removed
100  * (since it would only be applicable to a particular chain in the cluster).
101  *
102  * However, the radix portion of data_off is used for many purposes and will
103  * be retained.
104  */
105 void
106 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
107 {
108         *bref = cluster->focus->bref;
109         bref->data_off &= HAMMER2_OFF_MASK_RADIX;
110 }
111
112 void
113 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
114 {
115         hammer2_chain_t *chain;
116         int i;
117
118         for (i = 0; i < cluster->nchains; ++i) {
119                 chain = cluster->array[i];
120                 if (chain)
121                         atomic_set_int(&chain->flags, flags);
122         }
123 }
124
125 void
126 hammer2_cluster_setsubmod(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
127 {
128         hammer2_chain_t *chain;
129         int i;
130
131         for (i = 0; i < cluster->nchains; ++i) {
132                 chain = cluster->array[i];
133                 if (chain)
134                         hammer2_chain_setsubmod(trans, chain);
135         }
136 }
137
138 /*
139  * Create a cluster with one ref from the specified chain.  The chain
140  * is not further referenced.  The caller typically supplies a locked
141  * chain and transfers ownership to the cluster.
142  */
143 hammer2_cluster_t *
144 hammer2_cluster_from_chain(hammer2_chain_t *chain)
145 {
146         hammer2_cluster_t *cluster;
147
148         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
149         cluster->array[0] = chain;
150         cluster->nchains = 1;
151         cluster->focus = chain;
152         cluster->pmp = chain->pmp;              /* can be NULL */
153         cluster->refs = 1;
154
155         return cluster;
156 }
157
158 /*
159  * Allocates a cluster and its underlying chain structures.  The underlying
160  * chains will be locked.  The cluster and underlying chains will have one
161  * ref.
162  */
163 hammer2_cluster_t *
164 hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
165                       hammer2_trans_t *trans, hammer2_blockref_t *bref)
166 {
167         hammer2_cluster_t *cluster;
168         hammer2_cluster_t *rcluster;
169         hammer2_chain_t *chain;
170         u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
171         int i;
172
173         KKASSERT(pmp != NULL);
174
175         /*
176          * Construct the appropriate system structure.
177          */
178         switch(bref->type) {
179         case HAMMER2_BREF_TYPE_INODE:
180         case HAMMER2_BREF_TYPE_INDIRECT:
181         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
182         case HAMMER2_BREF_TYPE_DATA:
183         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
184                 /*
185                  * Chain's are really only associated with the hmp but we
186                  * maintain a pmp association for per-mount memory tracking
187                  * purposes.  The pmp can be NULL.
188                  */
189                 break;
190         case HAMMER2_BREF_TYPE_VOLUME:
191         case HAMMER2_BREF_TYPE_FREEMAP:
192                 chain = NULL;
193                 panic("hammer2_cluster_alloc volume type illegal for op");
194         default:
195                 chain = NULL;
196                 panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
197                       bref->type);
198         }
199
200         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
201         cluster->refs = 1;
202
203         rcluster = &pmp->iroot->cluster;
204         for (i = 0; i < rcluster->nchains; ++i) {
205                 chain = hammer2_chain_alloc(rcluster->array[i]->hmp, pmp,
206                                             trans, bref);
207                 chain->pmp = pmp;
208                 chain->hmp = rcluster->array[i]->hmp;
209                 chain->bref = *bref;
210                 chain->bytes = bytes;
211                 chain->refs = 1;
212                 chain->flags = HAMMER2_CHAIN_ALLOCATED;
213                 chain->delete_tid = HAMMER2_MAX_TID;
214
215                 /*
216                  * Set modify_tid if a transaction is creating the inode.
217                  * Enforce update_lo = 0 so nearby transactions do not think
218                  * it has been flushed when it hasn't.
219                  *
220                  * NOTE: When loading a chain from backing store or creating a
221                  *       snapshot, trans will be NULL and the caller is
222                  *       responsible for setting these fields.
223                  */
224                 if (trans) {
225                         chain->modify_tid = trans->sync_tid;
226                         chain->update_lo = 0;
227                 }
228                 cluster->array[i] = chain;
229         }
230         cluster->nchains = i;
231         cluster->pmp = pmp;
232         cluster->focus = cluster->array[0];
233
234         return (cluster);
235 }
236
237 /*
238  * Associate an existing core with the chain or allocate a new core.
239  *
240  * The core is not locked.  No additional refs on the chain are made.
241  * (trans) must not be NULL if (core) is not NULL.
242  *
243  * When chains are delete-duplicated during flushes we insert nchain on
244  * the ownerq after ochain instead of at the end in order to give the
245  * drop code visibility in the correct order, otherwise drops can be missed.
246  */
247 void
248 hammer2_cluster_core_alloc(hammer2_trans_t *trans,
249                            hammer2_cluster_t *ncluster,
250                            hammer2_cluster_t *ocluster)
251 {
252         int i;
253
254         for (i = 0; i < ocluster->nchains; ++i) {
255                 if (ncluster->array[i]) {
256                         hammer2_chain_core_alloc(trans,
257                                                  ncluster->array[i],
258                                                  ocluster->array[i]);
259                 }
260         }
261 }
262
263 /*
264  * Add a reference to a cluster.
265  *
266  * We must also ref the underlying chains in order to allow ref/unlock
267  * sequences to later re-lock.
268  */
269 void
270 hammer2_cluster_ref(hammer2_cluster_t *cluster)
271 {
272         hammer2_chain_t *chain;
273         int i;
274
275         atomic_add_int(&cluster->refs, 1);
276         for (i = 0; i < cluster->nchains; ++i) {
277                 chain = cluster->array[i];
278                 if (chain)
279                         hammer2_chain_ref(chain);
280         }
281 }
282
283 /*
284  * Drop the caller's reference to the cluster.  When the ref count drops to
285  * zero this function frees the cluster and drops all underlying chains.
286  */
287 void
288 hammer2_cluster_drop(hammer2_cluster_t *cluster)
289 {
290         hammer2_chain_t *chain;
291         int i;
292
293         KKASSERT(cluster->refs > 0);
294         for (i = 0; i < cluster->nchains; ++i) {
295                 chain = cluster->array[i];
296                 if (chain) {
297                         hammer2_chain_drop(chain);
298                         if (cluster->refs == 1)
299                                 cluster->array[i] = NULL;
300                 }
301         }
302         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
303                 cluster->focus = NULL;
304                 kfree(cluster, M_HAMMER2);
305                 /* cluster = NULL; safety */
306         }
307 }
308
309 void
310 hammer2_cluster_wait(hammer2_cluster_t *cluster)
311 {
312         tsleep(cluster->focus, 0, "h2clcw", 1);
313 }
314
315 /*
316  * Lock and ref a cluster.  This adds a ref to the cluster and its chains
317  * and then locks them.
318  */
319 int
320 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
321 {
322         hammer2_chain_t *chain;
323         int i;
324         int error;
325
326         error = 0;
327         atomic_add_int(&cluster->refs, 1);
328         for (i = 0; i < cluster->nchains; ++i) {
329                 chain = cluster->array[i];
330                 if (chain) {
331                         error = hammer2_chain_lock(chain, how);
332                         if (error) {
333                                 while (--i >= 0)
334                                         hammer2_chain_unlock(cluster->array[i]);
335                                 atomic_add_int(&cluster->refs, -1);
336                                 break;
337                         }
338                 }
339         }
340         return error;
341 }
342
343 /*
344  * Replace the contents of dst with src, adding a reference to src's chains.
345  * dst is assumed to already have a ref and any chains present in dst are
346  * assumed to be locked and will be unlocked.
347  *
348  * If the chains in src are locked, only one of (src) or (dst) should be
349  * considered locked by the caller after return, not both.
350  */
351 void
352 hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src)
353 {
354         hammer2_chain_t *chain;
355         int i;
356
357         KKASSERT(dst->refs == 1);
358         dst->focus = NULL;
359
360         for (i = 0; i < src->nchains; ++i) {
361                 chain = src->array[i];
362                 if (chain) {
363                         hammer2_chain_ref(chain);
364                         if (i < dst->nchains && dst->array[i])
365                                 hammer2_chain_unlock(dst->array[i]);
366                         dst->array[i] = chain;
367                         if (dst->focus == NULL)
368                                 dst->focus = chain;
369                 }
370         }
371         while (i < dst->nchains) {
372                 chain = dst->array[i];
373                 if (chain) {
374                         hammer2_chain_unlock(chain);
375                         dst->array[i] = NULL;
376                 }
377                 ++i;
378         }
379         dst->nchains = src->nchains;
380 }
381
382 /*
383  * Replace the contents of the locked destination with the contents of the
384  * locked source.  Destination must have one ref.
385  *
386  * Returns with the destination still with one ref and the copied chains
387  * with an additional lock (representing their state on the destination).
388  * The original chains associated with the destination are unlocked.
389  */
390 void
391 hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
392 {
393         hammer2_chain_t *chain;
394         int i;
395
396         KKASSERT(dst->refs == 1);
397
398         dst->focus = NULL;
399         for (i = 0; i < src->nchains; ++i) {
400                 chain = src->array[i];
401                 if (chain) {
402                         hammer2_chain_lock(chain, 0);
403                         if (i < dst->nchains && dst->array[i])
404                                 hammer2_chain_unlock(dst->array[i]);
405                         dst->array[i] = src->array[i];
406                         if (dst->focus == NULL)
407                                 dst->focus = chain;
408                 }
409         }
410         while (i < dst->nchains) {
411                 chain = dst->array[i];
412                 if (chain) {
413                         hammer2_chain_unlock(chain);
414                         dst->array[i] = NULL;
415                 }
416                 ++i;
417         }
418         dst->nchains = src->nchains;
419 }
420
421 /*
422  * Copy a cluster, returned a ref'd cluster.  All underlying chains
423  * are also ref'd, but not locked.
424  *
425  * If HAMMER2_CLUSTER_COPY_CHAINS is specified, the chains are copied
426  * to the new cluster and a reference is nominally added to them and to
427  * the cluster.  The cluster will have 1 ref.
428  *
429  * If HAMMER2_CLUSTER_COPY_NOREF is specified along with CHAINS, the chains
430  * are copied but no additional references are made and the cluster will have
431  * 0 refs.  Callers must ref the cluster and the chains before dropping it
432  * (typically by locking it).
433  *
434  * If flags are passed as 0 the copy is setup as if it contained the chains
435  * but the chains will not be copied over, and the cluster will have 0 refs.
436  * Callers must ref the cluster before dropping it (typically by locking it).
437  */
438 hammer2_cluster_t *
439 hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags)
440 {
441         hammer2_pfsmount_t *pmp = ocluster->pmp;
442         hammer2_cluster_t *ncluster;
443         int i;
444
445         ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
446         ncluster->pmp = pmp;
447         ncluster->nchains = ocluster->nchains;
448         ncluster->focus = ocluster->focus;
449         ncluster->refs = 1;
450         if (copy_flags & HAMMER2_CLUSTER_COPY_CHAINS) {
451                 if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0)
452                         ncluster->refs = 1;
453                 for (i = 0; i < ocluster->nchains; ++i) {
454                         ncluster->array[i] = ocluster->array[i];
455                         if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0 &&
456                             ncluster->array[i]) {
457                                 hammer2_chain_ref(ncluster->array[i]);
458                         }
459                 }
460         }
461         return (ncluster);
462 }
463
464 /*
465  * Unlock and deref a cluster.  The cluster is destroyed if this is the
466  * last ref.
467  */
468 void
469 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
470 {
471         hammer2_chain_t *chain;
472         int i;
473
474         KKASSERT(cluster->refs > 0);
475         for (i = 0; i < cluster->nchains; ++i) {
476                 chain = cluster->array[i];
477                 if (chain) {
478                         hammer2_chain_unlock(chain);
479                         if (cluster->refs == 1)
480                                 cluster->array[i] = NULL;       /* safety */
481                 }
482         }
483         if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
484                 cluster->focus = NULL;
485                 kfree(cluster, M_HAMMER2);
486                 /* cluster = NULL; safety */
487         }
488 }
489
490 /*
491  * Refactor the locked chains of a cluster.
492  */
493 void
494 hammer2_cluster_refactor(hammer2_cluster_t *cluster)
495 {
496         int i;
497
498         cluster->focus = NULL;
499         for (i = 0; i < cluster->nchains; ++i) {
500                 if (cluster->array[i]) {
501                         hammer2_chain_refactor(&cluster->array[i]);
502                         if (cluster->focus == NULL)
503                                 cluster->focus = cluster->array[i];
504                 }
505         }
506 }
507
508 /*
509  * Resize the cluster's physical storage allocation in-place.  This may
510  * replace the cluster's chains.
511  */
512 void
513 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
514                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
515                        int nradix, int flags)
516 {
517         int i;
518
519         KKASSERT(cparent->pmp == cluster->pmp);         /* can be NULL */
520         KKASSERT(cparent->nchains == cluster->nchains);
521
522         cluster->focus = NULL;
523         for (i = 0; i < cluster->nchains; ++i) {
524                 if (cluster->array[i]) {
525                         KKASSERT(cparent->array[i]);
526                         hammer2_chain_resize(trans, ip,
527                                              cparent->array[i],
528                                              &cluster->array[i],
529                                              nradix, flags);
530                         if (cluster->focus == NULL)
531                                 cluster->focus = cluster->array[i];
532                 }
533         }
534 }
535
536 /*
537  * Set an inode's cluster modified, marking the related chains RW and
538  * duplicating them if necessary.
539  *
540  * The passed-in chain is a localized copy of the chain previously acquired
541  * when the inode was locked (and possilby replaced in the mean time), and
542  * must also be updated.  In fact, we update it first and then synchronize
543  * the inode's cluster cache.
544  */
545 hammer2_inode_data_t *
546 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
547                           hammer2_cluster_t *cluster, int flags)
548 {
549         atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
550         hammer2_cluster_modify(trans, cluster, flags);
551
552         hammer2_inode_repoint(ip, NULL, cluster);
553         if (ip->vp)
554                 vsetisdirty(ip->vp);
555         return (&hammer2_cluster_wdata(cluster)->ipdata);
556 }
557
558 /*
559  * Adjust the cluster's chains to allow modification.
560  */
561 void
562 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
563                        int flags)
564 {
565         int i;
566
567         cluster->focus = NULL;
568         for (i = 0; i < cluster->nchains; ++i) {
569                 if (cluster->array[i]) {
570                         hammer2_chain_modify(trans, &cluster->array[i], flags);
571                         if (cluster->focus == NULL)
572                                 cluster->focus = cluster->array[i];
573                 }
574         }
575 }
576
577 /*
578  * Synchronize modifications with other chains in a cluster.
579  *
580  * Nominal front-end operations only edit non-block-table data in a single
581  * chain.  This code copies such modifications to the other chains in the
582  * cluster.
583  */
584 /* hammer2_cluster_modsync() */
585
586 void
587 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
588 {
589         hammer2_chain_t *focus;
590         hammer2_chain_t *scan;
591         const hammer2_inode_data_t *ripdata;
592         hammer2_inode_data_t *wipdata;
593         int i;
594
595         focus = cluster->focus;
596         KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
597
598         for (i = 0; i < cluster->nchains; ++i) {
599                 scan = cluster->array[i];
600                 if (scan == NULL || scan == focus)
601                         continue;
602                 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
603                 KKASSERT(focus->bytes == scan->bytes &&
604                          focus->bref.type == scan->bref.type);
605                 switch(focus->bref.type) {
606                 case HAMMER2_BREF_TYPE_INODE:
607                         ripdata = &focus->data->ipdata;
608                         wipdata = &scan->data->ipdata;
609                         if ((ripdata->op_flags &
610                             HAMMER2_OPFLAG_DIRECTDATA) == 0) {
611                                 bcopy(ripdata, wipdata,
612                                       offsetof(hammer2_inode_data_t, u));
613                                 break;
614                         }
615                         /* fall through */
616                 case HAMMER2_BREF_TYPE_DATA:
617                         bcopy(focus->data, scan->data, focus->bytes);
618                         break;
619                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
620                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
621                 case HAMMER2_BREF_TYPE_FREEMAP:
622                 case HAMMER2_BREF_TYPE_VOLUME:
623                         panic("hammer2_cluster_modsync: illegal node type");
624                         /* NOT REACHED */
625                         break;
626                 default:
627                         panic("hammer2_cluster_modsync: unknown node type");
628                         break;
629                 }
630         }
631 }
632
633 /*
634  * Lookup initialization/completion API
635  */
636 hammer2_cluster_t *
637 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
638 {
639         hammer2_cluster_t *cluster;
640         int i;
641
642         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
643         cluster->pmp = cparent->pmp;                    /* can be NULL */
644         /* cluster->focus = NULL; already null */
645
646         for (i = 0; i < cparent->nchains; ++i) {
647                 cluster->array[i] = cparent->array[i];
648                 if (cluster->focus == NULL)
649                         cluster->focus = cluster->array[i];
650         }
651         cluster->nchains = cparent->nchains;
652
653         /*
654          * Independently lock (this will also give cluster 1 ref)
655          */
656         if (flags & HAMMER2_LOOKUP_SHARED) {
657                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
658                                               HAMMER2_RESOLVE_SHARED);
659         } else {
660                 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
661         }
662         return (cluster);
663 }
664
665 void
666 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
667 {
668         if (cparent)
669                 hammer2_cluster_unlock(cparent);
670 }
671
672 /*
673  * Locate first match or overlap under parent, return a new cluster
674  */
675 hammer2_cluster_t *
676 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
677                      hammer2_key_t key_beg, hammer2_key_t key_end,
678                      int flags, int *ddflagp)
679 {
680         hammer2_pfsmount_t *pmp;
681         hammer2_cluster_t *cluster;
682         hammer2_chain_t *chain;
683         hammer2_key_t key_accum;
684         hammer2_key_t key_next;
685         int null_count;
686         int ddflag;
687         int i;
688         uint8_t bref_type;
689         u_int bytes;
690
691         pmp = cparent->pmp;                             /* can be NULL */
692         key_accum = *key_nextp;
693         null_count = 0;
694         bref_type = 0;
695         bytes = 0;
696
697         cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
698         cluster->pmp = pmp;                             /* can be NULL */
699         cluster->refs = 1;
700         /* cluster->focus = NULL; already null */
701         cparent->focus = NULL;
702         *ddflagp = 0;
703
704         for (i = 0; i < cparent->nchains; ++i) {
705                 key_next = *key_nextp;
706                 if (cparent->array[i] == NULL) {
707                         ++null_count;
708                         continue;
709                 }
710                 chain = hammer2_chain_lookup(&cparent->array[i], &key_next,
711                                              key_beg, key_end,
712                                              &cparent->cache_index[i],
713                                              flags, &ddflag);
714                 if (cparent->focus == NULL)
715                         cparent->focus = cparent->array[i];
716                 cluster->array[i] = chain;
717                 if (chain == NULL) {
718                         ++null_count;
719                 } else {
720                         if (cluster->focus == NULL) {
721                                 bref_type = chain->bref.type;
722                                 bytes = chain->bytes;
723                                 *ddflagp = ddflag;
724                                 cluster->focus = chain;
725                         }
726                         KKASSERT(bref_type == chain->bref.type);
727                         KKASSERT(bytes == chain->bytes);
728                         KKASSERT(*ddflagp == ddflag);
729                 }
730                 if (key_accum > key_next)
731                         key_accum = key_next;
732         }
733         *key_nextp = key_accum;
734         cluster->nchains = i;
735
736         if (null_count == i) {
737                 hammer2_cluster_drop(cluster);
738                 cluster = NULL;
739         }
740
741         return (cluster);
742 }
743
744 /*
745  * Locate next match or overlap under parent, replace cluster
746  */
747 hammer2_cluster_t *
748 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
749                      hammer2_key_t *key_nextp,
750                      hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
751 {
752         hammer2_chain_t *chain;
753         hammer2_key_t key_accum;
754         hammer2_key_t key_next;
755         int null_count;
756         int i;
757
758         key_accum = *key_nextp;
759         null_count = 0;
760         cluster->focus = NULL;
761         cparent->focus = NULL;
762
763         for (i = 0; i < cparent->nchains; ++i) {
764                 key_next = *key_nextp;
765                 chain = cluster->array[i];
766                 if (chain == NULL) {
767                         if (cparent->focus == NULL)
768                                 cparent->focus = cparent->array[i];
769                         ++null_count;
770                         continue;
771                 }
772                 if (cparent->array[i] == NULL) {
773                         if (flags & HAMMER2_LOOKUP_NOLOCK)
774                                 hammer2_chain_drop(chain);
775                         else
776                                 hammer2_chain_unlock(chain);
777                         ++null_count;
778                         continue;
779                 }
780                 chain = hammer2_chain_next(&cparent->array[i], chain,
781                                            &key_next, key_beg, key_end,
782                                            &cparent->cache_index[i], flags);
783                 if (cparent->focus == NULL)
784                         cparent->focus = cparent->array[i];
785                 cluster->array[i] = chain;
786                 if (chain == NULL) {
787                         ++null_count;
788                 } else if (cluster->focus == NULL) {
789                         cluster->focus = chain;
790                 }
791                 if (key_accum > key_next)
792                         key_accum = key_next;
793         }
794
795         if (null_count == i) {
796                 hammer2_cluster_drop(cluster);
797                 cluster = NULL;
798         }
799         return(cluster);
800 }
801
802 #if 0
803 /*
804  * XXX initial NULL cluster needs reworking (pass **clusterp ?)
805  *
806  * The raw scan function is similar to lookup/next but does not seek to a key.
807  * Blockrefs are iterated via first_chain = (parent, NULL) and
808  * next_chain = (parent, chain).
809  *
810  * The passed-in parent must be locked and its data resolved.  The returned
811  * chain will be locked.  Pass chain == NULL to acquire the first sub-chain
812  * under parent and then iterate with the passed-in chain (which this
813  * function will unlock).
814  */
815 hammer2_cluster_t *
816 hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
817                      int flags)
818 {
819         hammer2_chain_t *chain;
820         int null_count;
821         int i;
822
823         null_count = 0;
824
825         for (i = 0; i < cparent->nchains; ++i) {
826                 chain = cluster->array[i];
827                 if (chain == NULL) {
828                         ++null_count;
829                         continue;
830                 }
831                 if (cparent->array[i] == NULL) {
832                         if (flags & HAMMER2_LOOKUP_NOLOCK)
833                                 hammer2_chain_drop(chain);
834                         else
835                                 hammer2_chain_unlock(chain);
836                         ++null_count;
837                         continue;
838                 }
839
840                 chain = hammer2_chain_scan(cparent->array[i], chain,
841                                            &cparent->cache_index[i], flags);
842                 cluster->array[i] = chain;
843                 if (chain == NULL)
844                         ++null_count;
845         }
846
847         if (null_count == i) {
848                 hammer2_cluster_drop(cluster);
849                 cluster = NULL;
850         }
851         return(cluster);
852 }
853
854 #endif
855
856 /*
857  * Create a new cluster using the specified key
858  */
859 int
860 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
861                      hammer2_cluster_t **clusterp,
862                      hammer2_key_t key, int keybits, int type, size_t bytes)
863 {
864         hammer2_cluster_t *cluster;
865         hammer2_pfsmount_t *pmp;
866         int error;
867         int i;
868
869         pmp = trans->pmp;                               /* can be NULL */
870
871         if ((cluster = *clusterp) == NULL) {
872                 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
873                                   M_WAITOK | M_ZERO);
874                 cluster->pmp = pmp;                     /* can be NULL */
875                 cluster->refs = 1;
876         }
877         cluster->focus = NULL;
878         cparent->focus = NULL;
879
880         /*
881          * NOTE: cluster->array[] entries can initially be NULL.  If
882          *       *clusterp is supplied, skip NULL entries, otherwise
883          *       create new chains.
884          */
885         for (i = 0; i < cparent->nchains; ++i) {
886                 if (*clusterp && cluster->array[i] == NULL) {
887                         if (cparent->focus == NULL)
888                                 cparent->focus = cparent->array[i];
889                         continue;
890                 }
891                 error = hammer2_chain_create(trans,
892                                              &cparent->array[i],
893                                              &cluster->array[i],
894                                              key, keybits, type, bytes);
895                 KKASSERT(error == 0);
896                 if (cparent->focus == NULL)
897                         cparent->focus = cparent->array[i];
898                 if (cluster->focus == NULL)
899                         cluster->focus = cluster->array[i];
900         }
901         cluster->nchains = i;
902         *clusterp = cluster;
903
904         return error;
905 }
906
907 /*
908  * Duplicate a cluster under a new parent.
909  *
910  * WARNING! Unlike hammer2_chain_duplicate(), only the key and keybits fields
911  *          are used from a passed-in non-NULL bref pointer.  All other fields
912  *          are extracted from the original chain for each chain in the
913  *          iteration.
914  */
915 void
916 hammer2_cluster_duplicate(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
917                           hammer2_cluster_t *cluster, hammer2_blockref_t *bref,
918                           int snapshot, int duplicate_reason)
919 {
920         hammer2_chain_t *chain;
921         hammer2_blockref_t xbref;
922         int i;
923
924         cluster->focus = NULL;
925         cparent->focus = NULL;
926
927         for (i = 0; i < cluster->nchains; ++i) {
928                 chain = cluster->array[i];
929                 if (chain) {
930                         if (bref) {
931                                 xbref = chain->bref;
932                                 xbref.key = bref->key;
933                                 xbref.keybits = bref->keybits;
934                                 hammer2_chain_duplicate(trans,
935                                                         &cparent->array[i],
936                                                         &chain, &xbref,
937                                                         snapshot,
938                                                         duplicate_reason);
939                         } else {
940                                 hammer2_chain_duplicate(trans,
941                                                         &cparent->array[i],
942                                                         &chain, NULL,
943                                                         snapshot,
944                                                         duplicate_reason);
945                         }
946                         cluster->array[i] = chain;
947                         if (cluster->focus == NULL)
948                                 cluster->focus = chain;
949                         if (cparent->focus == NULL)
950                                 cparent->focus = cparent->array[i];
951                 } else {
952                         if (cparent->focus == NULL)
953                                 cparent->focus = cparent->array[i];
954                 }
955         }
956 }
957
958 /*
959  * Delete-duplicate a cluster in-place.
960  */
961 void
962 hammer2_cluster_delete_duplicate(hammer2_trans_t *trans,
963                                  hammer2_cluster_t *cluster, int flags)
964 {
965         hammer2_chain_t *chain;
966         int i;
967
968         cluster->focus = NULL;
969         for (i = 0; i < cluster->nchains; ++i) {
970                 chain = cluster->array[i];
971                 if (chain) {
972                         hammer2_chain_delete_duplicate(trans, &chain, flags);
973                         cluster->array[i] = chain;
974                         if (cluster->focus == NULL)
975                                 cluster->focus = chain;
976                 }
977         }
978 }
979
980 /*
981  * Mark a cluster deleted
982  */
983 void
984 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
985                        int flags)
986 {
987         hammer2_chain_t *chain;
988         int i;
989
990         for (i = 0; i < cluster->nchains; ++i) {
991                 chain = cluster->array[i];
992                 if (chain)
993                         hammer2_chain_delete(trans, chain, flags);
994         }
995 }
996
997 /*
998  * Create a snapshot of the specified {parent, ochain} with the specified
999  * label.  The originating hammer2_inode must be exclusively locked for
1000  * safety.
1001  *
1002  * The ioctl code has already synced the filesystem.
1003  */
1004 int
1005 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1006                        hammer2_ioc_pfs_t *pfs)
1007 {
1008         hammer2_mount_t *hmp;
1009         hammer2_cluster_t *ncluster;
1010         const hammer2_inode_data_t *ipdata;
1011         hammer2_inode_data_t *wipdata;
1012         hammer2_inode_t *nip;
1013         size_t name_len;
1014         hammer2_key_t lhc;
1015         struct vattr vat;
1016         uuid_t opfs_clid;
1017         int error;
1018
1019         kprintf("snapshot %s\n", pfs->name);
1020
1021         name_len = strlen(pfs->name);
1022         lhc = hammer2_dirhash(pfs->name, name_len);
1023
1024         ipdata = &hammer2_cluster_data(ocluster)->ipdata;
1025         opfs_clid = ipdata->pfs_clid;
1026         hmp = ocluster->focus->hmp;
1027
1028         /*
1029          * Create the snapshot directory under the super-root
1030          *
1031          * Set PFS type, generate a unique filesystem id, and generate
1032          * a cluster id.  Use the same clid when snapshotting a PFS root,
1033          * which theoretically allows the snapshot to be used as part of
1034          * the same cluster (perhaps as a cache).
1035          *
1036          * Copy the (flushed) blockref array.  Theoretically we could use
1037          * chain_duplicate() but it becomes difficult to disentangle
1038          * the shared core so for now just brute-force it.
1039          */
1040         VATTR_NULL(&vat);
1041         vat.va_type = VDIR;
1042         vat.va_mode = 0755;
1043         ncluster = NULL;
1044         nip = hammer2_inode_create(trans, hmp->sroot, &vat, proc0.p_ucred,
1045                                    pfs->name, name_len, &ncluster, &error);
1046
1047         if (nip) {
1048                 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1049                 wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
1050                 kern_uuidgen(&wipdata->pfs_fsid, 1);
1051                 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSROOT)
1052                         wipdata->pfs_clid = opfs_clid;
1053                 else
1054                         kern_uuidgen(&wipdata->pfs_clid, 1);
1055                 hammer2_cluster_set_chainflags(ncluster, HAMMER2_CHAIN_PFSROOT);
1056
1057                 /* XXX hack blockset copy */
1058                 /* XXX doesn't work with real cluster */
1059                 KKASSERT(ocluster->nchains == 1);
1060                 wipdata->u.blockset = ocluster->focus->data->ipdata.u.blockset;
1061
1062                 hammer2_inode_unlock_ex(nip, ncluster);
1063         }
1064         return (error);
1065 }