hammer2 - update documentation, begin working on callback I/O
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
... / ...
CommitLineData
1/*
2 * Copyright (c) 2013-2014 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34/*
35 * The cluster module collects multiple chains representing the same
36 * information into a single entity. It allows direct access to media
37 * data as long as it is not blockref array data. Meaning, basically,
38 * just inode and file data.
39 *
40 * This module also handles I/O dispatch, status rollup, and various
41 * mastership arrangements including quorum operations. It effectively
42 * presents one topology to the vnops layer.
43 *
44 * Many of the API calls mimic chain API calls but operate on clusters
45 * instead of chains. Please see hammer2_chain.c for more complete code
46 * documentation of the API functions.
47 */
48#include <sys/cdefs.h>
49#include <sys/param.h>
50#include <sys/systm.h>
51#include <sys/types.h>
52#include <sys/lock.h>
53#include <sys/uuid.h>
54
55#include "hammer2.h"
56
57/*
58 * Returns TRUE if any chain in the cluster needs to be resized.
59 */
60int
61hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
62{
63 hammer2_chain_t *chain;
64 int i;
65
66 for (i = 0; i < cluster->nchains; ++i) {
67 chain = cluster->array[i];
68 if (chain && chain->bytes != bytes)
69 return 1;
70 }
71 return 0;
72}
73
74uint8_t
75hammer2_cluster_type(hammer2_cluster_t *cluster)
76{
77 return(cluster->focus->bref.type);
78}
79
80int
81hammer2_cluster_modified(hammer2_cluster_t *cluster)
82{
83 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
84}
85
86/*
87 * Return a bref representative of the cluster. Any data offset is removed
88 * (since it would only be applicable to a particular chain in the cluster).
89 *
90 * However, the radix portion of data_off is used for many purposes and will
91 * be retained.
92 */
93void
94hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
95{
96 *bref = cluster->focus->bref;
97 bref->data_off &= HAMMER2_OFF_MASK_RADIX;
98}
99
100void
101hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
102{
103 hammer2_chain_t *chain;
104 int i;
105
106 for (i = 0; i < cluster->nchains; ++i) {
107 chain = cluster->array[i];
108 if (chain)
109 atomic_set_int(&chain->flags, flags);
110 }
111}
112
113void
114hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
115{
116 hammer2_chain_t *chain;
117 int i;
118
119 for (i = 0; i < cluster->nchains; ++i) {
120 chain = cluster->array[i];
121 if (chain)
122 hammer2_chain_setflush(trans, chain);
123 }
124}
125
126void
127hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
128 hammer2_cluster_t *cluster,
129 int check_algo)
130{
131 hammer2_chain_t *chain;
132 int i;
133
134 for (i = 0; i < cluster->nchains; ++i) {
135 chain = cluster->array[i];
136 if (chain) {
137 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
138 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
139 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
140 }
141 }
142}
143
144/*
145 * Create a cluster with one ref from the specified chain. The chain
146 * is not further referenced. The caller typically supplies a locked
147 * chain and transfers ownership to the cluster.
148 */
149hammer2_cluster_t *
150hammer2_cluster_from_chain(hammer2_chain_t *chain)
151{
152 hammer2_cluster_t *cluster;
153
154 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
155 cluster->array[0] = chain;
156 cluster->nchains = 1;
157 cluster->focus = chain;
158 cluster->pmp = chain->pmp;
159 cluster->refs = 1;
160
161 return cluster;
162}
163
164/*
165 * Allocates a cluster and its underlying chain structures. The underlying
166 * chains will be locked. The cluster and underlying chains will have one
167 * ref.
168 */
169hammer2_cluster_t *
170hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
171 hammer2_trans_t *trans, hammer2_blockref_t *bref)
172{
173 hammer2_cluster_t *cluster;
174 hammer2_cluster_t *rcluster;
175 hammer2_chain_t *chain;
176#if 0
177 u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
178#endif
179 int i;
180
181 KKASSERT(pmp != NULL);
182
183 /*
184 * Construct the appropriate system structure.
185 */
186 switch(bref->type) {
187 case HAMMER2_BREF_TYPE_INODE:
188 case HAMMER2_BREF_TYPE_INDIRECT:
189 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
190 case HAMMER2_BREF_TYPE_DATA:
191 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
192 /*
193 * Chain's are really only associated with the hmp but we
194 * maintain a pmp association for per-mount memory tracking
195 * purposes. The pmp can be NULL.
196 */
197 break;
198 case HAMMER2_BREF_TYPE_VOLUME:
199 case HAMMER2_BREF_TYPE_FREEMAP:
200 chain = NULL;
201 panic("hammer2_cluster_alloc volume type illegal for op");
202 default:
203 chain = NULL;
204 panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
205 bref->type);
206 }
207
208 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
209 cluster->refs = 1;
210
211 rcluster = &pmp->iroot->cluster;
212 for (i = 0; i < rcluster->nchains; ++i) {
213 chain = hammer2_chain_alloc(rcluster->array[i]->hmp,
214 pmp, trans, bref);
215#if 0
216 chain->hmp = rcluster->array[i]->hmp;
217 chain->bref = *bref;
218 chain->bytes = bytes;
219 chain->refs = 1;
220 chain->flags = HAMMER2_CHAIN_ALLOCATED;
221#endif
222
223 /*
224 * NOTE: When loading a chain from backing store or creating a
225 * snapshot, trans will be NULL and the caller is
226 * responsible for setting these fields.
227 */
228 cluster->array[i] = chain;
229 }
230 cluster->nchains = i;
231 cluster->pmp = pmp;
232 cluster->focus = cluster->array[0];
233
234 return (cluster);
235}
236
237/*
238 * Add a reference to a cluster.
239 *
240 * We must also ref the underlying chains in order to allow ref/unlock
241 * sequences to later re-lock.
242 */
243void
244hammer2_cluster_ref(hammer2_cluster_t *cluster)
245{
246 hammer2_chain_t *chain;
247 int i;
248
249 atomic_add_int(&cluster->refs, 1);
250 for (i = 0; i < cluster->nchains; ++i) {
251 chain = cluster->array[i];
252 if (chain)
253 hammer2_chain_ref(chain);
254 }
255}
256
/*
 * Drop the caller's reference to the cluster.  When the ref count drops to
 * zero this function frees the cluster and drops all underlying chains.
 *
 * In-progress read I/Os are typically detached from the cluster once the
 * first one returns (the remaining stay attached to the DIOs but are then
 * ignored and drop naturally).
 */
void
hammer2_cluster_drop(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain) {
			hammer2_chain_drop(chain);
			/*
			 * When the caller holds the last ref the structure
			 * is about to be freed, so also clear the slot as
			 * a safety (no other thread should hold a ref at
			 * this point).  NOTE(review): the refs==1 test is
			 * performed before the atomic decrement below --
			 * confirm callers never race the final drop.
			 */
			if (cluster->refs == 1)
				cluster->array[i] = NULL;
		}
	}
	/* Free the cluster itself on the 1->0 transition */
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;
		kfree(cluster, M_HAMMER2);
		/* cluster = NULL; safety */
	}
}
286
/*
 * Brief wait helper: sleep on the cluster's focus chain for at most one
 * tick ("h2clcw" wchan).  Callers are expected to re-test their own
 * condition in a loop; this is a crude poll/backoff, not a precise
 * event wait.
 */
void
hammer2_cluster_wait(hammer2_cluster_t *cluster)
{
	tsleep(cluster->focus, 0, "h2clcw", 1);
}
292
293/*
294 * Lock and ref a cluster. This adds a ref to the cluster and its chains
295 * and then locks them.
296 */
297int
298hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
299{
300 hammer2_chain_t *chain;
301 int i;
302 int error;
303
304 error = 0;
305 atomic_add_int(&cluster->refs, 1);
306 for (i = 0; i < cluster->nchains; ++i) {
307 chain = cluster->array[i];
308 if (chain) {
309 error = hammer2_chain_lock(chain, how);
310 if (error) {
311 while (--i >= 0)
312 hammer2_chain_unlock(cluster->array[i]);
313 atomic_add_int(&cluster->refs, -1);
314 break;
315 }
316 }
317 }
318 return error;
319}
320
/*
 * Replace the contents of dst with src, adding a reference to src's chains.
 * dst is assumed to already have a ref and any chains present in dst are
 * assumed to be locked and will be unlocked.
 *
 * If the chains in src are locked, only one of (src) or (dst) should be
 * considered locked by the caller after return, not both.
 */
void
hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(dst->refs == 1);
	dst->focus = NULL;

	/*
	 * Transfer src's chains into dst slot-by-slot, unlocking any
	 * chain previously occupying the dst slot.  The first non-NULL
	 * chain becomes the new focus.
	 *
	 * NOTE(review): when src->array[i] is NULL, the old dst chain in
	 * that slot is neither unlocked nor cleared here -- confirm this
	 * is intentional for partially-populated source clusters.
	 */
	for (i = 0; i < src->nchains; ++i) {
		chain = src->array[i];
		if (chain) {
			hammer2_chain_ref(chain);
			if (i < dst->nchains && dst->array[i])
				hammer2_chain_unlock(dst->array[i]);
			dst->array[i] = chain;
			if (dst->focus == NULL)
				dst->focus = chain;
		}
	}
	/*
	 * dst slots beyond src's element count are unlocked and cleared.
	 */
	while (i < dst->nchains) {
		chain = dst->array[i];
		if (chain) {
			hammer2_chain_unlock(chain);
			dst->array[i] = NULL;
		}
		++i;
	}
	dst->nchains = src->nchains;
}
359
/*
 * Replace the contents of the locked destination with the contents of the
 * locked source.  Destination must have one ref.
 *
 * Returns with the destination still with one ref and the copied chains
 * with an additional lock (representing their state on the destination).
 * The original chains associated with the destination are unlocked.
 */
void
hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(dst->refs == 1);

	dst->focus = NULL;
	/*
	 * Install src's chains into dst.  Each copied chain picks up an
	 * additional lock (its lock state as a dst member); any chain
	 * previously occupying the dst slot is unlocked.
	 */
	for (i = 0; i < src->nchains; ++i) {
		chain = src->array[i];
		if (chain) {
			hammer2_chain_lock(chain, 0);
			if (i < dst->nchains && dst->array[i])
				hammer2_chain_unlock(dst->array[i]);
			dst->array[i] = src->array[i];
			if (dst->focus == NULL)
				dst->focus = chain;
		}
	}
	/*
	 * dst slots beyond src's element count are unlocked and cleared.
	 */
	while (i < dst->nchains) {
		chain = dst->array[i];
		if (chain) {
			hammer2_chain_unlock(chain);
			dst->array[i] = NULL;
		}
		++i;
	}
	dst->nchains = src->nchains;
}
398
399/*
400 * Copy a cluster, returned a ref'd cluster. All underlying chains
401 * are also ref'd, but not locked.
402 *
 * Unless HAMMER2_CLUSTER_COPY_NOCHAINS is specified, the chains are copied
 * to the new cluster and a reference is nominally added to them and to
 * the cluster.  The cluster will have 1 ref.
406 *
407 * If HAMMER2_CLUSTER_COPY_NOREF is specified along with CHAINS, the chains
408 * are copied but no additional references are made and the cluster will have
409 * 0 refs. Callers must ref the cluster and the chains before dropping it
410 * (typically by locking it).
411 *
412 * If flags are passed as 0 the copy is setup as if it contained the chains
413 * but the chains will not be copied over, and the cluster will have 0 refs.
414 * Callers must ref the cluster before dropping it (typically by locking it).
415 */
416hammer2_cluster_t *
417hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags)
418{
419 hammer2_pfsmount_t *pmp = ocluster->pmp;
420 hammer2_cluster_t *ncluster;
421 hammer2_chain_t *chain;
422 int i;
423
424 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
425 ncluster->pmp = pmp;
426 ncluster->nchains = ocluster->nchains;
427 ncluster->refs = (copy_flags & HAMMER2_CLUSTER_COPY_NOREF) ? 0 : 1;
428 if ((copy_flags & HAMMER2_CLUSTER_COPY_NOCHAINS) == 0) {
429 ncluster->focus = ocluster->focus;
430 for (i = 0; i < ocluster->nchains; ++i) {
431 chain = ocluster->array[i];
432 ncluster->array[i] = chain;
433 if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0 &&
434 chain) {
435 hammer2_chain_ref(chain);
436 }
437 }
438 }
439 return (ncluster);
440}
441
/*
 * Unlock and deref a cluster.  The cluster is destroyed if this is the
 * last ref.
 */
void
hammer2_cluster_unlock(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain) {
			hammer2_chain_unlock(chain);
			/*
			 * About to destroy the cluster (caller holds the
			 * last ref), so also clear the slot.
			 * NOTE(review): refs is sampled before the atomic
			 * decrement below -- same pattern as
			 * hammer2_cluster_drop(); confirm final-unlock is
			 * never raced.
			 */
			if (cluster->refs == 1)
				cluster->array[i] = NULL;	/* safety */
		}
	}
	/* Free the cluster itself on the 1->0 transition */
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;
		kfree(cluster, M_HAMMER2);
		/* cluster = NULL; safety */
	}
}
467
468/*
469 * Resize the cluster's physical storage allocation in-place. This may
470 * replace the cluster's chains.
471 */
472void
473hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
474 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
475 int nradix, int flags)
476{
477 int i;
478
479 KKASSERT(cparent->pmp == cluster->pmp); /* can be NULL */
480 KKASSERT(cparent->nchains == cluster->nchains);
481
482 cluster->focus = NULL;
483 for (i = 0; i < cluster->nchains; ++i) {
484 if (cluster->array[i]) {
485 KKASSERT(cparent->array[i]);
486 hammer2_chain_resize(trans, ip,
487 cparent->array[i],
488 cluster->array[i],
489 nradix, flags);
490 if (cluster->focus == NULL)
491 cluster->focus = cluster->array[i];
492 }
493 }
494}
495
496/*
497 * Set an inode's cluster modified, marking the related chains RW and
498 * duplicating them if necessary.
499 *
500 * The passed-in chain is a localized copy of the chain previously acquired
 * when the inode was locked (and possibly replaced in the meantime), and
502 * must also be updated. In fact, we update it first and then synchronize
503 * the inode's cluster cache.
504 */
hammer2_inode_data_t *
hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
			  hammer2_cluster_t *cluster, int flags)
{
	/* Flag the in-memory inode modified before touching the chains */
	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
	hammer2_cluster_modify(trans, cluster, flags);

	/*
	 * Re-point the inode's cached cluster at the (possibly replaced)
	 * modified chains, then dirty the vnode so the kernel schedules
	 * a writeback.
	 */
	hammer2_inode_repoint(ip, NULL, cluster);
	if (ip->vp)
		vsetisdirty(ip->vp);
	/* Writable media inode data, taken from the cluster focus */
	return (&hammer2_cluster_wdata(cluster)->ipdata);
}
517
518/*
519 * Adjust the cluster's chains to allow modification and adjust the
520 * focus. Data will be accessible on return.
521 */
522void
523hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
524 int flags)
525{
526 int i;
527
528 cluster->focus = NULL;
529 for (i = 0; i < cluster->nchains; ++i) {
530 if (cluster->array[i]) {
531 hammer2_chain_modify(trans, cluster->array[i], flags);
532 if (cluster->focus == NULL)
533 cluster->focus = cluster->array[i];
534 }
535 }
536}
537
/*
 * Synchronize modifications from the focus to other chains in a cluster.
 * Convenient because nominal API users can just modify the contents of the
 * focus (at least for non-blockref data).
 *
 * Nominal front-end operations only edit non-block-table data in a single
 * chain.  This code copies such modifications to the other chains in the
 * cluster.  Blocktable modifications are handled on a chain-by-chain basis
 * by both the frontend and the backend and will explode in fireworks if
 * blindly copied.
 */
void
hammer2_cluster_modsync(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *focus;
	hammer2_chain_t *scan;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	int i;

	focus = cluster->focus;
	KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);

	for (i = 0; i < cluster->nchains; ++i) {
		scan = cluster->array[i];
		if (scan == NULL || scan == focus)
			continue;
		/* All targets must already be modified and congruent */
		KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
		KKASSERT(focus->bytes == scan->bytes &&
			 focus->bref.type == scan->bref.type);
		switch(focus->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			ripdata = &focus->data->ipdata;
			wipdata = &scan->data->ipdata;
			if ((ripdata->op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				/*
				 * Indirect inode: copy only the meta-data
				 * portion; u holds a blockset that must not
				 * be blindly replicated.
				 */
				bcopy(ripdata, wipdata,
				      offsetof(hammer2_inode_data_t, u));
				break;
			}
			/* fall through -- DIRECTDATA inode copies whole */
		case HAMMER2_BREF_TYPE_DATA:
			bcopy(focus->data, scan->data, focus->bytes);
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		case HAMMER2_BREF_TYPE_FREEMAP:
		case HAMMER2_BREF_TYPE_VOLUME:
			panic("hammer2_cluster_modsync: illegal node type");
			/* NOT REACHED */
			break;
		default:
			panic("hammer2_cluster_modsync: unknown node type");
			break;
		}
	}
}
595
/*
 * Lookup initialization/completion API.
 *
 * Create a temporary cluster mirroring cparent's chains and lock it
 * independently (shared if HAMMER2_LOOKUP_SHARED was passed).  The lock
 * operation also supplies the cluster's single ref; release with
 * hammer2_cluster_lookup_done().
 */
hammer2_cluster_t *
hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
{
	hammer2_cluster_t *cluster;
	int i;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->pmp = cparent->pmp;			/* can be NULL */
	/* cluster->focus = NULL; already null */

	/* Mirror the parent's chains; first non-NULL becomes the focus */
	for (i = 0; i < cparent->nchains; ++i) {
		cluster->array[i] = cparent->array[i];
		if (cluster->focus == NULL)
			cluster->focus = cluster->array[i];
	}
	cluster->nchains = cparent->nchains;

	/*
	 * Independently lock (this will also give cluster 1 ref)
	 */
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
				     HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
	}
	return (cluster);
}
627
628void
629hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
630{
631 if (cparent)
632 hammer2_cluster_unlock(cparent);
633}
634
/*
 * Locate first match or overlap under parent in (key_beg, key_end),
 * returning a new cluster (or NULL if nothing was found on any element).
 * *key_nextp is set to the minimum next-key across all elements and
 * *ddflagp reflects the per-chain direct-data flag, which must agree
 * across all elements.
 */
hammer2_cluster_t *
hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
		       hammer2_key_t key_beg, hammer2_key_t key_end,
		       int flags, int *ddflagp)
{
	hammer2_pfsmount_t *pmp;
	hammer2_cluster_t *cluster;
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	hammer2_key_t bref_key;
	int bref_keybits;
	int null_count;
	int ddflag;
	int i;
	uint8_t bref_type;
	u_int bytes;

	pmp = cparent->pmp;				/* can be NULL */
	key_accum = *key_nextp;
	null_count = 0;
	bref_type = 0;
	bref_key = 0;
	bref_keybits = 0;
	bytes = 0;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->pmp = pmp;				/* can be NULL */
	cluster->refs = 1;
	/* cluster->focus = NULL; already null */
	cparent->focus = NULL;
	*ddflagp = 0;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		if (cparent->array[i] == NULL) {
			++null_count;
			continue;
		}
		/*
		 * Per-element lookup.  hammer2_chain_lookup() may replace
		 * the parent chain pointer in-place.
		 */
		chain = hammer2_chain_lookup(&cparent->array[i], &key_next,
					     key_beg, key_end,
					     &cparent->cache_index[i],
					     flags, &ddflag);
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i];
		cluster->array[i] = chain;
		if (chain == NULL) {
			++null_count;
		} else {
			/*
			 * First hit becomes the focus and records the
			 * expected bref identity; later hits must match
			 * exactly (elements represent the same data).
			 */
			if (cluster->focus == NULL) {
				bref_type = chain->bref.type;
				bref_key = chain->bref.key;
				bref_keybits = chain->bref.keybits;
				bytes = chain->bytes;
				*ddflagp = ddflag;
				cluster->focus = chain;
			}
			KKASSERT(bref_type == chain->bref.type);
			KKASSERT(bref_key == chain->bref.key);
			KKASSERT(bref_keybits == chain->bref.keybits);
			KKASSERT(bytes == chain->bytes);
			KKASSERT(*ddflagp == ddflag);
		}
		/* Accumulate the minimum next-key across elements */
		if (key_accum > key_next)
			key_accum = key_next;
	}
	*key_nextp = key_accum;
	cluster->nchains = i;

	/* All elements empty: nothing found, destroy the result cluster */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}

	return (cluster);
}
714
/*
 * Locate next match or overlap under parent, replacing the contents of
 * the passed-in cluster in-place.  Returns NULL (after dropping the
 * cluster) when the iteration is exhausted on every element.
 *
 * NOTE(review): unlike hammer2_cluster_lookup(), key_accum is computed
 * here but never written back to *key_nextp -- confirm whether that
 * store was intended.
 */
hammer2_cluster_t *
hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	int null_count;
	int i;

	key_accum = *key_nextp;
	null_count = 0;
	cluster->focus = NULL;
	cparent->focus = NULL;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		chain = cluster->array[i];
		if (chain == NULL) {
			/* Element already exhausted */
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i];
			++null_count;
			continue;
		}
		if (cparent->array[i] == NULL) {
			/*
			 * Parent slot empty: release the child according
			 * to the caller's lock mode and drop the element.
			 */
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}
		chain = hammer2_chain_next(&cparent->array[i], chain,
					   &key_next, key_beg, key_end,
					   &cparent->cache_index[i], flags);
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i];
		cluster->array[i] = chain;
		if (chain == NULL) {
			++null_count;
		} else if (cluster->focus == NULL) {
			cluster->focus = chain;
		}
		/* Accumulate the minimum next-key across elements */
		if (key_accum > key_next)
			key_accum = key_next;
	}

	/* All elements exhausted */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}
772
773#if 0
774/*
775 * XXX initial NULL cluster needs reworking (pass **clusterp ?)
776 *
777 * The raw scan function is similar to lookup/next but does not seek to a key.
778 * Blockrefs are iterated via first_chain = (parent, NULL) and
779 * next_chain = (parent, chain).
780 *
781 * The passed-in parent must be locked and its data resolved. The returned
782 * chain will be locked. Pass chain == NULL to acquire the first sub-chain
783 * under parent and then iterate with the passed-in chain (which this
784 * function will unlock).
785 */
hammer2_cluster_t *
hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     int flags)
{
	hammer2_chain_t *chain;
	int null_count;
	int i;

	null_count = 0;

	/* Advance each element's raw blockref iteration in lock-step */
	for (i = 0; i < cparent->nchains; ++i) {
		chain = cluster->array[i];
		if (chain == NULL) {
			++null_count;
			continue;
		}
		if (cparent->array[i] == NULL) {
			/*
			 * Parent slot empty: release the child per the
			 * caller's lock mode and retire the element.
			 */
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}

		chain = hammer2_chain_scan(cparent->array[i], chain,
					   &cparent->cache_index[i], flags);
		cluster->array[i] = chain;
		if (chain == NULL)
			++null_count;
	}

	/* Iteration exhausted on all elements */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}
824
825#endif
826
/*
 * Create a new cluster under cparent using the specified key.  If
 * *clusterp is NULL a fresh cluster is allocated with one ref;
 * otherwise the supplied cluster is filled in.
 *
 * NOTE(review): 'error' is only assigned inside the loop; if every
 * element is skipped (or cparent->nchains == 0) the return value is
 * uninitialized -- confirm callers guarantee at least one creation.
 */
int
hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
		       hammer2_cluster_t **clusterp,
		       hammer2_key_t key, int keybits,
		       int type, size_t bytes, int flags)
{
	hammer2_cluster_t *cluster;
	hammer2_pfsmount_t *pmp;
	int error;
	int i;

	pmp = trans->pmp;				/* can be NULL */

	if ((cluster = *clusterp) == NULL) {
		cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
				  M_WAITOK | M_ZERO);
		cluster->pmp = pmp;			/* can be NULL */
		cluster->refs = 1;
	}
	cluster->focus = NULL;
	cparent->focus = NULL;

	/*
	 * NOTE: cluster->array[] entries can initially be NULL.  If
	 *	 *clusterp is supplied, skip NULL entries, otherwise
	 *	 create new chains.
	 */
	for (i = 0; i < cparent->nchains; ++i) {
		if (*clusterp && cluster->array[i] == NULL) {
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i];
			continue;
		}
		error = hammer2_chain_create(trans, &cparent->array[i],
					     &cluster->array[i], pmp,
					     key, keybits,
					     type, bytes, flags);
		KKASSERT(error == 0);
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i];
		if (cluster->focus == NULL)
			cluster->focus = cluster->array[i];
	}
	cluster->nchains = i;
	*clusterp = cluster;

	return error;
}
878
879/*
880 * Rename a cluster to a new parent.
881 *
882 * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields
883 * are used from a passed-in non-NULL bref pointer. All other fields
884 * are extracted from the original chain for each chain in the
885 * iteration.
886 */
887void
888hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
889 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
890 int flags)
891{
892 hammer2_chain_t *chain;
893 hammer2_blockref_t xbref;
894 int i;
895
896 cluster->focus = NULL;
897 cparent->focus = NULL;
898
899 for (i = 0; i < cluster->nchains; ++i) {
900 chain = cluster->array[i];
901 if (chain) {
902 if (bref) {
903 xbref = chain->bref;
904 xbref.key = bref->key;
905 xbref.keybits = bref->keybits;
906 hammer2_chain_rename(trans, &xbref,
907 &cparent->array[i],
908 chain, flags);
909 } else {
910 hammer2_chain_rename(trans, NULL,
911 &cparent->array[i],
912 chain, flags);
913 }
914 cluster->array[i] = chain;
915 if (cluster->focus == NULL)
916 cluster->focus = chain;
917 if (cparent->focus == NULL)
918 cparent->focus = cparent->array[i];
919 } else {
920 if (cparent->focus == NULL)
921 cparent->focus = cparent->array[i];
922 }
923 }
924}
925
926/*
927 * Mark a cluster deleted
928 */
929void
930hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
931 hammer2_cluster_t *cluster, int flags)
932{
933 hammer2_chain_t *chain;
934 hammer2_chain_t *parent;
935 int i;
936
937 if (cparent == NULL) {
938 kprintf("cparent is NULL\n");
939 return;
940 }
941
942 for (i = 0; i < cluster->nchains; ++i) {
943 parent = (i < cparent->nchains) ? cparent->array[i] : NULL;
944 chain = cluster->array[i];
945 if (chain == NULL)
946 continue;
947 if (chain->parent != parent) {
948 kprintf("hammer2_cluster_delete: parent "
949 "mismatch chain=%p parent=%p against=%p\n",
950 chain, chain->parent, parent);
951 } else {
952 hammer2_chain_delete(trans, parent, chain, flags);
953 }
954 }
955}
956
/*
 * Create a snapshot of the specified {parent, ochain} with the specified
 * label.  The originating hammer2_inode must be exclusively locked for
 * safety.
 *
 * The ioctl code has already synced the filesystem.
 */
int
hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
		       hammer2_ioc_pfs_t *pfs)
{
	hammer2_mount_t *hmp;
	hammer2_cluster_t *ncluster;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	hammer2_inode_t *nip;
	size_t name_len;
	hammer2_key_t lhc;
	struct vattr vat;
	uuid_t opfs_clid;
	int error;
	int i;

	kprintf("snapshot %s\n", pfs->name);

	name_len = strlen(pfs->name);
	/* NOTE(review): lhc is computed but not used below -- confirm */
	lhc = hammer2_dirhash(pfs->name, name_len);

	/*
	 * Get the clid
	 */
	ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
	opfs_clid = ripdata->pfs_clid;
	hmp = ocluster->focus->hmp;

	/*
	 * Create the snapshot directory under the super-root
	 *
	 * Set PFS type, generate a unique filesystem id, and generate
	 * a cluster id.  Use the same clid when snapshotting a PFS root,
	 * which theoretically allows the snapshot to be used as part of
	 * the same cluster (perhaps as a cache).
	 *
	 * Copy the (flushed) blockref array.  Theoretically we could use
	 * chain_duplicate() but it becomes difficult to disentangle
	 * the shared core so for now just brute-force it.
	 */
	VATTR_NULL(&vat);
	vat.va_type = VDIR;
	vat.va_mode = 0755;
	ncluster = NULL;
	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
				   proc0.p_ucred, pfs->name, name_len,
				   &ncluster, &error);

	if (nip) {
		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
		wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
		kern_uuidgen(&wipdata->pfs_fsid, 1);
		/* Re-use the origin clid only across a PFS boundary */
		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
			wipdata->pfs_clid = opfs_clid;
		else
			kern_uuidgen(&wipdata->pfs_clid, 1);

		/* Mark every element of the new cluster as a PFS root */
		for (i = 0; i < ncluster->nchains; ++i) {
			if (ncluster->array[i]) {
				ncluster->array[i]->bref.flags |=
					HAMMER2_BREF_FLAG_PFSROOT;
			}
		}
#if 0
		/* XXX can't set this unless we do an explicit flush, which
		   we also need a pmp assigned to do, else the flush code
		   won't flush ncluster because it thinks it is crossing a
		   flush boundary */
		hammer2_cluster_set_chainflags(ncluster,
					       HAMMER2_CHAIN_PFSBOUNDARY);
#endif

		/* XXX hack blockset copy */
		/* XXX doesn't work with real cluster */
		KKASSERT(ocluster->nchains == 1);
		wipdata->u.blockset = ripdata->u.blockset;
		/* Propagate the focus modifications to all elements */
		hammer2_cluster_modsync(ncluster);
		for (i = 0; i < ncluster->nchains; ++i) {
			if (ncluster->array[i])
				hammer2_flush(trans, ncluster->array[i]);
		}
		hammer2_inode_unlock_ex(nip, ncluster);
	}
	return (error);
}
1049
/*
 * Return locked parent cluster given a locked child.  The child remains
 * locked on return.
 */
hammer2_cluster_t *
hammer2_cluster_parent(hammer2_cluster_t *cluster)
{
	hammer2_cluster_t *cparent;
	int i;

	/* Shell cluster only; parent chains are installed per-element */
	cparent = hammer2_cluster_copy(cluster, HAMMER2_CLUSTER_COPY_NOCHAINS);
	for (i = 0; i < cluster->nchains; ++i) {
		hammer2_chain_t *chain;
		hammer2_chain_t *rchain;

		chain = cluster->array[i];
		if (chain == NULL)
			continue;
		hammer2_chain_ref(chain);
		/*
		 * Lock-order workaround: the parent must be locked before
		 * the child, so unlock the child, lock parent then child,
		 * and retry if the child was re-parented in the interim.
		 * The extra ref on rchain keeps it from being freed while
		 * the child lock is dropped.
		 */
		while ((rchain = chain->parent) != NULL) {
			hammer2_chain_ref(rchain);
			hammer2_chain_unlock(chain);
			hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
			hammer2_chain_drop(rchain);
			if (chain->parent == rchain)
				break;
			hammer2_chain_unlock(rchain);
		}
		hammer2_chain_drop(chain);
		/* rchain is NULL if the child had no parent */
		cparent->array[i] = rchain;
	}
	return cparent;
}
1084
1085/************************************************************************
1086 * CLUSTER I/O *
1087 ************************************************************************
1088 *
1089 *
1090 * WARNING! blockref[] array data is not universal. These functions should
1091 * only be used to access universal data.
1092 *
1093 * NOTE! The rdata call will wait for at least one of the chain I/Os to
1094 * complete if necessary. The I/O's should have already been
1095 * initiated by the cluster_lock/chain_lock operation.
1096 *
1097 * The cluster must already be in a modified state before wdata
1098 * is called. The data will already be available for this case.
1099 */
1100const hammer2_media_data_t *
1101hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1102{
1103 return(cluster->focus->data);
1104}
1105
1106hammer2_media_data_t *
1107hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1108{
1109 KKASSERT(hammer2_cluster_modified(cluster));
1110 return(cluster->focus->data);
1111}
1112
/*
 * Load async into independent buffer - used to load logical buffers from
 * underlying device data.  The callback is made for the first validated
 * data found, or NULL if no valid data is available.
 *
 * NOTE! The cluster structure is either unique or serialized (e.g. embedded
 *	 in the inode with an exclusive lock held), the chain structure may be
 *	 shared.  The iocb used to track the operation is embedded in the
 *	 cluster itself (cluster->iocb).
 */
void
hammer2_cluster_load_async(hammer2_cluster_t *cluster,
			   void (*callback)(hammer2_iocb_t *iocb), void *ptr)
{
	hammer2_chain_t *chain;
	hammer2_iocb_t *iocb;
	hammer2_mount_t *hmp;
	hammer2_blockref_t *bref;
	int i;

	/*
	 * Try to find a chain whose data is already resolved.  If none can
	 * be found, start with the first chain.
	 */
	chain = NULL;
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain && chain->data)
			break;
	}
	if (i == cluster->nchains) {
		/*
		 * NOTE(review): assumes nchains >= 1 and array[0] != NULL;
		 * an empty cluster would fault on the chain->data test
		 * below — confirm callers guarantee a populated slot 0.
		 */
		chain = cluster->array[0];
		i = 0;
	}

	/*
	 * Initialize the embedded iocb.  lbase is repurposed to record the
	 * index of the chain being used (presumably so the callback can
	 * identify which element the I/O was issued for).
	 */
	iocb = &cluster->iocb;
	iocb->callback = callback;
	iocb->dio = NULL;	/* for already-validated case */
	iocb->cluster = cluster;
	iocb->chain = chain;
	iocb->ptr = ptr;
	iocb->lbase = (off_t)i;
	iocb->flags = 0;
	iocb->error = 0;

	/*
	 * Data already validated: make the callback synchronously with
	 * iocb->dio left NULL.
	 */
	if (chain->data) {
		callback(iocb);
		return;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;
	hmp = chain->hmp;

#if 0
	/* handled by callback? <- TODO XXX even needed for loads? */
	/*
	 * The getblk() optimization for a 100% overwrite can only be used
	 * if the physical block size matches the request.
	 */
	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
	    chain->bytes == hammer2_devblksize(chain->bytes)) {
		error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
		KKASSERT(error == 0);
		iocb->dio = dio;
		callback(iocb);
		return;
	}
#endif

	/*
	 * Otherwise issue a read.  hammer2_io_getblk() takes over the iocb;
	 * presumably it invokes the callback once the device buffer is
	 * available — see the hammer2_io layer for the completion contract.
	 */
	hammer2_adjreadcounter(&chain->bref, chain->bytes);
	hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
}
1203
1204/************************************************************************
1205 * NODE FAILURES *
1206 ************************************************************************
1207 *
1208 * A node failure can occur for numerous reasons.
1209 *
1210 * - A read I/O may fail
1211 * - A write I/O may fail
1212 * - An unexpected chain might be found (or be missing)
1213 * - A node might disconnect temporarily and reconnect later
1214 * (for example, a USB stick could get pulled, or a node might
1215 * be programmatically disconnected).
1216 * - A node might run out of space during a modifying operation.
1217 *
1218 * When a read failure or an unexpected chain state is found, the chain and
1219 * parent chain at the failure point for the nodes involved (the nodes
1220 * which we determine to be in error) are flagged as failed and removed
1221 * from the cluster. The node itself is allowed to remain active. The
1222 * highest common point (usually a parent chain) is queued to the
1223 * resynchronization thread for action.
1224 *
1225 * When a write I/O fails or a node runs out of space, we first adjust
1226 * as if a read failure occurs but we further disable flushes on the
1227 * ENTIRE node. Concurrent modifying transactions are allowed to complete
1228 * but any new modifying transactions will automatically remove the node
1229 * from consideration in all related cluster structures and not generate
1230 * any new modified chains. The ROOT chain for the failed node(s) is queued
1231 * to the resynchronization thread for action.
1232 *
1233 * A temporary disconnect is handled as if a write failure occurred.
1234 *
1235 * Any of these failures might or might not stall related high level VNOPS,
1236 * depending on what has failed, what nodes remain, the type of cluster,
1237 * and the operating state of the cluster.
1238 *
1239 * FLUSH ON WRITE-DISABLED NODES
1240 *
1241 * A flush on a write-disabled node is not allowed to write anything because
1242 * we cannot safely update the mirror_tid anywhere on the failed node. The
1243 * synchronization thread uses mirror_tid to calculate incremental resyncs.
1244 * Dirty meta-data related to the failed node is thrown away.
1245 *
1246 * Dirty buffer cache buffers and inodes are only thrown away if they can be
1247 * retired... that is, if the filesystem still has enough nodes to complete
1248 * the operation.
1249 */
1250
1251/************************************************************************
1252 * SYNCHRONIZATION THREAD *
1253 ************************************************************************
1254 *
1255 * This thread is responsible for [re]synchronizing the cluster representing
1256 * a PFS. Any out-of-sync or failed node starts this thread on a
1257 * node-by-node basis when the failure is detected.
1258 *
1259 * Clusters needing resynchronization are queued at the highest point
1260 * where the parent on the failed node is still valid, or a special
1261 * incremental scan from the ROOT is queued if no parent exists. This
1262 * thread is also responsible for waiting for reconnections of the failed
1263 * node if the cause was due to a disconnect, and waiting for space to be
1264 * freed up if the cause was due to running out of space.
1265 *
1266 * If the cause is due to a node running out of space, this thread will also
1267 * remove older (unlocked) snapshots to make new space, recover space, and
1268 * then start resynchronization.
1269 *
1270 * Each resynchronization pass virtually snapshots the PFS on the good nodes
1271 * and synchronizes using that snapshot against the target node. This
 * ensures a consistent chain topology and also avoids interference between
1273 * the resynchronization thread and frontend operations.
1274 *
1275 * Since these are per-node threads it is possible to resynchronize several
1276 * nodes at once.
1277 */