hammer2 - Refactor frontend part 14/many
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
CommitLineData
278ab2b2 1/*
7750fd72 2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved.
278ab2b2
MD
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34/*
35 * The cluster module collects multiple chains representing the same
fe73aa5d
MD
36 * information from different nodes into a single entity. It allows direct
37 * access to media data as long as it is not blockref array data (which
38 * will obviously have to be different at each node).
278ab2b2
MD
39 *
40 * This module also handles I/O dispatch, status rollup, and various
41 * mastership arrangements including quorum operations. It effectively
42 * presents one topology to the vnops layer.
43 *
44 * Many of the API calls mimic chain API calls but operate on clusters
45 * instead of chains. Please see hammer2_chain.c for more complete code
46 * documentation of the API functions.
fe73aa5d
MD
47 *
48 * WARNING! This module is *extremely* complex. It must issue asynchronous
49 * locks and I/O, do quorum and/or master-slave processing, and
50 * it must operate properly even if some nodes are broken (which
51 * can also mean indefinite locks).
7750fd72
MD
52 *
53 * CLUSTER OPERATIONS
54 *
55 * Cluster operations can be broken down into three pieces:
56 *
57 * (1) Chain locking and data retrieval.
58 * hammer2_cluster_lock()
59 * hammer2_cluster_parent()
60 *
61 * - Most complex functions, quorum management on transaction ids.
62 *
63 * - Locking and data accesses must be internally asynchronous.
64 *
65 * - Validate and manage cache coherency primitives (cache state
66 * is stored in chain topologies but must be validated by these
67 * functions).
68 *
69 * (2) Lookups and Scans
70 * hammer2_cluster_lookup()
71 * hammer2_cluster_next()
72 *
73 * - Depend on locking & data retrieval functions, but still complex.
74 *
75 * - Must do quorum management on transaction ids.
76 *
77 * - Lookup and iteration ops must be internally asynchronous.
78 *
79 * (3) Modifying Operations
80 * hammer2_cluster_create()
81 * hammer2_cluster_rename()
82 * hammer2_cluster_delete()
83 * hammer2_cluster_modify()
84 * hammer2_cluster_modsync()
85 *
86 * - Can usually punt on failures, operation continues unless quorum
87 * is lost. If quorum is lost, must wait for resynchronization
88 * (depending on the management mode).
89 *
90 * - Must disconnect node on failures (also not flush), remount, and
91 * resynchronize.
92 *
93 * - Network links (via kdmsg) are relatively easy to issue as the
94 * complex underworkings of hammer2_chain.c don't have to be messed
95 * with (the protocol is at a higher level than block-level).
96 *
97 * - Multiple local disk nodes (i.e. block devices) are another matter.
98 * Chain operations have to be dispatched to per-node threads (xN)
99 * because we can't asynchronize potentially very complex chain
100 * operations in hammer2_chain.c (it would be a huge mess).
101 *
102 * (these threads are also used to terminate incoming kdmsg ops from
103 * other machines).
104 *
105 * - Single-node filesystems do not use threads and will simply call
106 * hammer2_chain.c functions directly. This short-cut is handled
107 * at the base of each cluster function.
278ab2b2
MD
108 */
109#include <sys/cdefs.h>
110#include <sys/param.h>
111#include <sys/systm.h>
112#include <sys/types.h>
113#include <sys/lock.h>
114#include <sys/uuid.h>
115
116#include "hammer2.h"
117
05dd26e4 118/*
b93cc2e0
MD
119 * Returns non-zero if any chain in the cluster needs to be resized.
120 * Errored elements are not used in the calculation.
05dd26e4
MD
121 */
122int
123hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
278ab2b2 124{
05dd26e4
MD
125 hammer2_chain_t *chain;
126 int i;
127
b93cc2e0 128 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
05dd26e4 129 for (i = 0; i < cluster->nchains; ++i) {
e513e77e
MD
130 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
131 continue;
4b7e61e0 132 chain = cluster->array[i].chain;
b93cc2e0
MD
133 if (chain == NULL)
134 continue;
135 if (chain->error)
136 continue;
137 if (chain->bytes != bytes)
05dd26e4
MD
138 return 1;
139 }
140 return 0;
278ab2b2
MD
141}
142
b93cc2e0
MD
143/*
144 * Returns the bref type of the cluster's foucs.
145 *
146 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
147 * The cluster must be locked.
148 */
278ab2b2
MD
149uint8_t
150hammer2_cluster_type(hammer2_cluster_t *cluster)
151{
c847e838
MD
152 if (cluster->error == 0) {
153 KKASSERT(cluster->focus != NULL);
b93cc2e0 154 return(cluster->focus->bref.type);
c847e838 155 }
b93cc2e0 156 return 0;
278ab2b2
MD
157}
158
b93cc2e0
MD
159/*
160 * Returns non-zero if the cluster's focus is flagged as being modified.
161 *
162 * If the cluster is errored, returns 0.
163 */
278ab2b2
MD
164int
165hammer2_cluster_modified(hammer2_cluster_t *cluster)
166{
c847e838
MD
167 if (cluster->error == 0) {
168 KKASSERT(cluster->focus != NULL);
b93cc2e0 169 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
c847e838 170 }
b93cc2e0 171 return 0;
278ab2b2
MD
172}
173
84e47819 174/*
b93cc2e0
MD
175 * Returns the bref of the cluster's focus, sans any data-offset information
176 * (since offset information is per-node and wouldn't be useful).
177 *
e513e77e
MD
178 * Callers use this function to access modify_tid, mirror_tid, type,
179 * key, and keybits.
84e47819 180 *
b93cc2e0
MD
181 * If the cluster is errored, returns an empty bref.
182 * The cluster must be locked.
84e47819 183 */
278ab2b2
MD
184void
185hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
186{
b93cc2e0 187 if (cluster->error == 0) {
c847e838 188 KKASSERT(cluster->focus != NULL);
b93cc2e0
MD
189 *bref = cluster->focus->bref;
190 bref->data_off = 0;
191 } else {
192 bzero(bref, sizeof(*bref));
193 }
278ab2b2
MD
194}
195
b93cc2e0
MD
196/*
197 * Flag the cluster for flushing recursively up to the root. Despite the
198 * work it does, this is relatively benign. It just makes sure that the
199 * flusher has top-down visibility to this cluster.
200 *
201 * Errored chains are not flagged for flushing.
202 *
203 * The cluster should probably be locked.
204 */
278ab2b2 205void
c603b86b 206hammer2_cluster_setflush(hammer2_cluster_t *cluster)
278ab2b2 207{
84e47819 208 hammer2_chain_t *chain;
278ab2b2
MD
209 int i;
210
84e47819 211 for (i = 0; i < cluster->nchains; ++i) {
e513e77e
MD
212 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
213 continue;
4b7e61e0 214 chain = cluster->array[i].chain;
b93cc2e0
MD
215 if (chain == NULL)
216 continue;
217 if (chain->error)
218 continue;
c603b86b 219 hammer2_chain_setflush(chain);
84e47819
MD
220 }
221}
222
b93cc2e0
MD
223/*
224 * Set the check mode for the cluster.
225 * Errored elements of the cluster are ignored.
226 *
e513e77e 227 * The cluster must be locked and modified.
b93cc2e0 228 */
e07becf8 229void
c603b86b 230hammer2_cluster_setmethod_check(hammer2_cluster_t *cluster, int check_algo)
e07becf8
MD
231{
232 hammer2_chain_t *chain;
233 int i;
234
b93cc2e0 235 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
e07becf8 236 for (i = 0; i < cluster->nchains; ++i) {
e513e77e
MD
237 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
238 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
239 continue;
240 }
4b7e61e0 241 chain = cluster->array[i].chain;
b93cc2e0
MD
242 if (chain == NULL)
243 continue;
244 if (chain->error)
245 continue;
246 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
247 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
248 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
e07becf8
MD
249 }
250}
251
84e47819 252/*
b93cc2e0
MD
253 * Create a degenerate cluster with one ref from a single locked chain.
254 * The returned cluster will be focused on the chain and inherit its
255 * error state.
fe73aa5d 256 *
b93cc2e0
MD
257 * The chain's lock and reference are transfered to the new cluster, so
258 * the caller should not try to unlock the chain separately.
22211834
MD
259 *
260 * We fake the flags.
84e47819
MD
261 */
262hammer2_cluster_t *
263hammer2_cluster_from_chain(hammer2_chain_t *chain)
264{
265 hammer2_cluster_t *cluster;
266
267 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
4b7e61e0 268 cluster->array[0].chain = chain;
e513e77e 269 cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
84e47819
MD
270 cluster->nchains = 1;
271 cluster->focus = chain;
8db69c9f 272 cluster->focus_index = 0;
50456506 273 cluster->pmp = chain->pmp;
84e47819 274 cluster->refs = 1;
b93cc2e0 275 cluster->error = chain->error;
22211834
MD
276 cluster->flags = HAMMER2_CLUSTER_LOCKED |
277 HAMMER2_CLUSTER_WRHARD |
278 HAMMER2_CLUSTER_RDHARD |
279 HAMMER2_CLUSTER_MSYNCED |
280 HAMMER2_CLUSTER_SSYNCED;
84e47819
MD
281
282 return cluster;
278ab2b2
MD
283}
284
278ab2b2 285/*
b93cc2e0 286 * Add a reference to a cluster and its underlying chains.
278ab2b2
MD
287 *
288 * We must also ref the underlying chains in order to allow ref/unlock
289 * sequences to later re-lock.
290 */
291void
292hammer2_cluster_ref(hammer2_cluster_t *cluster)
293{
f7712c43 294 atomic_add_int(&cluster->refs, 1);
278ab2b2
MD
295}
296
297/*
298 * Drop the caller's reference to the cluster. When the ref count drops to
299 * zero this function frees the cluster and drops all underlying chains.
bca9f8e6
MD
300 *
301 * In-progress read I/Os are typically detached from the cluster once the
302 * first one returns (the remaining stay attached to the DIOs but are then
303 * ignored and drop naturally).
278ab2b2
MD
304 */
305void
306hammer2_cluster_drop(hammer2_cluster_t *cluster)
307{
84e47819 308 hammer2_chain_t *chain;
278ab2b2
MD
309 int i;
310
84e47819 311 KKASSERT(cluster->refs > 0);
84e47819 312 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
22211834 313 cluster->focus = NULL; /* safety XXX chg to assert */
8db69c9f 314 cluster->focus_index = 0;
f7712c43
MD
315
316 for (i = 0; i < cluster->nchains; ++i) {
317 chain = cluster->array[i].chain;
318 if (chain) {
319 hammer2_chain_drop(chain);
320 cluster->array[i].chain = NULL; /* safety */
321 }
322 }
323 cluster->nchains = 0; /* safety */
324
84e47819 325 kfree(cluster, M_HAMMER2);
fe73aa5d 326 /* cluster is invalid */
278ab2b2 327 }
278ab2b2
MD
328}
329
330void
331hammer2_cluster_wait(hammer2_cluster_t *cluster)
332{
333 tsleep(cluster->focus, 0, "h2clcw", 1);
334}
335
336/*
a6cf1052 337 * Lock a cluster. Cluster must already be referenced. Focus is maintained.
fe73aa5d 338 *
a6cf1052
MD
339 * WARNING! This function expects the caller to handle resolution of the
340 * cluster. We never re-resolve the cluster in this function,
341 * because it might be used to temporarily unlock/relock a cparent
342 * in an iteration or recursrion, and the cparents elements do not
343 * necessarily match.
278ab2b2 344 */
23c7c7dd 345void
eedd52a3 346hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how)
278ab2b2 347{
84e47819 348 hammer2_chain_t *chain;
22211834
MD
349 int i;
350
b8ba9690 351 /* cannot be on inode-embedded cluster template, must be on copy */
e513e77e 352 KKASSERT(cluster->refs > 0);
b8ba9690
MD
353 KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
354 if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
e513e77e 355 panic("hammer2_cluster_lock: cluster %p already locked!\n",
b8ba9690
MD
356 cluster);
357 }
358 atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
359
23c7c7dd
MD
360 /*
361 * Lock chains and resolve state.
362 */
363 for (i = 0; i < cluster->nchains; ++i) {
eedd52a3
MD
364 if (i == idx)
365 continue;
23c7c7dd
MD
366 chain = cluster->array[i].chain;
367 if (chain == NULL)
368 continue;
369 hammer2_chain_lock(chain, how);
370 }
23c7c7dd
MD
371}
372
eedd52a3
MD
373void
374hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
375{
376 hammer2_cluster_lock_except(cluster, -1, how);
377}
378
a6cf1052
MD
379/*
380 * Calculate the clustering state for the cluster and set its focus.
381 * This routine must be called with care. For example, it should not
382 * normally be called after relocking a non-leaf cluster because parent
383 * clusters help iterations and each element might be at a slightly different
384 * indirect node (each node's topology is independently indexed).
385 *
386 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
387 * operations. Typically this is only set on a quorum of MASTERs or
388 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER
389 * is present, this bit is *not* set on a quorum of MASTERs. The
390 * synchronization code ignores this bit, but all hammer2_cluster_*() calls
391 * that create/modify/delete elements use it.
392 *
393 * The chains making up the cluster may be narrowed down based on quorum
394 * acceptability, and if RESOLVE_RDONLY is specified the chains can be
395 * narrowed down to a single chain as long as the entire subtopology is known
396 * to be intact. So, for example, we can narrow a read-only op to a single
397 * fast SLAVE but if we focus a CACHE chain we must still retain at least
398 * a SLAVE to ensure that the subtopology can be accessed.
399 *
400 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
401 * to be maintained once the topology is validated as-of the top level of
402 * the operation.
403 *
404 * If a failure occurs the operation must be aborted by higher-level code and
405 * retried. XXX
406 */
23c7c7dd
MD
407void
408hammer2_cluster_resolve(hammer2_cluster_t *cluster)
409{
410 hammer2_chain_t *chain;
a6cf1052 411 hammer2_chain_t *focus;
23c7c7dd
MD
412 hammer2_pfs_t *pmp;
413 hammer2_tid_t quorum_tid;
0cc33e20 414 hammer2_tid_t last_best_quorum_tid;
23c7c7dd
MD
415 int focus_pfs_type;
416 uint32_t nflags;
417 int ttlmasters;
418 int ttlslaves;
419 int nmasters;
420 int nslaves;
421 int nquorum;
e513e77e 422 int smpresent;
23c7c7dd
MD
423 int i;
424
425 cluster->error = 0;
0cc33e20 426 cluster->focus = NULL;
23c7c7dd 427
23c7c7dd 428 focus_pfs_type = 0;
22211834 429 nflags = 0;
22211834 430 ttlmasters = 0;
23c7c7dd 431 ttlslaves = 0;
22211834 432 nmasters = 0;
23c7c7dd 433 nslaves = 0;
22211834
MD
434
435 /*
23c7c7dd 436 * Calculate quorum
22211834 437 */
23c7c7dd
MD
438 pmp = cluster->pmp;
439 KKASSERT(pmp != NULL || cluster->nchains == 0);
22211834 440 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
e513e77e 441 smpresent = 0;
22211834 442
22211834 443 /*
23c7c7dd 444 * Pass 1
0cc33e20
MD
445 *
446 * NOTE: A NULL chain is not necessarily an error, it could be
447 * e.g. a lookup failure or the end of an iteration.
448 * Process normally.
22211834
MD
449 */
450 for (i = 0; i < cluster->nchains; ++i) {
451 chain = cluster->array[i].chain;
0cc33e20 452 if (chain && chain->error) {
b93cc2e0
MD
453 if (cluster->focus == NULL || cluster->focus == chain) {
454 /* error will be overridden by valid focus */
455 cluster->error = chain->error;
456 }
457
458 /*
459 * Must count total masters and slaves whether the
460 * chain is errored or not.
461 */
462 switch (cluster->pmp->pfs_types[i]) {
463 case HAMMER2_PFSTYPE_MASTER:
464 ++ttlmasters;
465 break;
466 case HAMMER2_PFSTYPE_SLAVE:
467 ++ttlslaves;
468 break;
469 }
470 continue;
471 }
22211834
MD
472 switch (cluster->pmp->pfs_types[i]) {
473 case HAMMER2_PFSTYPE_MASTER:
474 ++ttlmasters;
22211834
MD
475 break;
476 case HAMMER2_PFSTYPE_SLAVE:
477 ++ttlslaves;
478 break;
479 case HAMMER2_PFSTYPE_SOFT_MASTER:
480 nflags |= HAMMER2_CLUSTER_WRSOFT;
481 nflags |= HAMMER2_CLUSTER_RDSOFT;
e513e77e 482 smpresent = 1;
22211834
MD
483 break;
484 case HAMMER2_PFSTYPE_SOFT_SLAVE:
485 nflags |= HAMMER2_CLUSTER_RDSOFT;
486 break;
487 case HAMMER2_PFSTYPE_SUPROOT:
488 /*
489 * Degenerate cluster representing the super-root
e513e77e
MD
490 * topology on a single device. Fake stuff so
491 * cluster ops work as expected.
22211834
MD
492 */
493 nflags |= HAMMER2_CLUSTER_WRHARD;
494 nflags |= HAMMER2_CLUSTER_RDHARD;
8db69c9f 495 cluster->focus_index = i;
22211834 496 cluster->focus = chain;
0cc33e20 497 cluster->error = chain ? chain->error : 0;
22211834
MD
498 break;
499 default:
500 break;
501 }
502 }
503
504 /*
23c7c7dd 505 * Pass 2
0cc33e20
MD
506 *
507 * Resolve masters. Calculate nmasters for the highest matching
508 * TID, if a quorum cannot be attained try the next lower matching
509 * TID until we exhaust TIDs.
510 *
511 * NOTE: A NULL chain is not necessarily an error, it could be
512 * e.g. a lookup failure or the end of an iteration.
513 * Process normally.
514 */
515 last_best_quorum_tid = HAMMER2_TID_MAX;
516 quorum_tid = 0; /* fix gcc warning */
517
518 while (nmasters < nquorum && last_best_quorum_tid != 0) {
519 nmasters = 0;
520 quorum_tid = 0;
521
522 for (i = 0; i < cluster->nchains; ++i) {
523 if (cluster->pmp->pfs_types[i] !=
524 HAMMER2_PFSTYPE_MASTER) {
525 continue;
526 }
527 chain = cluster->array[i].chain;
528
529 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
530 /*
531 * Invalid as in unsynchronized, cannot be
532 * used to calculate the quorum.
533 */
534 } else if (chain == NULL && quorum_tid == 0) {
535 /*
536 * NULL chain on master matches NULL chains
537 * on other masters.
538 */
539 ++nmasters;
540 } else if (quorum_tid < last_best_quorum_tid &&
541 chain != NULL &&
542 (quorum_tid < chain->bref.modify_tid ||
543 nmasters == 0)) {
544 /*
545 * Better TID located, reset nmasters count.
546 */
547 nmasters = 1;
548 quorum_tid = chain->bref.modify_tid;
549 } else if (chain &&
550 quorum_tid == chain->bref.modify_tid) {
551 /*
552 * TID matches current collection.
553 */
554 ++nmasters;
555 }
556 }
557 if (nmasters >= nquorum)
558 break;
559 last_best_quorum_tid = quorum_tid;
560 }
561
562 /*
563 * Pass 3
564 *
565 * NOTE: A NULL chain is not necessarily an error, it could be
566 * e.g. a lookup failure or the end of an iteration.
567 * Process normally.
22211834 568 */
278ab2b2 569 for (i = 0; i < cluster->nchains; ++i) {
e513e77e 570 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
4b7e61e0 571 chain = cluster->array[i].chain;
0cc33e20 572 if (chain && chain->error) {
b93cc2e0
MD
573 if (cluster->focus == NULL || cluster->focus == chain) {
574 /* error will be overridden by valid focus */
575 cluster->error = chain->error;
576 }
577 continue;
578 }
23c7c7dd 579
22211834
MD
580 switch (cluster->pmp->pfs_types[i]) {
581 case HAMMER2_PFSTYPE_MASTER:
582 /*
583 * We must have enough up-to-date masters to reach
e513e77e
MD
584 * a quorum and the master modify_tid must match
585 * the quorum's modify_tid.
23c7c7dd 586 *
8db69c9f 587 * Do not select an errored or out-of-sync master.
22211834 588 */
8db69c9f
MD
589 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
590 nflags |= HAMMER2_CLUSTER_UNHARD;
591 } else if (nmasters >= nquorum &&
0cc33e20
MD
592 (chain == NULL || chain->error == 0) &&
593 ((chain == NULL && quorum_tid == 0) ||
594 (chain != NULL && quorum_tid ==
595 chain->bref.modify_tid))) {
22211834
MD
596 nflags |= HAMMER2_CLUSTER_WRHARD;
597 nflags |= HAMMER2_CLUSTER_RDHARD;
e513e77e
MD
598 if (!smpresent) {
599 cluster->array[i].flags |=
600 HAMMER2_CITEM_FEMOD;
601 }
22211834
MD
602 if (cluster->focus == NULL ||
603 focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
604 focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
8db69c9f 605 cluster->focus_index = i;
0cc33e20
MD
606 cluster->focus = chain; /* NULL ok */
607 cluster->error = chain ? chain->error :
608 0;
4b7e61e0 609 }
0cc33e20 610 } else if (chain == NULL || chain->error == 0) {
b93cc2e0 611 nflags |= HAMMER2_CLUSTER_UNHARD;
84e47819 612 }
22211834
MD
613 break;
614 case HAMMER2_PFSTYPE_SLAVE:
615 /*
616 * We must have enough up-to-date masters to reach
e513e77e
MD
617 * a quorum and the slave modify_tid must match the
618 * quorum's modify_tid.
23c7c7dd
MD
619 *
620 * Do not select an errored slave.
22211834 621 */
8db69c9f
MD
622 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
623 nflags |= HAMMER2_CLUSTER_UNHARD;
624 } else if (nmasters >= nquorum &&
0cc33e20
MD
625 (chain == NULL || chain->error == 0) &&
626 ((chain == NULL && quorum_tid == 0) ||
627 (chain && quorum_tid ==
628 chain->bref.modify_tid))) {
22211834
MD
629 ++nslaves;
630 nflags |= HAMMER2_CLUSTER_RDHARD;
0cc33e20
MD
631#if 0
632 /* XXX optimize for RESOLVE_RDONLY */
22211834
MD
633 if (cluster->focus == NULL) {
634 focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
8db69c9f 635 cluster->focus_index = i;
0cc33e20
MD
636 cluster->focus = chain; /* NULL ok */
637 cluster->error = chain ? chain->error :
638 0;
22211834 639 }
0cc33e20
MD
640#endif
641 } else if (chain == NULL || chain->error == 0) {
b93cc2e0 642 nflags |= HAMMER2_CLUSTER_UNSOFT;
22211834
MD
643 }
644 break;
645 case HAMMER2_PFSTYPE_SOFT_MASTER:
646 /*
647 * Directly mounted soft master always wins. There
648 * should be only one.
649 */
650 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
8db69c9f 651 cluster->focus_index = i;
22211834 652 cluster->focus = chain;
0cc33e20 653 cluster->error = chain ? chain->error : 0;
22211834 654 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
e513e77e 655 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
22211834
MD
656 break;
657 case HAMMER2_PFSTYPE_SOFT_SLAVE:
658 /*
659 * Directly mounted soft slave always wins. There
660 * should be only one.
661 */
662 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
663 if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
8db69c9f 664 cluster->focus_index = i;
fe73aa5d 665 cluster->focus = chain;
0cc33e20 666 cluster->error = chain ? chain->error : 0;
22211834
MD
667 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
668 }
669 break;
e513e77e
MD
670 case HAMMER2_PFSTYPE_SUPROOT:
671 /*
672 * spmp (degenerate case)
673 */
674 KKASSERT(i == 0);
675 cluster->focus_index = i;
676 cluster->focus = chain;
0cc33e20 677 cluster->error = chain ? chain->error : 0;
e513e77e
MD
678 focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
679 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
680 break;
22211834
MD
681 default:
682 break;
683 }
684 }
685
a6cf1052
MD
686 /*
687 * Focus now set, adjust ddflag. Skip this pass if the focus
688 * is bad or if we are at the PFS root (the bref won't match at
689 * the PFS root, obviously).
690 */
691 focus = cluster->focus;
692 if (focus) {
693 cluster->ddflag =
694 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
695 } else {
696 cluster->ddflag = 0;
697 goto skip4;
698 }
699 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
700 goto skip4;
701
702 /*
703 * Pass 4
704 *
705 * Validate the elements that were not marked invalid. They should
706 * match.
707 */
708 for (i = 0; i < cluster->nchains; ++i) {
709 int ddflag;
710
711 chain = cluster->array[i].chain;
712
713 if (chain == NULL)
714 continue;
715 if (chain == focus)
716 continue;
717 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
718 continue;
719
720 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
721 if (chain->bref.type != focus->bref.type ||
722 chain->bref.key != focus->bref.key ||
723 chain->bref.keybits != focus->bref.keybits ||
724 chain->bref.modify_tid != focus->bref.modify_tid ||
725 chain->bytes != focus->bytes ||
726 ddflag != cluster->ddflag) {
727 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
728 if (hammer2_debug & 1)
729 kprintf("cluster_resolve: matching modify_tid failed "
730 "bref test: idx=%d type=%02x/%02x "
731 "key=%016jx/%d-%016jx/%d "
732 "mod=%016jx/%016jx bytes=%u/%u\n",
733 i,
734 chain->bref.type, focus->bref.type,
735 chain->bref.key, chain->bref.keybits,
736 focus->bref.key, focus->bref.keybits,
737 chain->bref.modify_tid, focus->bref.modify_tid,
738 chain->bytes, focus->bytes);
739 if (hammer2_debug & 0x4000)
740 panic("cluster_resolve");
741 /* flag issue and force resync? */
742 }
743 }
744skip4:
745
b93cc2e0 746 if (ttlslaves == 0)
b93cc2e0 747 nflags |= HAMMER2_CLUSTER_NOSOFT;
e513e77e
MD
748 if (ttlmasters == 0)
749 nflags |= HAMMER2_CLUSTER_NOHARD;
b93cc2e0 750
22211834
MD
751 /*
752 * Set SSYNCED or MSYNCED for slaves and masters respectively if
753 * all available nodes (even if 0 are available) are fully
754 * synchronized. This is used by the synchronization thread to
755 * determine if there is work it could potentially accomplish.
756 */
757 if (nslaves == ttlslaves)
758 nflags |= HAMMER2_CLUSTER_SSYNCED;
759 if (nmasters == ttlmasters)
760 nflags |= HAMMER2_CLUSTER_MSYNCED;
761
762 /*
763 * Determine if the cluster was successfully locked for the
764 * requested operation and generate an error code. The cluster
765 * will not be locked (or ref'd) if an error is returned.
23c7c7dd
MD
766 *
767 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
768 * to determine if reading or writing is possible. If writing, the
769 * cluster still requires a call to hammer2_cluster_modify() first.
22211834
MD
770 */
771 atomic_set_int(&cluster->flags, nflags);
772 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
278ab2b2
MD
773}
774
c847e838
MD
775/*
776 * This is used by the XOPS subsystem to calculate the state of
777 * the collection and tell hammer2_xop_collect() what to do with it.
778 * The collection can be in various states of desynchronization, the
779 * caller specifically wants to resolve the passed-in key.
780 *
781 * Return values:
782 * 0 - Quorum agreement, key is valid
783 *
784 * ENOENT - Quorum agreement, end of scan
785 *
786 * ESRCH - Quorum agreement, key is INVALID (caller should
787 * skip key).
788 *
789 * EIO - Quorum agreement but all elements had errors.
790 *
791 * EDEADLK - No quorum agreement possible for key, a repair
792 * may be needed. Caller has to decide what to do,
793 * possibly iterating the key or generating an EIO.
794 *
795 * EINPROGRESS - No quorum agreement yet, but agreement is still
796 * possible if caller waits for more responses. Caller
797 * should not iterate key.
798 *
799 * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
800 */
801int
802hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
803{
804 hammer2_chain_t *chain;
805 hammer2_chain_t *focus;
806 hammer2_pfs_t *pmp;
807 hammer2_tid_t quorum_tid;
808 hammer2_tid_t last_best_quorum_tid;
809 uint32_t nflags;
810 int ttlmasters;
811 int ttlslaves;
812 int nmasters;
813 int nmasters_keymatch;
814 int nslaves;
815 int nquorum;
816 int umasters; /* unknown masters (still in progress) */
817 int smpresent;
818 int i;
819
820 cluster->error = 0;
821 cluster->focus = NULL;
822
823 nflags = 0;
824 ttlmasters = 0;
825 ttlslaves = 0;
826 nmasters = 0;
827 nmasters_keymatch = 0;
828 umasters = 0;
829 nslaves = 0;
830
831 /*
832 * Calculate quorum
833 */
834 pmp = cluster->pmp;
835 KKASSERT(pmp != NULL || cluster->nchains == 0);
836 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
837 smpresent = 0;
838
839 /*
840 * Pass 1
841 *
842 * NOTE: A NULL chain is not necessarily an error, it could be
843 * e.g. a lookup failure or the end of an iteration.
844 * Process normally.
845 */
846 for (i = 0; i < cluster->nchains; ++i) {
847 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
848 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
849
850 chain = cluster->array[i].chain;
851 if (chain && chain->error) {
852 if (cluster->focus == NULL || cluster->focus == chain) {
853 /* error will be overridden by valid focus */
854 cluster->error = chain->error;
855 }
856
857 /*
858 * Must count total masters and slaves whether the
859 * chain is errored or not.
860 */
861 switch (cluster->pmp->pfs_types[i]) {
862 case HAMMER2_PFSTYPE_MASTER:
863 ++ttlmasters;
864 break;
865 case HAMMER2_PFSTYPE_SLAVE:
866 ++ttlslaves;
867 break;
868 }
869 continue;
870 }
871 switch (cluster->pmp->pfs_types[i]) {
872 case HAMMER2_PFSTYPE_MASTER:
873 ++ttlmasters;
874 break;
875 case HAMMER2_PFSTYPE_SLAVE:
876 ++ttlslaves;
877 break;
878 case HAMMER2_PFSTYPE_SOFT_MASTER:
879 nflags |= HAMMER2_CLUSTER_WRSOFT;
880 nflags |= HAMMER2_CLUSTER_RDSOFT;
881 smpresent = 1;
882 break;
883 case HAMMER2_PFSTYPE_SOFT_SLAVE:
884 nflags |= HAMMER2_CLUSTER_RDSOFT;
885 break;
886 case HAMMER2_PFSTYPE_SUPROOT:
887 /*
888 * Degenerate cluster representing the super-root
889 * topology on a single device. Fake stuff so
890 * cluster ops work as expected.
891 */
892 nflags |= HAMMER2_CLUSTER_WRHARD;
893 nflags |= HAMMER2_CLUSTER_RDHARD;
894 cluster->focus_index = i;
895 cluster->focus = chain;
896 cluster->error = chain ? chain->error : 0;
897 break;
898 default:
899 break;
900 }
901 }
902
903 /*
904 * Pass 2
905 *
906 * Resolve nmasters - master nodes fully match
907 *
908 * Resolve umasters - master nodes operation still
909 * in progress
910 *
911 * Resolve nmasters_keymatch - master nodes match the passed-in
912 * key and may or may not match
913 * the quorum-agreed tid.
914 *
915 * The quorum-agreed TID is the highest matching TID.
916 */
917 last_best_quorum_tid = HAMMER2_TID_MAX;
918 quorum_tid = 0; /* fix gcc warning */
919
920 while (nmasters < nquorum && last_best_quorum_tid != 0) {
921 nmasters = 0;
922 quorum_tid = 0;
923
924 for (i = 0; i < cluster->nchains; ++i) {
925 /* XXX SOFT smpresent handling */
926 if (cluster->pmp->pfs_types[i] !=
927 HAMMER2_PFSTYPE_MASTER) {
928 continue;
929 }
930
931 chain = cluster->array[i].chain;
932
933 /*
934 * Skip elements still in progress. umasters keeps
935 * track of masters that might still be in-progress.
936 */
937 if (chain == NULL && (cluster->array[i].flags &
938 HAMMER2_CITEM_NULL) == 0) {
939 ++umasters;
940 continue;
941 }
942
943 /*
944 * Key match?
945 */
946 if (flags & HAMMER2_CHECK_NULL) {
947 if (chain == NULL) {
948 ++nmasters;
949 ++nmasters_keymatch;
950 }
951 } else if (chain && chain->bref.key == key) {
952 ++nmasters_keymatch;
953 if (quorum_tid < last_best_quorum_tid &&
954 (quorum_tid < chain->bref.modify_tid ||
955 nmasters == 0)) {
956 /*
957 * Better TID located, reset
958 * nmasters count.
959 */
960 nmasters = 0;
961 quorum_tid = chain->bref.modify_tid;
962 }
963 if (quorum_tid == chain->bref.modify_tid) {
964 /*
965 * TID matches current collection.
966 */
967 ++nmasters;
968 if (chain->error == 0) {
969 cluster->focus = chain;
970 cluster->focus_index = i;
971 }
972 }
973 }
974 }
975 if (nmasters >= nquorum)
976 break;
977 last_best_quorum_tid = quorum_tid;
978 }
979
980 /*
981 kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
982 nmasters, nquorum, nmasters_keymatch, umasters);
983 */
984
985 /*
986 * Early return if we do not have enough masters.
987 */
988 if (nmasters < nquorum) {
989 if (nmasters + umasters >= nquorum)
990 return EINPROGRESS;
991 if (nmasters_keymatch < nquorum)
992 return ESRCH;
993 return EDEADLK;
994 }
995
996 /*
997 * Validated end of scan.
998 */
999 if (flags & HAMMER2_CHECK_NULL)
1000 return ENOENT;
1001
1002 /*
1003 * If we have a NULL focus at this point the agreeing quorum all
1004 * had chain errors.
1005 */
1006 if (cluster->focus == NULL)
1007 return EIO;
1008
1009 /*
1010 * Pass 3
1011 *
1012 * We have quorum agreement, validate elements, not end of scan.
1013 */
1014 for (i = 0; i < cluster->nchains; ++i) {
1015 chain = cluster->array[i].chain;
1016 if (chain == NULL ||
1017 chain->bref.key != key ||
1018 chain->bref.modify_tid != quorum_tid) {
1019 continue;
1020 }
1021
1022 switch (cluster->pmp->pfs_types[i]) {
1023 case HAMMER2_PFSTYPE_MASTER:
1024 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
1025 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1026 nflags |= HAMMER2_CLUSTER_WRHARD;
1027 nflags |= HAMMER2_CLUSTER_RDHARD;
1028 break;
1029 case HAMMER2_PFSTYPE_SLAVE:
1030 /*
1031 * We must have enough up-to-date masters to reach
1032 * a quorum and the slave modify_tid must match the
1033 * quorum's modify_tid.
1034 *
1035 * Do not select an errored slave.
1036 */
1037 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1038 nflags |= HAMMER2_CLUSTER_RDHARD;
1039 ++nslaves;
1040 break;
1041 case HAMMER2_PFSTYPE_SOFT_MASTER:
1042 /*
1043 * Directly mounted soft master always wins. There
1044 * should be only one.
1045 */
1046 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
1047 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1048 break;
1049 case HAMMER2_PFSTYPE_SOFT_SLAVE:
1050 /*
1051 * Directly mounted soft slave always wins. There
1052 * should be only one.
1053 *
1054 * XXX
1055 */
1056 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1057 break;
1058 case HAMMER2_PFSTYPE_SUPROOT:
1059 /*
1060 * spmp (degenerate case)
1061 */
1062 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
1063 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1064 break;
1065 default:
1066 break;
1067 }
1068 }
1069
1070 /*
1071 * Focus now set, adjust ddflag. Skip this pass if the focus
1072 * is bad or if we are at the PFS root (the bref won't match at
1073 * the PFS root, obviously).
1074 */
1075 focus = cluster->focus;
1076 if (focus) {
1077 cluster->ddflag =
1078 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
1079 } else {
1080 cluster->ddflag = 0;
1081 goto skip4;
1082 }
1083 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1084 goto skip4;
1085
1086 /*
1087 * Pass 4
1088 *
1089 * Validate the elements that were not marked invalid. They should
1090 * match.
1091 */
1092 for (i = 0; i < cluster->nchains; ++i) {
1093 int ddflag;
1094
1095 chain = cluster->array[i].chain;
1096
1097 if (chain == NULL)
1098 continue;
1099 if (chain == focus)
1100 continue;
1101 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1102 continue;
1103
1104 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1105 if (chain->bref.type != focus->bref.type ||
1106 chain->bref.key != focus->bref.key ||
1107 chain->bref.keybits != focus->bref.keybits ||
1108 chain->bref.modify_tid != focus->bref.modify_tid ||
1109 chain->bytes != focus->bytes ||
1110 ddflag != cluster->ddflag) {
1111 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1112 if (hammer2_debug & 1)
1113 kprintf("cluster_resolve: matching modify_tid failed "
1114 "bref test: idx=%d type=%02x/%02x "
1115 "key=%016jx/%d-%016jx/%d "
1116 "mod=%016jx/%016jx bytes=%u/%u\n",
1117 i,
1118 chain->bref.type, focus->bref.type,
1119 chain->bref.key, chain->bref.keybits,
1120 focus->bref.key, focus->bref.keybits,
1121 chain->bref.modify_tid, focus->bref.modify_tid,
1122 chain->bytes, focus->bytes);
1123 if (hammer2_debug & 0x4000)
1124 panic("cluster_resolve");
1125 /* flag issue and force resync? */
1126 }
1127 }
1128skip4:
1129
1130 if (ttlslaves == 0)
1131 nflags |= HAMMER2_CLUSTER_NOSOFT;
1132 if (ttlmasters == 0)
1133 nflags |= HAMMER2_CLUSTER_NOHARD;
1134
1135 /*
1136 * Set SSYNCED or MSYNCED for slaves and masters respectively if
1137 * all available nodes (even if 0 are available) are fully
1138 * synchronized. This is used by the synchronization thread to
1139 * determine if there is work it could potentially accomplish.
1140 */
1141 if (nslaves == ttlslaves)
1142 nflags |= HAMMER2_CLUSTER_SSYNCED;
1143 if (nmasters == ttlmasters)
1144 nflags |= HAMMER2_CLUSTER_MSYNCED;
1145
1146 /*
1147 * Determine if the cluster was successfully locked for the
1148 * requested operation and generate an error code. The cluster
1149 * will not be locked (or ref'd) if an error is returned.
1150 *
1151 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
1152 * to determine if reading or writing is possible. If writing, the
1153 * cluster still requires a call to hammer2_cluster_modify() first.
1154 */
1155 atomic_set_int(&cluster->flags, nflags);
1156 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
1157
1158 return 0;
1159}
1160
0057d3b8
MD
1161/*
1162 * This is used by the sync thread to force non-NULL elements of a copy
1163 * of the pmp->iroot cluster to be good which is required to prime the
1164 * sync.
1165 */
1166void
1167hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
1168{
1169 int i;
1170
1171 for (i = 0; i < cluster->nchains; ++i) {
1172 if (cluster->array[i].chain)
1173 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1174 }
1175}
1176
278ab2b2
MD
1177/*
1178 * Copy a cluster, returned a ref'd cluster. All underlying chains
a6cf1052
MD
1179 * are also ref'd, but not locked. Focus state is also copied.
1180 *
1181 * Original cluster does not have to be locked but usually is.
1182 * New cluster will not be flagged as locked.
22211834 1183 *
a6cf1052
MD
1184 * Callers using this function to initialize a new cluster from an inode
1185 * generally lock and resolve the resulting cluster.
1186 *
1187 * Callers which use this function to save/restore a cluster structure
1188 * generally retain the focus state and do not re-resolve it. Caller should
1189 * not try to re-resolve internal (cparent) node state during an iteration
1190 * as the individual tracking elements of cparent in an iteration may not
1191 * match even though they are correct.
278ab2b2
MD
1192 */
1193hammer2_cluster_t *
fe73aa5d 1194hammer2_cluster_copy(hammer2_cluster_t *ocluster)
278ab2b2 1195{
506bd6d1 1196 hammer2_pfs_t *pmp = ocluster->pmp;
278ab2b2 1197 hammer2_cluster_t *ncluster;
837bd39b 1198 hammer2_chain_t *chain;
278ab2b2
MD
1199 int i;
1200
1201 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
1202 ncluster->pmp = pmp;
1203 ncluster->nchains = ocluster->nchains;
fe73aa5d
MD
1204 ncluster->refs = 1;
1205
1206 for (i = 0; i < ocluster->nchains; ++i) {
1207 chain = ocluster->array[i].chain;
1208 ncluster->array[i].chain = chain;
a6cf1052 1209 ncluster->array[i].flags = ocluster->array[i].flags;
fe73aa5d
MD
1210 if (chain)
1211 hammer2_chain_ref(chain);
278ab2b2 1212 }
a6cf1052
MD
1213 ncluster->focus_index = ocluster->focus_index;
1214 ncluster->focus = ocluster->focus;
1215 ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED |
1216 HAMMER2_CLUSTER_INODE);
1217
278ab2b2
MD
1218 return (ncluster);
1219}
1220
1221/*
a6cf1052 1222 * Unlock a cluster. Refcount and focus is maintained.
278ab2b2
MD
1223 */
1224void
eedd52a3 1225hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx)
278ab2b2 1226{
84e47819 1227 hammer2_chain_t *chain;
278ab2b2
MD
1228 int i;
1229
b8ba9690
MD
1230 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
1231 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1232 cluster);
1233 }
e513e77e 1234 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
84e47819 1235 KKASSERT(cluster->refs > 0);
b8ba9690
MD
1236 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
1237
84e47819 1238 for (i = 0; i < cluster->nchains; ++i) {
eedd52a3
MD
1239 if (i == idx)
1240 continue;
4b7e61e0 1241 chain = cluster->array[i].chain;
e513e77e 1242 if (chain)
84e47819 1243 hammer2_chain_unlock(chain);
84e47819 1244 }
278ab2b2
MD
1245}
1246
eedd52a3
MD
1247void
1248hammer2_cluster_unlock(hammer2_cluster_t *cluster)
1249{
1250 hammer2_cluster_unlock_except(cluster, -1);
1251}
1252
278ab2b2
MD
1253/*
1254 * Resize the cluster's physical storage allocation in-place. This may
1255 * replace the cluster's chains.
1256 */
1257void
c603b86b 1258hammer2_cluster_resize(hammer2_inode_t *ip,
278ab2b2
MD
1259 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1260 int nradix, int flags)
1261{
4b7e61e0 1262 hammer2_chain_t *chain;
278ab2b2
MD
1263 int i;
1264
1265 KKASSERT(cparent->pmp == cluster->pmp); /* can be NULL */
1266 KKASSERT(cparent->nchains == cluster->nchains);
1267
1268 for (i = 0; i < cluster->nchains; ++i) {
e513e77e
MD
1269 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1270 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1271 continue;
1272 }
4b7e61e0
MD
1273 chain = cluster->array[i].chain;
1274 if (chain) {
1275 KKASSERT(cparent->array[i].chain);
c603b86b 1276 hammer2_chain_resize(ip,
4b7e61e0 1277 cparent->array[i].chain, chain,
84e47819 1278 nradix, flags);
84e47819 1279 }
278ab2b2 1280 }
278ab2b2
MD
1281}
1282
1283/*
1284 * Set an inode's cluster modified, marking the related chains RW and
1285 * duplicating them if necessary.
1286 *
1287 * The passed-in chain is a localized copy of the chain previously acquired
1288 * when the inode was locked (and possilby replaced in the mean time), and
1289 * must also be updated. In fact, we update it first and then synchronize
1290 * the inode's cluster cache.
1291 */
1292hammer2_inode_data_t *
c603b86b 1293hammer2_cluster_modify_ip(hammer2_inode_t *ip,
278ab2b2
MD
1294 hammer2_cluster_t *cluster, int flags)
1295{
c603b86b
MD
1296 hammer2_inode_modify(ip);
1297 hammer2_cluster_modify(cluster, flags);
278ab2b2 1298 hammer2_inode_repoint(ip, NULL, cluster);
6a5f4fe6 1299 return (&hammer2_cluster_wdata(cluster)->ipdata);
278ab2b2
MD
1300}
1301
1302/*
bca9f8e6
MD
1303 * Adjust the cluster's chains to allow modification and adjust the
1304 * focus. Data will be accessible on return.
23c7c7dd
MD
1305 *
1306 * If our focused master errors on modify, re-resolve the cluster to
1307 * try to select a different master.
278ab2b2
MD
1308 */
1309void
c603b86b 1310hammer2_cluster_modify(hammer2_cluster_t *cluster, int flags)
278ab2b2 1311{
4b7e61e0 1312 hammer2_chain_t *chain;
23c7c7dd 1313 int resolve_again;
278ab2b2
MD
1314 int i;
1315
23c7c7dd 1316 resolve_again = 0;
84e47819 1317 for (i = 0; i < cluster->nchains; ++i) {
e513e77e
MD
1318 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1319 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1320 continue;
1321 }
4b7e61e0 1322 chain = cluster->array[i].chain;
b93cc2e0
MD
1323 if (chain == NULL)
1324 continue;
1325 if (chain->error)
1326 continue;
c603b86b 1327 hammer2_chain_modify(chain, flags);
b93cc2e0
MD
1328 if (cluster->focus == chain && chain->error) {
1329 cluster->error = chain->error;
1330 resolve_again = 1;
23c7c7dd 1331 }
84e47819 1332 }
23c7c7dd
MD
1333 if (resolve_again)
1334 hammer2_cluster_resolve(cluster);
278ab2b2
MD
1335}
1336
84e47819 1337/*
bca9f8e6
MD
1338 * Synchronize modifications from the focus to other chains in a cluster.
1339 * Convenient because nominal API users can just modify the contents of the
1340 * focus (at least for non-blockref data).
84e47819
MD
1341 *
1342 * Nominal front-end operations only edit non-block-table data in a single
1343 * chain. This code copies such modifications to the other chains in the
da6f36f4
MD
1344 * cluster. Blocktable modifications are handled on a chain-by-chain basis
1345 * by both the frontend and the backend and will explode in fireworks if
1346 * blindly copied.
84e47819 1347 */
6a5f4fe6
MD
1348void
1349hammer2_cluster_modsync(hammer2_cluster_t *cluster)
1350{
1351 hammer2_chain_t *focus;
1352 hammer2_chain_t *scan;
1353 const hammer2_inode_data_t *ripdata;
1354 hammer2_inode_data_t *wipdata;
1355 int i;
1356
1357 focus = cluster->focus;
1358 KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
1359
1360 for (i = 0; i < cluster->nchains; ++i) {
e513e77e
MD
1361 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1362 continue;
4b7e61e0 1363 scan = cluster->array[i].chain;
6a5f4fe6
MD
1364 if (scan == NULL || scan == focus)
1365 continue;
b93cc2e0
MD
1366 if (scan->error)
1367 continue;
6a5f4fe6
MD
1368 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
1369 KKASSERT(focus->bytes == scan->bytes &&
1370 focus->bref.type == scan->bref.type);
1371 switch(focus->bref.type) {
1372 case HAMMER2_BREF_TYPE_INODE:
1373 ripdata = &focus->data->ipdata;
1374 wipdata = &scan->data->ipdata;
b0f58de8 1375 if ((ripdata->meta.op_flags &
6a5f4fe6
MD
1376 HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1377 bcopy(ripdata, wipdata,
1378 offsetof(hammer2_inode_data_t, u));
1379 break;
1380 }
5ceaaa82 1381 /* fall through to full copy */
6a5f4fe6
MD
1382 case HAMMER2_BREF_TYPE_DATA:
1383 bcopy(focus->data, scan->data, focus->bytes);
1384 break;
1385 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1386 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1387 case HAMMER2_BREF_TYPE_FREEMAP:
1388 case HAMMER2_BREF_TYPE_VOLUME:
1389 panic("hammer2_cluster_modsync: illegal node type");
1390 /* NOT REACHED */
1391 break;
1392 default:
1393 panic("hammer2_cluster_modsync: unknown node type");
1394 break;
1395 }
1396 }
1397}
1398
278ab2b2 1399/*
a6cf1052
MD
1400 * Lookup initialization/completion API. Returns a locked, fully resolved
1401 * cluster with one ref.
278ab2b2
MD
1402 */
1403hammer2_cluster_t *
1404hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
1405{
1406 hammer2_cluster_t *cluster;
278ab2b2 1407
e513e77e 1408 cluster = hammer2_cluster_copy(cparent);
278ab2b2
MD
1409 if (flags & HAMMER2_LOOKUP_SHARED) {
1410 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
1411 HAMMER2_RESOLVE_SHARED);
1412 } else {
1413 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1414 }
a6cf1052
MD
1415 hammer2_cluster_resolve(cluster);
1416
278ab2b2
MD
1417 return (cluster);
1418}
1419
1420void
1421hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
1422{
e513e77e 1423 if (cparent) {
278ab2b2 1424 hammer2_cluster_unlock(cparent);
e513e77e
MD
1425 hammer2_cluster_drop(cparent);
1426 }
278ab2b2
MD
1427}
1428
1429/*
a6cf1052
MD
1430 * Locate first match or overlap under parent, return a new, locked, resolved
1431 * cluster with one ref.
1432 *
1433 * Must never be called with HAMMER2_LOOKUP_MATCHIND.
278ab2b2
MD
1434 */
1435hammer2_cluster_t *
1436hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
b8ba9690 1437 hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
278ab2b2 1438{
506bd6d1 1439 hammer2_pfs_t *pmp;
278ab2b2
MD
1440 hammer2_cluster_t *cluster;
1441 hammer2_chain_t *chain;
1442 hammer2_key_t key_accum;
1443 hammer2_key_t key_next;
1444 int null_count;
a6cf1052 1445 int rflags;
278ab2b2 1446 int i;
278ab2b2 1447
a6cf1052
MD
1448 KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1449
278ab2b2
MD
1450 pmp = cparent->pmp; /* can be NULL */
1451 key_accum = *key_nextp;
1452 null_count = 0;
a6cf1052
MD
1453 if (flags & HAMMER2_LOOKUP_SHARED)
1454 rflags = HAMMER2_RESOLVE_SHARED;
1455 else
1456 rflags = 0;
278ab2b2
MD
1457
1458 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
1459 cluster->pmp = pmp; /* can be NULL */
1460 cluster->refs = 1;
b8ba9690
MD
1461 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1462 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
278ab2b2 1463
8db69c9f 1464 /*
a6cf1052
MD
1465 * Iterating earlier cluster elements with later elements still
1466 * locked is a problem, so we have to unlock the parent and then
1467 * re-lock as we go.
1468 */
1469 hammer2_cluster_unlock(cparent);
1470 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1471
1472 /*
1473 * Pass-1, issue lookups.
8db69c9f 1474 */
278ab2b2 1475 for (i = 0; i < cparent->nchains; ++i) {
8db69c9f 1476 cluster->array[i].flags = cparent->array[i].flags;
278ab2b2 1477 key_next = *key_nextp;
8db69c9f 1478
a6cf1052
MD
1479 /*
1480 * Always relock the parent as we go.
1481 */
1482 if (cparent->array[i].chain) {
1483 hammer2_chain_lock(cparent->array[i].chain, rflags);
1484 }
1485
8db69c9f
MD
1486 /*
1487 * Nothing to base the lookup, or parent was not synchronized.
1488 */
1489 if (cparent->array[i].chain == NULL ||
1490 (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
84e47819
MD
1491 ++null_count;
1492 continue;
1493 }
8db69c9f 1494
4b7e61e0
MD
1495 chain = hammer2_chain_lookup(&cparent->array[i].chain,
1496 &key_next,
278ab2b2 1497 key_beg, key_end,
4b7e61e0 1498 &cparent->array[i].cache_index,
b8ba9690 1499 flags);
4b7e61e0 1500 cluster->array[i].chain = chain;
278ab2b2
MD
1501 if (chain == NULL) {
1502 ++null_count;
278ab2b2
MD
1503 }
1504 if (key_accum > key_next)
1505 key_accum = key_next;
278ab2b2 1506 }
8db69c9f
MD
1507
1508 /*
a6cf1052 1509 * Cleanup
8db69c9f 1510 */
a6cf1052
MD
1511 cluster->nchains = i;
1512 *key_nextp = key_accum;
8db69c9f
MD
1513
1514 /*
a6cf1052
MD
1515 * The cluster must be resolved, out of sync elements may be present.
1516 *
1517 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
8db69c9f 1518 */
a6cf1052
MD
1519 if (null_count != i)
1520 hammer2_cluster_resolve(cluster);
1521 if (null_count == i ||
1522 (cluster->focus == NULL &&
1523 (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1524 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1525 hammer2_cluster_unlock(cluster);
278ab2b2
MD
1526 hammer2_cluster_drop(cluster);
1527 cluster = NULL;
1528 }
1529
1530 return (cluster);
1531}
1532
1533/*
a6cf1052
MD
1534 * Locate next match or overlap under parent, replace the passed-in cluster.
1535 * The returned cluster is a new, locked, resolved cluster with one ref.
1536 *
1537 * Must never be called with HAMMER2_LOOKUP_MATCHIND.
278ab2b2
MD
1538 */
1539hammer2_cluster_t *
1540hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1541 hammer2_key_t *key_nextp,
1542 hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1543{
8db69c9f
MD
1544 hammer2_chain_t *ochain;
1545 hammer2_chain_t *nchain;
278ab2b2
MD
1546 hammer2_key_t key_accum;
1547 hammer2_key_t key_next;
c7916d0b
MD
1548 int parent_index;
1549 int cluster_index;
278ab2b2 1550 int null_count;
a6cf1052 1551 int rflags;
278ab2b2
MD
1552 int i;
1553
a6cf1052
MD
1554 KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1555
278ab2b2
MD
1556 key_accum = *key_nextp;
1557 null_count = 0;
c7916d0b
MD
1558 parent_index = cparent->focus_index; /* save prior focus */
1559 cluster_index = cluster->focus_index;
a6cf1052
MD
1560 if (flags & HAMMER2_LOOKUP_SHARED)
1561 rflags = HAMMER2_RESOLVE_SHARED;
1562 else
1563 rflags = 0;
c7916d0b
MD
1564
1565 cluster->focus = NULL; /* XXX needed any more? */
1566 /*cparent->focus = NULL;*/
1567 cluster->focus_index = 0; /* XXX needed any more? */
1568 /*cparent->focus_index = 0;*/
278ab2b2 1569
b8ba9690
MD
1570 cluster->ddflag = 0;
1571
c7916d0b
MD
1572 /*
1573 * The parent is always locked on entry, the iterator may be locked
1574 * depending on flags.
1575 *
1576 * We must temporarily unlock the passed-in clusters to avoid a
1577 * deadlock between elements of the cluster with other threads.
1578 * We will fixup the lock in the loop.
1579 *
1580 * Note that this will clear the focus.
1581 *
1582 * Reflag the clusters as locked, because we will relock them
1583 * as we go.
1584 */
1585 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1586 hammer2_cluster_unlock(cluster);
1587 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1588 }
1589 hammer2_cluster_unlock(cparent);
1590 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1591
278ab2b2
MD
1592 for (i = 0; i < cparent->nchains; ++i) {
1593 key_next = *key_nextp;
8db69c9f
MD
1594 ochain = cluster->array[i].chain;
1595
c7916d0b
MD
1596 /*
1597 * Always relock the parent as we go.
1598 */
a6cf1052
MD
1599 if (cparent->array[i].chain)
1600 hammer2_chain_lock(cparent->array[i].chain, rflags);
c7916d0b 1601
8db69c9f
MD
1602 /*
1603 * Nothing to iterate from. These cases can occur under
1604 * normal operations. For example, during synchronization
1605 * a slave might reach the end of its scan while records
1606 * are still left on the master(s).
1607 */
1608 if (ochain == NULL) {
84e47819
MD
1609 ++null_count;
1610 continue;
1611 }
8db69c9f
MD
1612 if (cparent->array[i].chain == NULL ||
1613 (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1614 (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
c7916d0b 1615 /* ochain has not yet been relocked */
e513e77e
MD
1616 hammer2_chain_drop(ochain);
1617 cluster->array[i].chain = NULL;
84e47819
MD
1618 ++null_count;
1619 continue;
1620 }
8db69c9f 1621
c7916d0b
MD
1622 /*
1623 * Relock the child if necessary. Parent and child will then
1624 * be locked as expected by hammer2_chain_next() and flags.
1625 */
1626 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
a6cf1052 1627 hammer2_chain_lock(ochain, rflags);
8db69c9f
MD
1628 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1629 &key_next, key_beg, key_end,
1630 &cparent->array[i].cache_index,
1631 flags);
a6cf1052 1632 /* ochain now invalid but can still be used for focus check */
c7916d0b
MD
1633 if (parent_index == i) {
1634 cparent->focus_index = i;
0cc33e20 1635 cparent->focus = cparent->array[i].chain;
c7916d0b 1636 }
8db69c9f
MD
1637
1638 cluster->array[i].chain = nchain;
1639 if (nchain == NULL) {
278ab2b2 1640 ++null_count;
84e47819 1641 }
278ab2b2
MD
1642 if (key_accum > key_next)
1643 key_accum = key_next;
1644 }
8db69c9f
MD
1645
1646 /*
a6cf1052 1647 * Cleanup
8db69c9f 1648 */
b8ba9690 1649 cluster->nchains = i;
a6cf1052 1650 *key_nextp = key_accum;
278ab2b2 1651
a6cf1052
MD
1652 /*
1653 * The cluster must be resolved, out of sync elements may be present.
1654 *
1655 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1656 */
1657 if (null_count != i)
1658 hammer2_cluster_resolve(cluster);
1659 if (null_count == i ||
1660 (cluster->focus == NULL &&
1661 (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1662 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1663 hammer2_cluster_unlock(cluster);
278ab2b2
MD
1664 hammer2_cluster_drop(cluster);
1665 cluster = NULL;
278ab2b2
MD
1666 }
1667 return(cluster);
1668}
1669
8db69c9f
MD
1670/*
1671 * Advance just one chain in the cluster and recalculate the invalid bit.
0cc33e20
MD
1672 * The cluster index is allowed to be flagged invalid on input and is
1673 * recalculated on return.
1674 *
8db69c9f
MD
1675 * (used during synchronization to advance past a chain being deleted).
1676 *
1677 * The chain being advanced must not be the focus and the clusters in
1678 * question must have already passed normal cluster_lookup/cluster_next
1679 * checks.
1680 *
1681 * The cluster always remains intact on return, so void function.
1682 */
1683void
1684hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1685 hammer2_cluster_t *cluster,
1686 hammer2_key_t *key_nextp,
1687 hammer2_key_t key_beg,
1688 hammer2_key_t key_end,
1689 int i, int flags)
1690{
1691 hammer2_chain_t *ochain;
1692 hammer2_chain_t *nchain;
1693 hammer2_chain_t *focus;
1694 hammer2_key_t key_accum;
1695 hammer2_key_t key_next;
1696 int ddflag;
1697
1698 key_accum = *key_nextp;
1699 key_next = *key_nextp;
1700 ochain = cluster->array[i].chain;
1701 if (ochain == NULL)
1702 goto done;
1703 KKASSERT(ochain != cluster->focus);
1704
1705 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1706 &key_next, key_beg, key_end,
1707 &cparent->array[i].cache_index,
1708 flags);
a6cf1052 1709 /* ochain now invalid */
0cc33e20
MD
1710 if (cparent->focus_index == i)
1711 cparent->focus = cparent->array[i].chain;
8db69c9f
MD
1712
1713 /*
e513e77e
MD
1714 * Install nchain. Note that nchain can be NULL, and can also
1715 * be in an unlocked state depending on flags.
8db69c9f
MD
1716 */
1717 cluster->array[i].chain = nchain;
1718 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1719
1720 if (key_accum > key_next)
1721 key_accum = key_next;
1722
1723 focus = cluster->focus;
1724 if (focus == NULL)
1725 goto done;
1726 if (nchain == NULL)
1727 goto done;
1728#if 0
1729 if (nchain == focus) /* ASSERTED NOT TRUE */
1730 ...
1731#endif
1732 ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1733 if (nchain->bref.type != focus->bref.type ||
1734 nchain->bref.key != focus->bref.key ||
1735 nchain->bref.keybits != focus->bref.keybits ||
e513e77e 1736 nchain->bref.modify_tid != focus->bref.modify_tid ||
8db69c9f
MD
1737 nchain->bytes != focus->bytes ||
1738 ddflag != cluster->ddflag) {
1739 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1740 }
1741
1742done:
1743 *key_nextp = key_accum;
1744#if 0
1745 /*
1746 * For now don't re-resolve cluster->flags.
1747 */
1748 hammer2_cluster_resolve(cluster);
1749#endif
1750}
1751
278ab2b2
MD
1752/*
1753 * Create a new cluster using the specified key
1754 */
1755int
c603b86b 1756hammer2_cluster_create(hammer2_pfs_t *pmp, hammer2_cluster_t *cparent,
278ab2b2 1757 hammer2_cluster_t **clusterp,
b3659de2
MD
1758 hammer2_key_t key, int keybits,
1759 int type, size_t bytes, int flags)
278ab2b2
MD
1760{
1761 hammer2_cluster_t *cluster;
278ab2b2
MD
1762 int error;
1763 int i;
1764
278ab2b2
MD
1765 if ((cluster = *clusterp) == NULL) {
1766 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1767 M_WAITOK | M_ZERO);
1768 cluster->pmp = pmp; /* can be NULL */
1769 cluster->refs = 1;
b8ba9690 1770 cluster->flags = HAMMER2_CLUSTER_LOCKED;
278ab2b2 1771 }
8db69c9f 1772 cluster->focus_index = 0;
84e47819 1773 cluster->focus = NULL;
84e47819
MD
1774
1775 /*
1776 * NOTE: cluster->array[] entries can initially be NULL. If
1777 * *clusterp is supplied, skip NULL entries, otherwise
1778 * create new chains.
1779 */
278ab2b2 1780 for (i = 0; i < cparent->nchains; ++i) {
e513e77e
MD
1781 if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1782 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
84e47819
MD
1783 continue;
1784 }
e513e77e
MD
1785 if (*clusterp) {
1786 if ((cluster->array[i].flags &
1787 HAMMER2_CITEM_FEMOD) == 0) {
1788 cluster->array[i].flags |=
1789 HAMMER2_CITEM_INVALID;
1790 continue;
1791 }
1792 if (cluster->array[i].chain == NULL)
1793 continue;
1794 }
c603b86b 1795 error = hammer2_chain_create(&cparent->array[i].chain,
4b7e61e0 1796 &cluster->array[i].chain, pmp,
b3659de2
MD
1797 key, keybits,
1798 type, bytes, flags);
0cc33e20
MD
1799 if (cparent->focus_index == i)
1800 cparent->focus = cparent->array[i].chain;
278ab2b2 1801 KKASSERT(error == 0);
8db69c9f
MD
1802 if (cluster->focus == NULL) {
1803 cluster->focus_index = i;
4b7e61e0 1804 cluster->focus = cluster->array[i].chain;
8db69c9f
MD
1805 }
1806 if (cparent->focus == cparent->array[i].chain) {
1807 cluster->focus_index = i;
b8ba9690 1808 cluster->focus = cluster->array[i].chain;
8db69c9f 1809 }
278ab2b2 1810 }
84e47819 1811 cluster->nchains = i;
278ab2b2 1812 *clusterp = cluster;
23c7c7dd 1813 hammer2_cluster_resolve(cluster);
278ab2b2
MD
1814
1815 return error;
1816}
1817
1818/*
da6f36f4 1819 * Rename a cluster to a new parent.
84e47819 1820 *
b93cc2e0
MD
1821 * WARNING! Any passed-in bref is probaly from hammer2_cluster_bref(),
1822 * So the data_off field is not relevant. Only the key and
1823 * keybits are used.
278ab2b2
MD
1824 */
1825void
c603b86b 1826hammer2_cluster_rename(hammer2_blockref_t *bref,
b3659de2
MD
1827 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1828 int flags)
278ab2b2 1829{
84e47819
MD
1830 hammer2_chain_t *chain;
1831 hammer2_blockref_t xbref;
278ab2b2
MD
1832 int i;
1833
0cc33e20 1834#if 0
84e47819
MD
1835 cluster->focus = NULL;
1836 cparent->focus = NULL;
8db69c9f
MD
1837 cluster->focus_index = 0;
1838 cparent->focus_index = 0;
0cc33e20 1839#endif
84e47819 1840
278ab2b2 1841 for (i = 0; i < cluster->nchains; ++i) {
e513e77e
MD
1842 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1843 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1844 continue;
1845 }
4b7e61e0 1846 chain = cluster->array[i].chain;
84e47819
MD
1847 if (chain) {
1848 if (bref) {
1849 xbref = chain->bref;
1850 xbref.key = bref->key;
1851 xbref.keybits = bref->keybits;
c603b86b 1852 hammer2_chain_rename(&xbref,
4b7e61e0 1853 &cparent->array[i].chain,
b3659de2 1854 chain, flags);
84e47819 1855 } else {
c603b86b 1856 hammer2_chain_rename(NULL,
4b7e61e0 1857 &cparent->array[i].chain,
b3659de2 1858 chain, flags);
84e47819 1859 }
0cc33e20
MD
1860 if (cparent->focus_index == i)
1861 cparent->focus = cparent->array[i].chain;
22211834 1862 KKASSERT(cluster->array[i].chain == chain); /*remove*/
84e47819 1863 }
278ab2b2 1864 }
278ab2b2
MD
1865}
1866
1867/*
da6f36f4 1868 * Mark a cluster deleted
278ab2b2
MD
1869 */
1870void
c603b86b 1871hammer2_cluster_delete(hammer2_cluster_t *cparent,
da6f36f4 1872 hammer2_cluster_t *cluster, int flags)
278ab2b2 1873{
84e47819 1874 hammer2_chain_t *chain;
da6f36f4 1875 hammer2_chain_t *parent;
278ab2b2
MD
1876 int i;
1877
da6f36f4
MD
1878 if (cparent == NULL) {
1879 kprintf("cparent is NULL\n");
1880 return;
278ab2b2 1881 }
278ab2b2
MD
1882
1883 for (i = 0; i < cluster->nchains; ++i) {
e513e77e
MD
1884 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1885 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1886 continue;
1887 }
1888 parent = cparent->array[i].chain;
4b7e61e0 1889 chain = cluster->array[i].chain;
850687d2
MD
1890 if (chain == NULL)
1891 continue;
1892 if (chain->parent != parent) {
1893 kprintf("hammer2_cluster_delete: parent "
1894 "mismatch chain=%p parent=%p against=%p\n",
1895 chain, chain->parent, parent);
1896 } else {
c603b86b 1897 hammer2_chain_delete(parent, chain, flags);
da6f36f4 1898 }
278ab2b2
MD
1899 }
1900}
1901
1902/*
1903 * Create a snapshot of the specified {parent, ochain} with the specified
1904 * label. The originating hammer2_inode must be exclusively locked for
1905 * safety.
1906 *
1907 * The ioctl code has already synced the filesystem.
1908 */
1909int
c603b86b
MD
1910hammer2_cluster_snapshot(hammer2_cluster_t *ocluster,
1911 hammer2_ioc_pfs_t *pmp)
278ab2b2 1912{
506bd6d1 1913 hammer2_dev_t *hmp;
bca9f8e6 1914 const hammer2_inode_data_t *ripdata;
6a5f4fe6 1915 hammer2_inode_data_t *wipdata;
4b7e61e0 1916 hammer2_chain_t *nchain;
278ab2b2
MD
1917 hammer2_inode_t *nip;
1918 size_t name_len;
1919 hammer2_key_t lhc;
1920 struct vattr vat;
5ceaaa82 1921#if 0
278ab2b2 1922 uuid_t opfs_clid;
5ceaaa82 1923#endif
278ab2b2
MD
1924 int error;
1925
c603b86b 1926 kprintf("snapshot %s\n", pmp->name);
278ab2b2 1927
c603b86b
MD
1928 name_len = strlen(pmp->name);
1929 lhc = hammer2_dirhash(pmp->name, name_len);
278ab2b2 1930
bca9f8e6
MD
1931 /*
1932 * Get the clid
1933 */
1934 ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
5ceaaa82 1935#if 0
b0f58de8 1936 opfs_clid = ripdata->meta.pfs_clid;
5ceaaa82
MD
1937#endif
1938 hmp = ocluster->focus->hmp; /* XXX find synchronized local disk */
278ab2b2
MD
1939
1940 /*
1941 * Create the snapshot directory under the super-root
1942 *
1943 * Set PFS type, generate a unique filesystem id, and generate
1944 * a cluster id. Use the same clid when snapshotting a PFS root,
1945 * which theoretically allows the snapshot to be used as part of
1946 * the same cluster (perhaps as a cache).
1947 *
1948 * Copy the (flushed) blockref array. Theoretically we could use
1949 * chain_duplicate() but it becomes difficult to disentangle
1950 * the shared core so for now just brute-force it.
1951 */
1952 VATTR_NULL(&vat);
1953 vat.va_type = VDIR;
1954 vat.va_mode = 0755;
c603b86b
MD
1955 nip = hammer2_inode_create(hmp->spmp->iroot, &vat, proc0.p_ucred,
1956 pmp->name, name_len,
e12ae3a5 1957 1, 0, 0,
506bd6d1 1958 HAMMER2_INSERT_PFSROOT, &error);
278ab2b2
MD
1959
1960 if (nip) {
c603b86b
MD
1961 hammer2_inode_modify(nip);
1962 nchain = hammer2_inode_chain(nip, 0, HAMMER2_RESOLVE_ALWAYS);
1963 hammer2_chain_modify(nchain, 0);
1964 wipdata = &nchain->data->ipdata;
1965
1966 nip->meta.pfs_type = HAMMER2_PFSTYPE_MASTER;
1967 nip->meta.pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1968 nip->meta.op_flags |= HAMMER2_OPFLAG_PFSROOT;
1969 kern_uuidgen(&nip->meta.pfs_fsid, 1);
5ceaaa82
MD
1970
1971 /*
c603b86b
MD
1972 * Give the snapshot its own private cluster id. As a
1973 * snapshot no further synchronization with the original
1974 * cluster will be done.
5ceaaa82
MD
1975 */
1976#if 0
18e8ab5f 1977 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
c603b86b 1978 nip->meta.pfs_clid = opfs_clid;
278ab2b2 1979 else
c603b86b 1980 kern_uuidgen(&nip->meta.pfs_clid, 1);
5ceaaa82 1981#endif
c603b86b
MD
1982 kern_uuidgen(&nip->meta.pfs_clid, 1);
1983 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
278ab2b2
MD
1984
1985 /* XXX hack blockset copy */
6a5f4fe6
MD
1986 /* XXX doesn't work with real cluster */
1987 KKASSERT(ocluster->nchains == 1);
c603b86b 1988 wipdata->meta = nip->meta;
bca9f8e6 1989 wipdata->u.blockset = ripdata->u.blockset;
c603b86b
MD
1990 hammer2_flush(nchain, 1);
1991 hammer2_chain_unlock(nchain);
1992 hammer2_chain_drop(nchain);
1993 hammer2_inode_unlock(nip, NULL);
278ab2b2
MD
1994 }
1995 return (error);
1996}
1f179146 1997
da6f36f4
MD
1998/*
1999 * Return locked parent cluster given a locked child. The child remains
7750fd72
MD
2000 * locked on return. The new parent's focus follows the child's focus
2001 * and the parent is always resolved.
c7916d0b
MD
2002 *
2003 * We must temporarily unlock the passed-in cluster to avoid a deadlock
2004 * between elements of the cluster.
a6cf1052
MD
2005 *
2006 * We must not try to hammer2_cluster_resolve() cparent. The individual
2007 * parent chains for the nodes are the correct parents for the cluster but
2008 * do not necessarily match, so resolve would likely implode.
da6f36f4
MD
2009 */
2010hammer2_cluster_t *
2011hammer2_cluster_parent(hammer2_cluster_t *cluster)
2012{
2013 hammer2_cluster_t *cparent;
2014 int i;
2015
fe73aa5d 2016 cparent = hammer2_cluster_copy(cluster);
c7916d0b 2017 hammer2_cluster_unlock(cluster);
b8ba9690 2018
fe73aa5d 2019 for (i = 0; i < cparent->nchains; ++i) {
da6f36f4
MD
2020 hammer2_chain_t *chain;
2021 hammer2_chain_t *rchain;
2022
fe73aa5d
MD
2023 /*
2024 * Calculate parent for each element. Old chain has an extra
2025 * ref for cparent but the lock remains with cluster.
2026 */
2027 chain = cparent->array[i].chain;
da6f36f4
MD
2028 if (chain == NULL)
2029 continue;
da6f36f4
MD
2030 while ((rchain = chain->parent) != NULL) {
2031 hammer2_chain_ref(rchain);
da6f36f4 2032 hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
da6f36f4
MD
2033 if (chain->parent == rchain)
2034 break;
2035 hammer2_chain_unlock(rchain);
e513e77e 2036 hammer2_chain_drop(rchain);
da6f36f4 2037 }
4b7e61e0 2038 cparent->array[i].chain = rchain;
fe73aa5d 2039 hammer2_chain_drop(chain);
da6f36f4 2040 }
b8ba9690 2041 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
a6cf1052 2042 /* hammer2_cluster_resolve(cparent); */
c7916d0b 2043 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
b8ba9690 2044
da6f36f4
MD
2045 return cparent;
2046}
2047
bca9f8e6
MD
2048/************************************************************************
2049 * CLUSTER I/O *
2050 ************************************************************************
2051 *
2052 *
2053 * WARNING! blockref[] array data is not universal. These functions should
2054 * only be used to access universal data.
2055 *
2056 * NOTE! The rdata call will wait for at least one of the chain I/Os to
2057 * complete if necessary. The I/O's should have already been
2058 * initiated by the cluster_lock/chain_lock operation.
2059 *
2060 * The cluster must already be in a modified state before wdata
2061 * is called. The data will already be available for this case.
2062 */
2063const hammer2_media_data_t *
2064hammer2_cluster_rdata(hammer2_cluster_t *cluster)
2065{
c847e838
MD
2066 KKASSERT(cluster->focus != NULL);
2067 return(cluster->focus->data);
2068}
2069
2070const hammer2_media_data_t *
2071hammer2_cluster_rdata_bytes(hammer2_cluster_t *cluster, size_t *bytesp)
2072{
2073 KKASSERT(cluster->focus != NULL);
2074 *bytesp = cluster->focus->bytes;
bca9f8e6
MD
2075 return(cluster->focus->data);
2076}
2077
2078hammer2_media_data_t *
2079hammer2_cluster_wdata(hammer2_cluster_t *cluster)
2080{
c847e838 2081 KKASSERT(cluster->focus != NULL);
bca9f8e6
MD
2082 KKASSERT(hammer2_cluster_modified(cluster));
2083 return(cluster->focus->data);
2084}
2085
2086/*
b8ba9690
MD
2087 * Load cluster data asynchronously with callback.
2088 *
2089 * The callback is made for the first validated data found, or NULL
2090 * if no valid data is available.
bca9f8e6
MD
2091 *
2092 * NOTE! The cluster structure is either unique or serialized (e.g. embedded
2093 * in the inode with an exclusive lock held), the chain structure may be
2094 * shared.
2095 */
2096void
2097hammer2_cluster_load_async(hammer2_cluster_t *cluster,
2098 void (*callback)(hammer2_iocb_t *iocb), void *ptr)
2099{
2100 hammer2_chain_t *chain;
2101 hammer2_iocb_t *iocb;
506bd6d1 2102 hammer2_dev_t *hmp;
bca9f8e6
MD
2103 hammer2_blockref_t *bref;
2104 int i;
2105
0a61bbb3
MD
2106 i = cluster->focus_index;
2107 chain = cluster->focus;
bca9f8e6
MD
2108
2109 iocb = &cluster->iocb;
2110 iocb->callback = callback;
2111 iocb->dio = NULL; /* for already-validated case */
2112 iocb->cluster = cluster;
2113 iocb->chain = chain;
2114 iocb->ptr = ptr;
2115 iocb->lbase = (off_t)i;
2116 iocb->flags = 0;
2117 iocb->error = 0;
2118
2119 /*
2120 * Data already validated
2121 */
2122 if (chain->data) {
2123 callback(iocb);
2124 return;
2125 }
2126
2127 /*
2128 * We must resolve to a device buffer, either by issuing I/O or
2129 * by creating a zero-fill element. We do not mark the buffer
2130 * dirty when creating a zero-fill element (the hammer2_chain_modify()
2131 * API must still be used to do that).
2132 *
2133 * The device buffer is variable-sized in powers of 2 down
2134 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage
2135 * chunk always contains buffers of the same size. (XXX)
2136 *
2137 * The minimum physical IO size may be larger than the variable
2138 * block size.
b8ba9690
MD
2139 *
2140 * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
2141 * matches hammer2_devblksize()? Or does the freemap's
2142 * pre-zeroing handle the case for us?
bca9f8e6
MD
2143 */
2144 bref = &chain->bref;
2145 hmp = chain->hmp;
2146
2147#if 0
2148 /* handled by callback? <- TODO XXX even needed for loads? */
2149 /*
2150 * The getblk() optimization for a 100% overwrite can only be used
2151 * if the physical block size matches the request.
2152 */
2153 if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
2154 chain->bytes == hammer2_devblksize(chain->bytes)) {
2155 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
2156 KKASSERT(error == 0);
2157 iocb->dio = dio;
2158 callback(iocb);
2159 return;
2160 }
2161#endif
2162
2163 /*
2164 * Otherwise issue a read
2165 */
2166 hammer2_adjreadcounter(&chain->bref, chain->bytes);
2167 hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
2168}