2 * Copyright (c) 2011-2013 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
18 * 3. Neither the name of The DragonFly Project nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific, prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * This subsystem implements most of the core support functions for
37 * the hammer2_chain and hammer2_chain_core structures.
39 * Chains represent the filesystem media topology in-memory. Any given
40 * chain can represent an inode, indirect block, data, or other types
43 * This module provides APIs for direct and indirect block searches,
44 * iterations, recursions, creation, deletion, replication, and snapshot
45 * views (used by the flush and snapshot code).
47 * Generally speaking any modification made to a chain must propagate all
48 * the way back to the volume header, issuing copy-on-write updates to the
49 * blockref tables all the way up. Any chain except the volume header itself
50 * can be flushed to disk at any time, in any order. None of it matters
51 * until we get to the point where we want to synchronize the volume header
52 * (see the flush code).
54 * The chain structure supports snapshot views in time, which are primarily
55 * used until the related data and meta-data is flushed to allow the
56 * filesystem to make snapshots without requiring it to first flush,
57 * and to allow the filesystem flush and modify the filesystem concurrently
58 * with minimal or no stalls.
60 #include <sys/cdefs.h>
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/types.h>
65 #include <sys/kern_syscall.h>
/*
 * File-scope state and forward declarations for this module.
 *
 * NOTE(review): this extract is garbled -- each line carries a stray
 * leading number and interior lines are missing; restore from the
 * pristine source before compiling.
 */
70 static int hammer2_indirect_optimize; /* XXX SYSCTL */
72 static hammer2_chain_t *hammer2_chain_create_indirect(
73 hammer2_trans_t *trans, hammer2_chain_t *parent,
74 hammer2_key_t key, int keybits, int for_type, int *errorp);
75 static void adjreadcounter(hammer2_blockref_t *bref, size_t bytes);
/*
 * NOTE(review): the comment below lost its opening marker in this
 * extract.  RB_GENERATE emits the hammer2_chain_tree insert/remove/
 * lookup functions keyed by hammer2_chain_cmp (defined just below).
 */
78 * We use a red-black tree to guarantee safe lookups under shared locks.
80 * Chains can be overloaded onto the same index, creating a different
81 * view of a blockref table based on a transaction id. The RBTREE
82 * deconflicts the view by sub-sorting on delete_tid.
84 * NOTE: Any 'current' chain which is not yet deleted will have a
85 * delete_tid of HAMMER2_MAX_TID (0xFFF....FFFLLU).
87 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
90 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
92 if (chain1->index < chain2->index)
94 if (chain1->index > chain2->index)
96 if (chain1->delete_tid < chain2->delete_tid)
98 if (chain1->delete_tid > chain2->delete_tid)
105 hammer2_isclusterable(hammer2_chain_t *chain)
107 if (hammer2_cluster_enable) {
108 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
109 chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
110 chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
/*
 * NOTE(review): elided extract -- the function's return type, braces,
 * and the early-return body are missing, and lines carry stray
 * numbers.  Per the surviving text: walks chain->above toward the
 * root, setting HAMMER2_CHAIN_SUBMODIFIED on the first non-refactoring
 * parent at each level, under the per-core spinlock; skipped entirely
 * for flush transactions.
 */
118 * Recursively set the SUBMODIFIED flag up to the root starting at chain's
119 * parent. SUBMODIFIED is not set in chain itself.
121 * This function only operates on current-time transactions and is not
122 * used during flushes. Instead, the flush code manages the flag itself.
125 hammer2_chain_setsubmod(hammer2_trans_t *trans, hammer2_chain_t *chain)
127 hammer2_chain_core_t *above;
129 if (trans->flags & HAMMER2_TRANS_ISFLUSH)
131 while ((above = chain->above) != NULL) {
132 spin_lock(&above->cst.spin);
133 chain = above->first_parent;
134 while (hammer2_chain_refactor_test(chain, 1))
135 chain = chain->next_parent;
136 atomic_set_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
137 spin_unlock(&above->cst.spin);
/*
 * NOTE(review): elided extract -- the switch(bref->type) header, break
 * statements, closing braces, and several field assignments are
 * missing.  Per the surviving text: allocates a disconnected chain for
 * a bref (refs presumably initialized to 1 -- confirm against pristine
 * source), panics on volume/freemap or unknown types, and stamps
 * modify_tid from the transaction.
 */
142 * Allocate a new disconnected chain element representing the specified
143 * bref. chain->refs is set to 1 and the passed bref is copied to
144 * chain->bref. chain->bytes is derived from the bref.
146 * chain->core is NOT allocated and the media data and bp pointers are left
147 * NULL. The caller must call chain_core_alloc() to allocate or associate
148 * a core with the chain.
150 * NOTE: Returns a referenced but unlocked (because there is no core) chain.
153 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_trans_t *trans,
154 hammer2_blockref_t *bref)
156 hammer2_chain_t *chain;
157 u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
160 * Construct the appropriate system structure.
163 case HAMMER2_BREF_TYPE_INODE:
164 case HAMMER2_BREF_TYPE_INDIRECT:
165 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
166 case HAMMER2_BREF_TYPE_DATA:
167 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
168 chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
170 case HAMMER2_BREF_TYPE_VOLUME:
171 case HAMMER2_BREF_TYPE_FREEMAP:
/* volume/freemap chains are embedded, never allocated via this path */
173 panic("hammer2_chain_alloc volume type illegal for op");
176 panic("hammer2_chain_alloc: unrecognized blockref type: %d",
182 chain->index = -1;		/* not yet assigned */
183 chain->bytes = bytes;
185 chain->flags = HAMMER2_CHAIN_ALLOCATED;
186 chain->delete_tid = HAMMER2_MAX_TID;
188 chain->modify_tid = trans->sync_tid;
/*
 * NOTE(review): elided extract -- the function's return type, braces,
 * the core==NULL/else branch structure, and the scan loop body are
 * missing.  Per the surviving text: either allocates a fresh core
 * (RB_INIT + ccms_cst_init, chain becomes first_parent) or attaches
 * the chain to an existing shared core, bumping sharecnt and linking
 * it onto the first_parent list under the core spinlock (the link
 * holds a chain ref).
 */
194 * Associate an existing core with the chain or allocate a new core.
196 * The core is not locked. No additional refs on the chain are made.
199 hammer2_chain_core_alloc(hammer2_chain_t *chain, hammer2_chain_core_t *core)
201 hammer2_chain_t **scanp;
203 KKASSERT(chain->core == NULL);
204 KKASSERT(chain->next_parent == NULL);
207 core = kmalloc(sizeof(*core), chain->hmp->mchain,
209 RB_INIT(&core->rbtree);
212 ccms_cst_init(&core->cst, chain);
213 core->first_parent = chain;
215 atomic_add_int(&core->sharecnt, 1);
217 spin_lock(&core->cst.spin);
218 if (core->first_parent == NULL) {
219 core->first_parent = chain;
221 scanp = &core->first_parent;
223 scanp = &(*scanp)->next_parent;
225 hammer2_chain_ref(chain);	/* next_parent link */
227 spin_unlock(&core->cst.spin);
232 * Add a reference to a chain element, preventing its destruction.
235 hammer2_chain_ref(hammer2_chain_t *chain)
237 atomic_add_int(&chain->refs, 1);
/*
 * NOTE(review): elided extract -- the retry loop structure, the `need`
 * accounting (MOVED/MODIFIED each pin a ref), the refs==1 branch into
 * lastdrop, and the braces are missing.  The surviving lines show the
 * classic cmpset retry pattern: refs>1 drops with a CAS, refs==1 goes
 * through hammer2_chain_lastdrop().
 */
241 * Drop the caller's reference to the chain. When the ref count drops to
242 * zero this function will disassociate the chain from its parent and
243 * deallocate it, then recursively drop the parent using the implied ref
244 * from the chain's chain->parent.
246 * WARNING! Just because we are able to deallocate a chain doesn't mean
247 * that chain->core->rbtree is empty. There can still be a sharecnt
248 * on chain->core and RBTREE entries that refer to different parents.
250 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain);
253 hammer2_chain_drop(hammer2_chain_t *chain)
259 if (chain->flags & HAMMER2_CHAIN_MOVED)
261 if (chain->flags & HAMMER2_CHAIN_MODIFIED)
263 KKASSERT(chain->refs > need);
272 chain = hammer2_chain_lastdrop(chain);
274 if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
276 /* retry the same chain */
/*
 * NOTE(review): elided extract of an intricate lock-ordering routine
 * (core and parent spinlocks, CAS 1->0 transitions, recursive-drop
 * candidate selection).  Many interior lines -- return statements,
 * else branches, braces -- are missing; do NOT restyle or rely on this
 * text for exact control flow, restore from the pristine source.
 */
282 * Safe handling of the 1->0 transition on chain. Returns a chain for
283 * recursive drop or NULL, possibly returning the same chain of the atomic
286 * The cst spinlock is allowed nest child-to-parent (not parent-to-child).
290 hammer2_chain_lastdrop(hammer2_chain_t *chain)
292 hammer2_mount_t *hmp;
293 hammer2_chain_core_t *above;
294 hammer2_chain_core_t *core;
295 hammer2_chain_t *rdrop1;
296 hammer2_chain_t *rdrop2;
299 * Spinlock the core and check to see if it is empty. If it is
300 * not empty we leave chain intact with refs == 0.
302 if ((core = chain->core) != NULL) {
303 spin_lock(&core->cst.spin);
304 if (RB_ROOT(&core->rbtree)) {
305 if (atomic_cmpset_int(&chain->refs, 1, 0)) {
306 /* 1->0 transition successful */
307 spin_unlock(&core->cst.spin);
310 /* 1->0 transition failed, retry */
311 spin_unlock(&core->cst.spin);
322 * Spinlock the parent and try to drop the last ref. On success
323 * remove chain from its parent.
325 if ((above = chain->above) != NULL) {
326 spin_lock(&above->cst.spin);
327 if (!atomic_cmpset_int(&chain->refs, 1, 0)) {
328 /* 1->0 transition failed */
329 spin_unlock(&above->cst.spin);
331 spin_unlock(&core->cst.spin);
337 * 1->0 transition successful
339 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
340 RB_REMOVE(hammer2_chain_tree, &above->rbtree, chain);
341 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
345 * Calculate a chain to return for a recursive drop.
347 * XXX this needs help, we have a potential deep-recursion
348 * problem which we try to address but sometimes we wind up
349 * with two elements that have to be dropped.
351 * If the chain has an associated core with refs at 0
352 * the chain must be the first in the core's linked list
353 * by definition, and we will recursively drop the ref
354 * implied by the chain->next_parent field.
356 * Otherwise if the rbtree containing chain is empty we try
357 * to recursively drop our parent (only the first one could
358 * possibly have refs == 0 since the rest are linked via
361 * Otherwise we try to recursively drop a sibling.
363 if (chain->next_parent) {
364 KKASSERT(core != NULL);
365 rdrop1 = chain->next_parent;
367 if (RB_EMPTY(&above->rbtree)) {
368 rdrop2 = above->first_parent;
369 if (rdrop2 == NULL || rdrop2->refs ||
370 atomic_cmpset_int(&rdrop2->refs, 0, 1) == 0) {
374 rdrop2 = RB_ROOT(&above->rbtree);
375 if (atomic_cmpset_int(&rdrop2->refs, 0, 1) == 0)
378 spin_unlock(&above->cst.spin);
379 above = NULL;	/* safety */
381 if (chain->next_parent) {
382 KKASSERT(core != NULL);
383 rdrop1 = chain->next_parent;
388 * We still have the core spinlock (if core is non-NULL). The
389 * above spinlock is gone.
392 KKASSERT(core->first_parent == chain);
393 if (chain->next_parent) {
394 /* parent should already be set */
395 KKASSERT(rdrop1 == chain->next_parent);
397 core->first_parent = chain->next_parent;
398 chain->next_parent = NULL;
401 if (atomic_fetchadd_int(&core->sharecnt, -1) == 1) {
403 * On the 1->0 transition of core we can destroy
406 spin_unlock(&core->cst.spin);
407 KKASSERT(core->cst.count == 0);
408 KKASSERT(core->cst.upgrade == 0);
409 kfree(core, hmp->mchain);
411 spin_unlock(&core->cst.spin);
413 core = NULL;	/* safety */
417 * All spin locks are gone, finish freeing stuff.
419 KKASSERT((chain->flags & (HAMMER2_CHAIN_MOVED |
420 HAMMER2_CHAIN_MODIFIED)) == 0);
422 switch(chain->bref.type) {
423 case HAMMER2_BREF_TYPE_VOLUME:
424 case HAMMER2_BREF_TYPE_FREEMAP:
427 case HAMMER2_BREF_TYPE_INODE:
429 kfree(chain->data, hmp->mchain);
433 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
435 kfree(chain->data, hmp->mchain);
440 KKASSERT(chain->data == NULL);
444 KKASSERT(chain->bp == NULL);
447 if (chain->flags & HAMMER2_CHAIN_ALLOCATED) {
448 chain->flags &= ~HAMMER2_CHAIN_ALLOCATED;
449 kfree(chain, hmp->mchain);
451 if (rdrop1 && rdrop2) {
452 hammer2_chain_drop(rdrop1);
/*
 * NOTE(review): heavily elided extract -- goto targets, `done:` label,
 * break statements, error-path returns, and several declarations
 * (ostate, error, psize, pbase, boff, peof, bdata) are missing.  The
 * I/O-resolution path (shared->exclusive upgrade, getblk/cluster_read/
 * bread, INITIAL zero-fill, embedded-vs-device data pointer setup) is
 * only partially visible; restore from the pristine source before
 * modifying.
 */
461 * Ref and lock a chain element, acquiring its data with I/O if necessary,
462 * and specify how you would like the data to be resolved.
464 * Returns 0 on success or an error code if the data could not be acquired.
465 * The chain element is locked on return regardless of whether an error
468 * The lock is allowed to recurse, multiple locking ops will aggregate
469 * the requested resolve types. Once data is assigned it will not be
470 * removed until the last unlock.
472 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
473 * (typically used to avoid device/logical buffer
476 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
477 * the INITIAL-create state (indirect blocks only).
479 * Do not resolve data elements for DATA chains.
480 * (typically used to avoid device/logical buffer
483 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
485 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
486 * it will be locked exclusive.
488 * NOTE: Embedded elements (volume header, inodes) are always resolved
491 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
492 * element will instantiate and zero its buffer, and flush it on
495 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
496 * so as not to instantiate a device buffer, which could alias against
497 * a logical file buffer. However, if ALWAYS is specified the
498 * device buffer will be instantiated anyway.
500 * WARNING! If data must be fetched a shared lock will temporarily be
501 * upgraded to exclusive. However, a deadlock can occur if
502 * the caller owns more than one shared lock.
505 hammer2_chain_lock(hammer2_chain_t *chain, int how)
507 hammer2_mount_t *hmp;
508 hammer2_chain_core_t *core;
509 hammer2_blockref_t *bref;
520 * Ref and lock the element. Recursive locks are allowed.
522 if ((how & HAMMER2_RESOLVE_NOREF) == 0)
523 hammer2_chain_ref(chain);
524 atomic_add_int(&chain->lockcnt, 1);
527 KKASSERT(hmp != NULL);
530 * Get the appropriate lock.
533 if (how & HAMMER2_RESOLVE_SHARED)
534 ccms_thread_lock(&core->cst, CCMS_STATE_SHARED);
536 ccms_thread_lock(&core->cst, CCMS_STATE_EXCLUSIVE);
539 * If we already have a valid data pointer no further action is
546 * Do we have to resolve the data?
548 switch(how & HAMMER2_RESOLVE_MASK) {
549 case HAMMER2_RESOLVE_NEVER:
551 case HAMMER2_RESOLVE_MAYBE:
552 if (chain->flags & HAMMER2_CHAIN_INITIAL)
554 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
557 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
560 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
563 case HAMMER2_RESOLVE_ALWAYS:
568 * Upgrade to an exclusive lock so we can safely manipulate the
569 * buffer cache. If another thread got to it before us we
572 ostate = ccms_thread_lock_upgrade(&core->cst);
574 ccms_thread_lock_downgrade(&core->cst, ostate);
579 * We must resolve to a device buffer, either by issuing I/O or
580 * by creating a zero-fill element. We do not mark the buffer
581 * dirty when creating a zero-fill element (the hammer2_chain_modify()
582 * API must still be used to do that).
584 * The device buffer is variable-sized in powers of 2 down
585 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage
586 * chunk always contains buffers of the same size. (XXX)
588 * The minimum physical IO size may be larger than the variable
593 psize = hammer2_devblksize(chain->bytes);
594 pmask = (hammer2_off_t)psize - 1;
595 pbase = bref->data_off & ~pmask;
596 boff = bref->data_off & (HAMMER2_OFF_MASK & pmask);
597 KKASSERT(pbase != 0);
598 peof = (pbase + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64;
601 * The getblk() optimization can only be used on newly created
602 * elements if the physical block size matches the request.
604 if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
605 chain->bytes == psize) {
606 chain->bp = getblk(hmp->devvp, pbase, psize, 0, 0);
608 } else if (hammer2_isclusterable(chain)) {
609 error = cluster_read(hmp->devvp, peof, pbase, psize,
610 psize, HAMMER2_PBUFSIZE*4,
612 adjreadcounter(&chain->bref, chain->bytes);
614 error = bread(hmp->devvp, pbase, psize, &chain->bp);
615 adjreadcounter(&chain->bref, chain->bytes);
619 kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
620 (intmax_t)pbase, error);
623 ccms_thread_lock_downgrade(&core->cst, ostate);
628 * Zero the data area if the chain is in the INITIAL-create state.
629 * Mark the buffer for bdwrite(). This clears the INITIAL state
630 * but does not mark the chain modified.
632 bdata = (char *)chain->bp->b_data + boff;
633 if (chain->flags & HAMMER2_CHAIN_INITIAL) {
634 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
635 bzero(bdata, chain->bytes);
636 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
640 * Setup the data pointer, either pointing it to an embedded data
641 * structure and copying the data from the buffer, or pointing it
644 * The buffer is not retained when copying to an embedded data
645 * structure in order to avoid potential deadlocks or recursions
646 * on the same physical buffer.
648 switch (bref->type) {
649 case HAMMER2_BREF_TYPE_VOLUME:
650 case HAMMER2_BREF_TYPE_FREEMAP:
652 * Copy data from bp to embedded buffer
654 panic("hammer2_chain_lock: called on unresolved volume header");
657 KKASSERT(pbase == 0);
658 KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
659 bcopy(bdata, &hmp->voldata, chain->bytes);
660 chain->data = (void *)&hmp->voldata;
665 case HAMMER2_BREF_TYPE_INODE:
667 * Copy data from bp to embedded buffer, do not retain the
670 KKASSERT(chain->bytes == sizeof(chain->data->ipdata));
671 atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED);
672 chain->data = kmalloc(sizeof(chain->data->ipdata),
673 hmp->mchain, M_WAITOK | M_ZERO);
674 bcopy(bdata, &chain->data->ipdata, chain->bytes);
678 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
679 KKASSERT(chain->bytes == sizeof(chain->data->bmdata));
680 atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED);
681 chain->data = kmalloc(sizeof(chain->data->bmdata),
682 hmp->mchain, M_WAITOK | M_ZERO);
683 bcopy(bdata, &chain->data->bmdata, chain->bytes);
687 case HAMMER2_BREF_TYPE_INDIRECT:
688 case HAMMER2_BREF_TYPE_DATA:
689 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
692 * Point data at the device buffer and leave bp intact.
694 chain->data = (void *)bdata;
699 * Make sure the bp is not specifically owned by this thread before
700 * restoring to a possibly shared lock, so another hammer2 thread
704 BUF_KERNPROC(chain->bp);
705 ccms_thread_lock_downgrade(&core->cst, ostate);
/*
 * NOTE(review): elided extract -- declarations (dbp, psize, pbase,
 * boff, peof, bdata), the resolved-data early-return branch, and
 * several cbinfo field assignments (boff/func/arg, presumably -- TODO
 * confirm) are missing.  Per the surviving text: runs the callback
 * synchronously when data is already resolved or the chain is INITIAL
 * (zero-fill via getblk), otherwise packages a cbinfo and issues
 * cluster_readcb() with hammer2_chain_load_async_callback.
 */
710 * Asynchronously read the device buffer (dbp) and execute the specified
711 * callback. The caller should pass-in a locked chain (shared lock is ok).
712 * The function is responsible for unlocking the chain and for disposing
715 * NOTE! A NULL dbp (but non-NULL data) will be passed to the function
716 * if the dbp is integrated into the chain, because we do not want
717 * the caller to dispose of dbp in that situation.
719 static void hammer2_chain_load_async_callback(struct bio *bio);
722 hammer2_chain_load_async(hammer2_chain_t *chain,
723 void (*func)(hammer2_chain_t *, struct buf *, char *, void *),
726 hammer2_cbinfo_t *cbinfo;
727 hammer2_mount_t *hmp;
728 hammer2_blockref_t *bref;
738 func(chain, NULL, (char *)chain->data, arg);
743 * We must resolve to a device buffer, either by issuing I/O or
744 * by creating a zero-fill element. We do not mark the buffer
745 * dirty when creating a zero-fill element (the hammer2_chain_modify()
746 * API must still be used to do that).
748 * The device buffer is variable-sized in powers of 2 down
749 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage
750 * chunk always contains buffers of the same size. (XXX)
752 * The minimum physical IO size may be larger than the variable
757 psize = hammer2_devblksize(chain->bytes);
758 pmask = (hammer2_off_t)psize - 1;
759 pbase = bref->data_off & ~pmask;
760 boff = bref->data_off & (HAMMER2_OFF_MASK & pmask);
761 KKASSERT(pbase != 0);
762 peof = (pbase + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64;
767 * The getblk() optimization can only be used on newly created
768 * elements if the physical block size matches the request.
770 if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
771 chain->bytes == psize) {
772 dbp = getblk(hmp->devvp, pbase, psize, 0, 0);
773 /*atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);*/
774 bdata = (char *)dbp->b_data + boff;
775 bzero(bdata, chain->bytes);
776 /*atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);*/
777 func(chain, dbp, bdata, arg);
781 adjreadcounter(&chain->bref, chain->bytes);
782 cbinfo = kmalloc(sizeof(*cbinfo), hmp->mchain, M_INTWAIT | M_ZERO);
783 cbinfo->chain = chain;
788 cluster_readcb(hmp->devvp, peof, pbase, psize,
789 HAMMER2_PBUFSIZE*4, HAMMER2_PBUFSIZE*4,
790 hammer2_chain_load_async_callback, cbinfo);
/*
 * NOTE(review): elided extract -- `static void`, braces, the dbp/data
 * declarations, and the bpdone() call referenced by the comment are
 * missing.  Per the surviving text: completes the bio by hand
 * (clearing BIO_DONE|BIO_SYNC), extracts the cbinfo, invokes the
 * user callback with the device buffer, then frees the cbinfo.
 */
794 hammer2_chain_load_async_callback(struct bio *bio)
796 hammer2_cbinfo_t *cbinfo;
797 hammer2_mount_t *hmp;
802 * Nobody is waiting for bio/dbp to complete, we are
803 * responsible for handling the biowait() equivalent
804 * on dbp which means clearing BIO_DONE and BIO_SYNC
805 * and calling bpdone() if it hasn't already been called
806 * to restore any covered holes in the buffer's backing
810 if ((bio->bio_flags & BIO_DONE) == 0)
812 bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
815 * Extract the auxiliary info and issue the callback.
816 * Finish up with the dbp after it returns.
818 cbinfo = bio->bio_caller_info1.ptr;
819 /*ccms_thread_lock_setown(cbinfo->chain->core);*/
/* hmp is cached before the callback because cbinfo->chain is stale after */
820 data = dbp->b_data + cbinfo->boff;
821 hmp = cbinfo->chain->hmp;
823 cbinfo = bio->bio_caller_info1.ptr;
824 cbinfo->func(cbinfo->chain, dbp, data, cbinfo->arg);
825 /* cbinfo->chain is stale now */
827 kfree(cbinfo, hmp->mchain);
/*
 * NOTE(review): elided extract -- the retry loop head, `counterp`/
 * `ostate`/`lockcnt` declarations, break statements, returns, and the
 * final bp-disposal calls (bdwrite/bqrelse or similar -- TODO confirm)
 * are missing.  Per the surviving text: fast-path CAS decrement of
 * lockcnt; on the 1->0 transition, upgrade the core lock, account I/O
 * statistics by bref type, and retire or release the device buffer.
 */
831 * Unlock and deref a chain element.
833 * On the last lock release any non-embedded data (chain->bp) will be
837 hammer2_chain_unlock(hammer2_chain_t *chain)
839 hammer2_chain_core_t *core = chain->core;
845 * The core->cst lock can be shared across several chains so we
846 * need to track the per-chain lockcnt separately.
848 * If multiple locks are present (or being attempted) on this
849 * particular chain we can just unlock, drop refs, and return.
851 * Otherwise fall-through on the 1->0 transition.
854 lockcnt = chain->lockcnt;
855 KKASSERT(lockcnt > 0);
858 if (atomic_cmpset_int(&chain->lockcnt,
859 lockcnt, lockcnt - 1)) {
860 ccms_thread_unlock(&core->cst);
861 hammer2_chain_drop(chain);
865 if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
872 * On the 1->0 transition we upgrade the core lock (if necessary)
873 * to exclusive for terminal processing. If after upgrading we find
874 * that lockcnt is non-zero, another thread is racing us and will
875 * handle the unload for us later on, so just cleanup and return
876 * leaving the data/bp intact
878 * Otherwise if lockcnt is still 0 it is possible for it to become
879 * non-zero and race, but since we hold the core->cst lock
880 * exclusively all that will happen is that the chain will be
881 * reloaded after we unload it.
883 ostate = ccms_thread_lock_upgrade(&core->cst);
884 if (chain->lockcnt) {
885 ccms_thread_unlock_upgraded(&core->cst, ostate);
886 hammer2_chain_drop(chain);
891 * Shortcut the case if the data is embedded or not resolved.
893 * Do NOT NULL out chain->data (e.g. inode data), it might be
896 * The DIRTYBP flag is non-applicable in this situation and can
897 * be cleared to keep the flags state clean.
899 if (chain->bp == NULL) {
900 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
901 ccms_thread_unlock_upgraded(&core->cst, ostate);
902 hammer2_chain_drop(chain);
909 if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
911 } else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
/* accumulate asynchronous (IOFLUSH) write statistics by bref type */
912 switch(chain->bref.type) {
913 case HAMMER2_BREF_TYPE_DATA:
914 counterp = &hammer2_ioa_file_write;
916 case HAMMER2_BREF_TYPE_INODE:
917 counterp = &hammer2_ioa_meta_write;
919 case HAMMER2_BREF_TYPE_INDIRECT:
920 counterp = &hammer2_ioa_indr_write;
922 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
923 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
924 counterp = &hammer2_ioa_fmap_write;
927 counterp = &hammer2_ioa_volu_write;
930 *counterp += chain->bytes;
/* accumulate ordinary deferred-write statistics by bref type */
932 switch(chain->bref.type) {
933 case HAMMER2_BREF_TYPE_DATA:
934 counterp = &hammer2_iod_file_write;
936 case HAMMER2_BREF_TYPE_INODE:
937 counterp = &hammer2_iod_meta_write;
939 case HAMMER2_BREF_TYPE_INDIRECT:
940 counterp = &hammer2_iod_indr_write;
942 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
943 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
944 counterp = &hammer2_iod_fmap_write;
947 counterp = &hammer2_iod_volu_write;
950 *counterp += chain->bytes;
956 * If a device buffer was used for data be sure to destroy the
957 * buffer when we are done to avoid aliases (XXX what about the
958 * underlying VM pages?).
960 * NOTE: Freemap leaf's use reserved blocks and thus no aliasing
965 * XXX our primary cache is now the block device, not
966 * the logical file. don't release the buffer.
968 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
969 chain->bp->b_flags |= B_RELBUF;
973 * The DIRTYBP flag tracks whether we have to bdwrite() the buffer
974 * or not. The flag will get re-set when chain_modify() is called,
975 * even if MODIFIED is already set, allowing the OS to retire the
976 * buffer independent of a hammer2 flush.
979 if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
980 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
981 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
982 atomic_clear_int(&chain->flags,
983 HAMMER2_CHAIN_IOFLUSH);
984 chain->bp->b_flags |= B_RELBUF;
985 cluster_awrite(chain->bp);
987 chain->bp->b_flags |= B_CLUSTEROK;
991 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
992 atomic_clear_int(&chain->flags,
993 HAMMER2_CHAIN_IOFLUSH);
994 chain->bp->b_flags |= B_RELBUF;
997 /* bp might still be dirty */
1002 ccms_thread_unlock_upgraded(&core->cst, ostate);
1003 hammer2_chain_drop(chain);
/*
 * NOTE(review): elided extract -- the return type, braces, `struct buf
 * *bp` parameter line, obytes/nbytes/bbytes/boff declarations, and the
 * early return for obytes==nbytes are missing.  Per the surviving
 * text: delete-duplicates the chain, marks it MODIFIED/MOVED, and
 * relocates its storage via hammer2_freemap_alloc for the new radix.
 */
1007 * Resize the chain's physical storage allocation in-place. This may
1008 * replace the passed-in chain with a new chain.
1010 * Chains can be resized smaller without reallocating the storage.
1011 * Resizing larger will reallocate the storage.
1013 * Must be passed an exclusively locked parent and chain, returns a new
1014 * exclusively locked chain at the same index and unlocks the old chain.
1015 * Flushes the buffer if necessary.
1017 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
1018 * to avoid instantiating a device buffer that conflicts with the vnode
1019 * data buffer. That is, the passed-in bp is a logical buffer, whereas
1020 * any chain-oriented bp would be a device buffer.
1022 * XXX flags currently ignored, uses chain->bp to detect data/no-data.
1023 * XXX return error if cannot resize.
1026 hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
1028 hammer2_chain_t *parent, hammer2_chain_t **chainp,
1029 int nradix, int flags)
1031 hammer2_mount_t *hmp;
1032 hammer2_chain_t *chain;
1033 hammer2_off_t pbase;
1043 * Only data and indirect blocks can be resized for now.
1044 * (The volu root, inodes, and freemap elements use a fixed size).
1046 KKASSERT(chain != &hmp->vchain);
1047 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
1048 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
1051 * Nothing to do if the element is already the proper size
1053 obytes = chain->bytes;
1054 nbytes = 1U << nradix;
1055 if (obytes == nbytes)
1059 * Delete the old chain and duplicate it at the same (parent, index),
1060 * returning a new chain. This allows the old chain to still be
1061 * used by the flush code. Duplication occurs in-place.
1063 * The parent does not have to be locked for the delete/duplicate call,
1064 * but is in this particular code path.
1066 * NOTE: If we are not crossing a synchronization point the
1067 * duplication code will simply reuse the existing chain
1070 hammer2_chain_delete_duplicate(trans, &chain, 0);
1073 * Set MODIFIED and add a chain ref to prevent destruction. Both
1074 * modified flags share the same ref. (duplicated chains do not
1075 * start out MODIFIED unless possibly if the duplication code
1076 * decided to reuse the existing chain as-is).
1078 * If the chain is already marked MODIFIED then we can safely
1079 * return the previous allocation to the pool without having to
1080 * worry about snapshots. XXX check flush synchronization.
1082 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
1083 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
1084 hammer2_chain_ref(chain);
1088 * Relocate the block, even if making it smaller (because different
1089 * block sizes may be in different regions).
1091 hammer2_freemap_alloc(trans, chain->hmp, &chain->bref, nbytes);
1092 chain->bytes = nbytes;
1093 /*ip->delta_dcount += (ssize_t)(nbytes - obytes);*/ /* XXX atomic */
1096 * The device buffer may be larger than the allocation size.
1098 if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
1099 bbytes = HAMMER2_MINIOSIZE;
1100 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
1101 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
1104 * For now just support it on DATA chains (and not on indirect
1107 KKASSERT(chain->bp == NULL);
1110 * Make sure the chain is marked MOVED and SUBMOD is set in the
1111 * parent(s) so the adjustments are picked up by flush.
1113 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1114 hammer2_chain_ref(chain);
1115 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1117 hammer2_chain_setsubmod(trans, chain);
1122 * Set a chain modified, making it read-write and duplicating it if necessary.
1123 * This function will assign a new physical block to the chain if necessary
1125 * Duplication of already-modified chains is possible when the modification
1126 * crosses a flush synchronization boundary.
1128 * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
1129 * level or the COW operation will not work.
1131 * Data blocks - The chain is usually locked RESOLVE_NEVER so as not to
1132 * run the data through the device buffers.
1134 * This function may return a different chain than was passed, in which case
1135 * the old chain will be unlocked and the new chain will be locked.
1137 * ip->chain may be adjusted by hammer2_chain_modify_ip().
1139 hammer2_inode_data_t *
1140 hammer2_chain_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
1141 hammer2_chain_t **chainp, int flags)
1143 atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
1144 hammer2_chain_modify(trans, chainp, flags);
1145 if (ip->chain != *chainp)
1146 hammer2_inode_repoint(ip, NULL, *chainp);
1147 return(&ip->chain->data->ipdata);
/*
 * Place *chainp in a modified state: allocate new media storage (COW),
 * stamp transaction ids, and set MODIFIED/DIRTYBP as appropriate.
 * See the block comment above for the full contract.
 */
1151 hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
1154 hammer2_mount_t *hmp;
1155 hammer2_chain_t *chain;
1156 hammer2_off_t pbase;
1157 hammer2_off_t pmask;
1159 hammer2_tid_t flush_tid;
1168 * Data must be resolved if already assigned unless explicitly
1169 * flagged otherwise.
1173 if (chain->data == NULL && (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
1174 (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
1175 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1176 hammer2_chain_unlock(chain);
1180 * data is not optional for freemap chains (we must always be sure
1181 * to copy the data on COW storage allocations).
1183 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
1184 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
1185 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
1186 (flags & HAMMER2_MODIFY_OPTDATA) == 0);
1190 * If the chain is already marked MODIFIED we can usually just
1191 * return. However, if a modified chain is modified again in
1192 * a synchronization-point-crossing manner we have to issue a
1193 * delete/duplicate on the chain to avoid flush interference.
1195 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
1197 * Which flush_tid do we need to check? If the chain is
1198 * related to the freemap we have to use the freemap flush
1199 * tid (free_flush_tid), otherwise we use the normal filesystem
1200 * flush tid (topo_flush_tid). The two flush domains are
1201 * almost completely independent of each other.
1203 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
1204 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
/*
 * NOTE(review): both branches currently assign topo_flush_tid; the
 * free_flush_tid described in the comment above is not yet consulted
 * (matches the XXX markers below).
 */
1205 flush_tid = hmp->topo_flush_tid; /* XXX */
1206 goto skipxx; /* XXX */
1208 flush_tid = hmp->topo_flush_tid;
1214 if (chain->modify_tid <= flush_tid &&
1215 trans->sync_tid > flush_tid) {
1217 * Modifications cross synchronization point,
1218 * requires delete-duplicate.
1220 KKASSERT((flags & HAMMER2_MODIFY_ASSERTNOCOPY) == 0);
1221 hammer2_chain_delete_duplicate(trans, chainp, 0);
1223 /* fall through using duplicate */
1227 * Quick return path, set DIRTYBP to ensure that
1228 * the later retirement of bp will write it out.
1230 * quick return path also needs the modify_tid
1234 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
1235 if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
1236 chain->bref.modify_tid = trans->sync_tid;
1237 chain->modify_tid = trans->sync_tid;
1242 * modify_tid is only updated for primary modifications, not for
1243 * propagated brefs. mirror_tid will be updated regardless during
1244 * the flush, no need to set it here.
1246 if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
1247 chain->bref.modify_tid = trans->sync_tid;
1250 * Set MODIFIED and add a chain ref to prevent destruction. Both
1251 * modified flags share the same ref.
1253 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
1254 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
1255 hammer2_chain_ref(chain);
1259 * Adjust chain->modify_tid so the flusher knows when the
1260 * modification occurred.
1262 chain->modify_tid = trans->sync_tid;
1265 * The modification or re-modification requires an allocation and
1268 * We normally always allocate new storage here. If storage exists
1269 * and MODIFY_NOREALLOC is passed in, we do not allocate new storage.
1271 if (chain != &hmp->vchain &&
1272 chain != &hmp->fchain &&
1273 ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
1274 (flags & HAMMER2_MODIFY_NOREALLOC) == 0)
1276 hammer2_freemap_alloc(trans, chain->hmp,
1277 &chain->bref, chain->bytes);
1278 /* XXX failed allocation */
1282 * Do not COW if OPTDATA is set. INITIAL flag remains unchanged.
1283 * (OPTDATA does not prevent [re]allocation of storage, only the
1284 * related copy-on-write op).
1286 if (flags & HAMMER2_MODIFY_OPTDATA)
1290 * Clearing the INITIAL flag (for indirect blocks) indicates that
1291 * we've processed the uninitialized storage allocation.
1293 * If this flag is already clear we are likely in a copy-on-write
1294 * situation but we have to be sure NOT to bzero the storage if
1295 * no data is present.
1297 if (chain->flags & HAMMER2_CHAIN_INITIAL) {
1298 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1306 * We currently should never instantiate a device buffer for a
1307 * file data chain. (We definitely can for a freemap chain).
1309 * XXX we can now do this
1311 KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
1315 * Instantiate data buffer and possibly execute COW operation
1317 switch(chain->bref.type) {
1318 case HAMMER2_BREF_TYPE_VOLUME:
1319 case HAMMER2_BREF_TYPE_FREEMAP:
1320 case HAMMER2_BREF_TYPE_INODE:
1321 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1323 * The data is embedded, no copy-on-write operation is
1326 KKASSERT(chain->bp == NULL);
1328 case HAMMER2_BREF_TYPE_DATA:
1329 case HAMMER2_BREF_TYPE_INDIRECT:
1330 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1332 * Perform the copy-on-write operation
1334 KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);
/* Compute the device-block base/offset covering the chain's media data */
1336 psize = hammer2_devblksize(chain->bytes);
1337 pmask = (hammer2_off_t)psize - 1;
1338 pbase = chain->bref.data_off & ~pmask;
1339 boff = chain->bref.data_off & (HAMMER2_OFF_MASK & pmask);
1340 KKASSERT(pbase != 0);
1341 peof = (pbase + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64;
1344 * The getblk() optimization can only be used if the
1345 * chain element size matches the physical block size.
1347 if (chain->bp && chain->bp->b_loffset == pbase) {
1350 } else if (chain->bytes == psize) {
1351 nbp = getblk(hmp->devvp, pbase, psize, 0, 0);
1353 } else if (hammer2_isclusterable(chain)) {
1354 error = cluster_read(hmp->devvp, peof, pbase, psize,
1355 psize, HAMMER2_PBUFSIZE*4,
1357 adjreadcounter(&chain->bref, chain->bytes);
1359 error = bread(hmp->devvp, pbase, psize, &nbp);
1360 adjreadcounter(&chain->bref, chain->bytes);
/* NOTE(review): read errors are only asserted here, not handled */
1362 KKASSERT(error == 0);
1363 bdata = (char *)nbp->b_data + boff;
1366 * Copy or zero-fill on write depending on whether
1367 * chain->data exists or not. Retire the existing bp
1368 * based on the DIRTYBP flag. Set the DIRTYBP flag to
1369 * indicate that retirement of nbp should use bdwrite().
1372 KKASSERT(chain->bp != NULL);
1373 if (chain->data != bdata) {
1374 bcopy(chain->data, bdata, chain->bytes);
1376 } else if (wasinitial) {
1377 bzero(bdata, chain->bytes);
1380 * We have a problem. We were asked to COW but
1381 * we don't have any data to COW with!
1383 panic("hammer2_chain_modify: having a COW %p\n",
1386 if (chain->bp != nbp) {
1388 if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
1389 chain->bp->b_flags |= B_CLUSTEROK;
1392 chain->bp->b_flags |= B_RELBUF;
1397 BUF_KERNPROC(chain->bp);
1399 chain->data = bdata;
1400 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
1403 panic("hammer2_chain_modify: illegal non-embedded type %d",
1409 hammer2_chain_setsubmod(trans, chain);
1413 * Mark the volume as having been modified. This short-cut version
1414 * does not have to lock the volume's chain, which allows the ioctl
1415 * code to make adjustments to connections without deadlocking. XXX
1417 * No ref is made on vchain when flagging it MODIFIED.
1420 hammer2_modify_volume(hammer2_mount_t *hmp)
1422 hammer2_voldata_lock(hmp);
/* the '1' arg presumably flags the voldata modified on unlock — matches
 * the comment above; confirm against hammer2_voldata_unlock() */
1423 hammer2_voldata_unlock(hmp, 1);
1427 * Locate an in-memory chain. The parent must be locked. The in-memory
1428 * chain is returned with a reference and without a lock, or NULL
1431 * This function returns the chain at the specified index with the highest
1432 * delete_tid. The caller must check whether the chain is flagged
1433 * CHAIN_DELETED or not. However, because chain iterations can be removed
1434 * from memory we must ALSO check that DELETED chains are not flushed. A
1435 * DELETED chain which has been flushed must be ignored (the caller must
1436 * check the parent's blockref array).
1438 * NOTE: If no chain is found the caller usually must check the on-media
1439 * array to determine if a blockref exists at the index.
/* Scratch state for the RB_SCAN in hammer2_chain_find_locked(). */
1441 struct hammer2_chain_find_info {
/* best: candidate with the highest delete_tid seen so far */
1442 hammer2_chain_t *best;
/* delete_tid: highest delete_tid encountered during the scan */
1443 hammer2_tid_t delete_tid;
/*
 * RB_SCAN comparator: matches children against the target index stored
 * in the hammer2_chain_find_info passed via 'data'.
 */
1449 hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
1451 struct hammer2_chain_find_info *info = data;
1453 if (child->index < info->index)
1455 if (child->index > info->index)
/*
 * RB_SCAN callback: remembers the matching child with the highest
 * delete_tid (per the contract documented above the struct).
 */
1462 hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
1464 struct hammer2_chain_find_info *info = data;
1466 if (info->delete_tid < child->delete_tid) {
1467 info->delete_tid = child->delete_tid;
/*
 * Locate the in-memory chain at (parent, index) with the highest
 * delete_tid. Caller must hold the parent's core spinlock.
 */
1475 hammer2_chain_find_locked(hammer2_chain_t *parent, int index)
1477 struct hammer2_chain_find_info info;
1478 hammer2_chain_t *child;
1481 info.delete_tid = 0;
1484 RB_SCAN(hammer2_chain_tree, &parent->core->rbtree,
1485 hammer2_chain_find_cmp, hammer2_chain_find_callback,
/*
 * Spinlocked wrapper around hammer2_chain_find_locked(): returns the
 * child referenced (but not locked), or NULL if none is in memory.
 */
1493 hammer2_chain_find(hammer2_chain_t *parent, int index)
1495 hammer2_chain_t *child;
1497 spin_lock(&parent->core->cst.spin);
1498 child = hammer2_chain_find_locked(parent, index);
1500 hammer2_chain_ref(child);
1501 spin_unlock(&parent->core->cst.spin);
1507 * Return a locked chain structure with all associated data acquired.
1508 * (if LOOKUP_NOLOCK is requested the returned chain is only referenced).
1510 * Caller must hold the parent locked shared or exclusive since we may
1511 * need the parent's bref array to find our block.
1513 * The returned child is locked as requested. If NOLOCK, the returned
1514 * child is still at least referenced.
1517 hammer2_chain_get(hammer2_chain_t *parent, int index, int flags)
1519 hammer2_blockref_t *bref;
1520 hammer2_mount_t *hmp = parent->hmp;
1521 hammer2_chain_core_t *above = parent->core;
1522 hammer2_chain_t *chain;
1523 hammer2_chain_t dummy;
1527 * Figure out how to lock. MAYBE can be used to optimize
1528 * the initial-create state for indirect blocks.
1530 if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
1531 how = HAMMER2_RESOLVE_NEVER;
1533 how = HAMMER2_RESOLVE_MAYBE;
1534 if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
1535 how |= HAMMER2_RESOLVE_SHARED;
1539 * First see if we have a (possibly modified) chain element cached
1540 * for this (parent, index). Acquire the data if necessary.
1542 * If chain->data is non-NULL the chain should already be marked
/* dummy key: MAX_TID biases RB_FIND toward the highest-delete_tid entry */
1546 dummy.index = index;
1547 dummy.delete_tid = HAMMER2_MAX_TID;
1548 spin_lock(&above->cst.spin);
1549 chain = RB_FIND(hammer2_chain_tree, &above->rbtree, &dummy);
1551 hammer2_chain_ref(chain);
1552 spin_unlock(&above->cst.spin);
1553 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1554 hammer2_chain_lock(chain, how | HAMMER2_RESOLVE_NOREF);
1557 spin_unlock(&above->cst.spin);
1560 * The parent chain must not be in the INITIAL state.
1562 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1563 panic("hammer2_chain_get: Missing bref(1)");
1568 * No RBTREE entry found, lookup the bref and issue I/O (switch on
1569 * the parent's bref to determine where and how big the array is).
1571 switch(parent->bref.type) {
1572 case HAMMER2_BREF_TYPE_INODE:
1573 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1574 bref = &parent->data->ipdata.u.blockset.blockref[index];
1576 case HAMMER2_BREF_TYPE_INDIRECT:
1577 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1578 KKASSERT(parent->data != NULL);
1579 KKASSERT(index >= 0 &&
1580 index < parent->bytes / sizeof(hammer2_blockref_t));
1581 bref = &parent->data->npdata[index];
1583 case HAMMER2_BREF_TYPE_VOLUME:
1584 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1585 bref = &hmp->voldata.sroot_blockset.blockref[index];
1587 case HAMMER2_BREF_TYPE_FREEMAP:
1588 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1589 bref = &hmp->voldata.freemap_blockset.blockref[index];
1593 panic("hammer2_chain_get: unrecognized blockref type: %d",
1596 if (bref->type == 0) {
1597 panic("hammer2_chain_get: Missing bref(2)");
1602 * Allocate a chain structure representing the existing media
1603 * entry. Resulting chain has one ref and is not locked.
1605 * The locking operation we do later will issue I/O to read it.
1607 chain = hammer2_chain_alloc(hmp, NULL, bref);
1608 hammer2_chain_core_alloc(chain, NULL); /* ref'd chain returned */
1611 * Link the chain into its parent. A spinlock is required to safely
1612 * access the RBTREE, and it is possible to collide with another
1613 * hammer2_chain_get() operation because the caller might only hold
1614 * a shared lock on the parent.
1616 KKASSERT(parent->refs > 0);
1617 spin_lock(&above->cst.spin);
1618 chain->above = above;
1619 chain->index = index;
/* insertion collision: another racing chain_get() won; drop ours */
1620 if (RB_INSERT(hammer2_chain_tree, &above->rbtree, chain)) {
1621 chain->above = NULL;
1623 spin_unlock(&above->cst.spin);
1624 hammer2_chain_drop(chain);
1627 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
1628 spin_unlock(&above->cst.spin);
1631 * Our new chain is referenced but NOT locked. Lock the chain
1632 * below. The locking operation also resolves its data.
1634 * If NOLOCK is set the release will release the one-and-only lock.
1636 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1637 hammer2_chain_lock(chain, how); /* recursive lock */
1638 hammer2_chain_drop(chain); /* excess ref */
1644 * Lookup initialization/completion API
/*
 * Lock the lookup parent, resolving its data (RESOLVE_ALWAYS); shared
 * lock when the caller passed LOOKUP_SHARED, exclusive otherwise.
 */
1647 hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
1649 if (flags & HAMMER2_LOOKUP_SHARED) {
1650 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
1651 HAMMER2_RESOLVE_SHARED);
1653 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
/* Release the lock acquired by hammer2_chain_lookup_init(). */
1659 hammer2_chain_lookup_done(hammer2_chain_t *parent)
1662 hammer2_chain_unlock(parent);
/*
 * Replace *parentp with its own parent: walk the core's first_parent
 * list (skipping entries for which hammer2_chain_refactor_test() says a
 * refactor is in progress), take a ref under the core spinlock, then
 * hand the lock over from the old parent to the new one.
 */
1667 hammer2_chain_getparent(hammer2_chain_t **parentp, int how)
1669 hammer2_chain_t *oparent;
1670 hammer2_chain_t *nparent;
1671 hammer2_chain_core_t *above;
1674 above = oparent->above;
1676 spin_lock(&above->cst.spin);
1677 nparent = above->first_parent;
1678 while (hammer2_chain_refactor_test(nparent, 1))
1679 nparent = nparent->next_parent;
1680 hammer2_chain_ref(nparent); /* protect nparent, use in lock */
1681 spin_unlock(&above->cst.spin);
/* unlock old parent only after nparent is ref'd so it cannot go away */
1683 hammer2_chain_unlock(oparent);
1684 hammer2_chain_lock(nparent, how | HAMMER2_RESOLVE_NOREF);
1691 * Locate any key between key_beg and key_end inclusive. (*parentp)
1692 * typically points to an inode but can also point to a related indirect
1693 * block and this function will recurse upwards and find the inode again.
1695 * WARNING! THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER! ANY KEY
1696 * WITHIN THE RANGE CAN BE RETURNED. HOWEVER, AN ITERATION
1697 * WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN
1698 * AND ALL IN-RANGE KEYS WILL EVENTUALLY BE RETURNED (NOT
1699 * NECESSARILY IN ORDER).
1701 * (*parentp) must be exclusively locked and referenced and can be an inode
1702 * or an existing indirect block within the inode.
1704 * On return (*parentp) will be modified to point at the deepest parent chain
1705 * element encountered during the search, as a helper for an insertion or
1706 * deletion. The new (*parentp) will be locked and referenced and the old
1707 * will be unlocked and dereferenced (no change if they are both the same).
1709 * The matching chain will be returned exclusively locked. If NOLOCK is
1710 * requested the chain will be returned only referenced.
1712 * NULL is returned if no match was found, but (*parentp) will still
1713 * potentially be adjusted.
1715 * This function will also recurse up the chain if the key is not within the
1716 * current parent's range. (*parentp) can never be set to NULL. An iteration
1717 * can simply allow (*parentp) to float inside the loop.
1719 * NOTE! chain->data is not always resolved. By default it will not be
1720 * resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF. Use
1721 * HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/
1722 * BREF_TYPE_DATA as the device buffer can alias the logical file
1726 hammer2_chain_lookup(hammer2_chain_t **parentp,
1727 hammer2_key_t key_beg, hammer2_key_t key_end,
1730 hammer2_mount_t *hmp;
1731 hammer2_chain_t *parent;
1732 hammer2_chain_t *chain;
1733 hammer2_chain_t *tmp;
1734 hammer2_blockref_t *base;
1735 hammer2_blockref_t *bref;
1736 hammer2_key_t scan_beg;
1737 hammer2_key_t scan_end;
1740 int how_always = HAMMER2_RESOLVE_ALWAYS;
1741 int how_maybe = HAMMER2_RESOLVE_MAYBE;
1743 if (flags & HAMMER2_LOOKUP_ALWAYS)
1744 how_maybe = how_always;
1746 if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) {
1747 how_maybe |= HAMMER2_RESOLVE_SHARED;
1748 how_always |= HAMMER2_RESOLVE_SHARED;
1752 * Recurse (*parentp) upward if necessary until the parent completely
1753 * encloses the key range or we hit the inode.
1758 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
1759 parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1760 scan_beg = parent->bref.key;
1761 scan_end = scan_beg +
1762 ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1763 if (key_beg >= scan_beg && key_end <= scan_end)
1765 parent = hammer2_chain_getparent(parentp, how_maybe);
1770 * Locate the blockref array. Currently we do a fully associative
1771 * search through the array.
1773 switch(parent->bref.type) {
1774 case HAMMER2_BREF_TYPE_INODE:
1776 * Special shortcut for embedded data returns the inode
1777 * itself. Callers must detect this condition and access
1778 * the embedded data (the strategy code does this for us).
1780 * This is only applicable to regular files and softlinks.
1782 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1783 if (flags & HAMMER2_LOOKUP_NOLOCK)
1784 hammer2_chain_ref(parent);
1786 hammer2_chain_lock(parent, how_always);
1789 base = &parent->data->ipdata.u.blockset.blockref[0];
1790 count = HAMMER2_SET_COUNT;
1792 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1793 case HAMMER2_BREF_TYPE_INDIRECT:
1795 * Handle MATCHIND on the parent
1797 if (flags & HAMMER2_LOOKUP_MATCHIND) {
1798 scan_beg = parent->bref.key;
1799 scan_end = scan_beg +
1800 ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1801 if (key_beg == scan_beg && key_end == scan_end) {
1803 hammer2_chain_lock(chain, how_maybe);
1808 * Optimize indirect blocks in the INITIAL state to avoid
1811 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1814 if (parent->data == NULL)
1815 panic("parent->data is NULL");
1816 base = &parent->data->npdata[0];
1818 count = parent->bytes / sizeof(hammer2_blockref_t);
1820 case HAMMER2_BREF_TYPE_VOLUME:
1821 base = &hmp->voldata.sroot_blockset.blockref[0];
1822 count = HAMMER2_SET_COUNT;
1824 case HAMMER2_BREF_TYPE_FREEMAP:
1825 base = &hmp->voldata.freemap_blockset.blockref[0];
1826 count = HAMMER2_SET_COUNT;
1829 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
1831 base = NULL; /* safety */
1832 count = 0; /* safety */
1836 * If the element and key overlap we use the element.
1838 * NOTE! Deleted elements are effectively invisible. Deletions
1839 * proactively clear the parent bref to the deleted child
1840 * so we do not try to shadow here to avoid parent updates
1841 * (which would be difficult since multiple deleted elements
1842 * might represent different flush synchronization points).
1845 scan_beg = 0; /* avoid compiler warning */
1846 scan_end = 0; /* avoid compiler warning */
/*
 * Fully associative scan: combine in-memory children (chain_find)
 * with the on-media blockref array (base[]).
 */
1848 for (i = 0; i < count; ++i) {
1849 tmp = hammer2_chain_find(parent, i);
1851 if (tmp->flags & HAMMER2_CHAIN_DELETED) {
1852 hammer2_chain_drop(tmp);
1856 KKASSERT(bref->type != 0);
1857 } else if (base == NULL || base[i].type == 0) {
1862 scan_beg = bref->key;
1863 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1865 hammer2_chain_drop(tmp);
1866 if (key_beg <= scan_end && key_end >= scan_beg)
1870 if (key_beg == key_end)
1872 return (hammer2_chain_next(parentp, NULL,
1873 key_beg, key_end, flags));
1877 * Acquire the new chain element. If the chain element is an
1878 * indirect block we must search recursively.
1880 * It is possible for the tmp chain above to be removed from
1881 * the RBTREE but the parent lock ensures it would not have been
1882 * destroyed from the media, so the chain_get() code will simply
1883 * reload it from the media in that case.
1885 chain = hammer2_chain_get(parent, i, flags);
1890 * If the chain element is an indirect block it becomes the new
1891 * parent and we loop on it.
1893 * The parent always has to be locked with at least RESOLVE_MAYBE
1894 * so we can access its data. It might need a fixup if the caller
1895 * passed incompatible flags. Be careful not to cause a deadlock
1896 * as a data-load requires an exclusive lock.
1898 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
1899 * range is within the requested key range we return the indirect
1900 * block and do NOT loop. This is usually only used to acquire
1903 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
1904 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1905 hammer2_chain_unlock(parent);
1906 *parentp = parent = chain;
1907 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1908 hammer2_chain_lock(chain,
1910 HAMMER2_RESOLVE_NOREF);
1911 } else if ((flags & HAMMER2_LOOKUP_NODATA) &&
1912 chain->data == NULL) {
1913 hammer2_chain_ref(chain);
1914 hammer2_chain_unlock(chain);
1915 hammer2_chain_lock(chain,
1917 HAMMER2_RESOLVE_NOREF);
1923 * All done, return the chain
1929 * After having issued a lookup we can iterate all matching keys.
1931 * If chain is non-NULL we continue the iteration from just after its index.
1933 * If chain is NULL we assume the parent was exhausted and continue the
1934 * iteration at the next parent.
1936 * parent must be locked on entry and remains locked throughout. chain's
1937 * lock status must match flags. Chain is always at least referenced.
1939 * WARNING! The MATCHIND flag does not apply to this function.
1942 hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
1943 hammer2_key_t key_beg, hammer2_key_t key_end,
1946 hammer2_mount_t *hmp;
1947 hammer2_chain_t *parent;
1948 hammer2_chain_t *tmp;
1949 hammer2_blockref_t *base;
1950 hammer2_blockref_t *bref;
1951 hammer2_key_t scan_beg;
1952 hammer2_key_t scan_end;
1954 int how_maybe = HAMMER2_RESOLVE_MAYBE;
1957 if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
1958 how_maybe |= HAMMER2_RESOLVE_SHARED;
1965 * Calculate the next index and recalculate the parent if necessary.
1969 * Continue iteration within current parent. If not NULL
1970 * the passed-in chain may or may not be locked, based on
1971 * the LOOKUP_NOLOCK flag (passed in as returned from lookup
1974 i = chain->index + 1;
1975 if (flags & HAMMER2_LOOKUP_NOLOCK)
1976 hammer2_chain_drop(chain);
1978 hammer2_chain_unlock(chain);
1981 * Any scan where the lookup returned degenerate data embedded
1982 * in the inode has an invalid index and must terminate.
1984 if (chain == parent)
1987 } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
1988 parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1990 * We reached the end of the iteration.
1995 * Continue iteration with next parent unless the current
1996 * parent covers the range.
1998 scan_beg = parent->bref.key;
1999 scan_end = scan_beg +
2000 ((hammer2_key_t)1 << parent->bref.keybits) - 1;
2001 if (key_beg >= scan_beg && key_end <= scan_end)
2004 i = parent->index + 1;
2005 parent = hammer2_chain_getparent(parentp, how_maybe);
2010 * Locate the blockref array. Currently we do a fully associative
2011 * search through the array.
2013 switch(parent->bref.type) {
2014 case HAMMER2_BREF_TYPE_INODE:
2015 base = &parent->data->ipdata.u.blockset.blockref[0];
2016 count = HAMMER2_SET_COUNT;
2018 case HAMMER2_BREF_TYPE_INDIRECT:
2019 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2020 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2023 KKASSERT(parent->data != NULL);
2024 base = &parent->data->npdata[0];
2026 count = parent->bytes / sizeof(hammer2_blockref_t);
2028 case HAMMER2_BREF_TYPE_VOLUME:
2029 base = &hmp->voldata.sroot_blockset.blockref[0];
2030 count = HAMMER2_SET_COUNT;
2032 case HAMMER2_BREF_TYPE_FREEMAP:
2033 base = &hmp->voldata.freemap_blockset.blockref[0];
2034 count = HAMMER2_SET_COUNT;
2037 panic("hammer2_chain_next: unrecognized blockref type: %d",
2039 base = NULL; /* safety */
2040 count = 0; /* safety */
2043 KKASSERT(i <= count);
2046 * Look for the key. If we are unable to find a match and an exact
2047 * match was requested we return NULL. If a range was requested we
2048 * run hammer2_chain_next() to iterate.
2050 * NOTE! Deleted elements are effectively invisible. Deletions
2051 * proactively clear the parent bref to the deleted child
2052 * so we do not try to shadow here to avoid parent updates
2053 * (which would be difficult since multiple deleted elements
2054 * might represent different flush synchronization points).
2057 scan_beg = 0; /* avoid compiler warning */
2058 scan_end = 0; /* avoid compiler warning */
/* associative scan: in-memory children first, then the media array */
2061 tmp = hammer2_chain_find(parent, i);
2063 if (tmp->flags & HAMMER2_CHAIN_DELETED) {
2064 hammer2_chain_drop(tmp);
2069 } else if (base == NULL || base[i].type == 0) {
2075 scan_beg = bref->key;
2076 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
2078 hammer2_chain_drop(tmp);
2079 if (key_beg <= scan_end && key_end >= scan_beg)
2085 * If we couldn't find a match recurse up a parent to continue the
2092 * Acquire the new chain element. If the chain element is an
2093 * indirect block we must search recursively.
2095 chain = hammer2_chain_get(parent, i, flags);
2100 * If the chain element is an indirect block it becomes the new
2101 * parent and we loop on it.
2103 * The parent always has to be locked with at least RESOLVE_MAYBE
2104 * so we can access its data. It might need a fixup if the caller
2105 * passed incompatible flags. Be careful not to cause a deadlock
2106 * as a data-load requires an exclusive lock.
2108 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
2109 * range is within the requested key range we return the indirect
2110 * block and do NOT loop. This is usually only used to acquire
2113 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
2114 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2115 if ((flags & HAMMER2_LOOKUP_MATCHIND) == 0 ||
2116 key_beg > scan_beg || key_end < scan_end) {
2117 hammer2_chain_unlock(parent);
2118 *parentp = parent = chain;
2120 if (flags & HAMMER2_LOOKUP_NOLOCK) {
2121 hammer2_chain_lock(parent,
2123 HAMMER2_RESOLVE_NOREF);
2124 } else if ((flags & HAMMER2_LOOKUP_NODATA) &&
2125 parent->data == NULL) {
2126 hammer2_chain_ref(parent);
2127 hammer2_chain_unlock(parent);
2128 hammer2_chain_lock(parent,
2130 HAMMER2_RESOLVE_NOREF);
2138 * All done, return chain
2144 * Create and return a new hammer2 system memory structure of the specified
2145 * key, type and size and insert it under (*parentp). This is a full
2146 * insertion, based on the supplied key/keybits, and may involve creating
2147 * indirect blocks and moving other chains around via delete/duplicate.
2149 * (*parentp) must be exclusively locked and may be replaced on return
2150 * depending on how much work the function had to do.
2152 * (*chainp) usually starts out NULL and returns the newly created chain,
2153 * but if the caller desires the caller may allocate a disconnected chain
2154 * and pass it in instead. (It is also possible for the caller to use
2155 * chain_duplicate() to create a disconnected chain, manipulate it, then
2156 * pass it into this function to insert it).
2158 * This function should NOT be used to insert INDIRECT blocks. It is
2159 * typically used to create/insert inodes and data blocks.
2161 * Caller must pass-in an exclusively locked parent the new chain is to
2162 * be inserted under, and optionally pass-in a disconnected, exclusively
2163 * locked chain to insert (else we create a new chain). The function will
2164 * adjust (*parentp) as necessary, create or connect the chain, and
2165 * return an exclusively locked chain in *chainp.
2168 hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
2169 hammer2_chain_t **chainp,
2170 hammer2_key_t key, int keybits, int type, size_t bytes)
2172 hammer2_mount_t *hmp;
2173 hammer2_chain_t *chain;
2174 hammer2_chain_t *child;
2175 hammer2_chain_t *parent = *parentp;
2176 hammer2_chain_core_t *above;
2177 hammer2_blockref_t dummy;
2178 hammer2_blockref_t *base;
2184 above = parent->core;
2185 KKASSERT(ccms_thread_lock_owned(&above->cst));
/* chain == NULL: caller wants us to allocate a fresh chain of 'type' */
2189 if (chain == NULL) {
2191 * First allocate media space and construct the dummy bref,
2192 * then allocate the in-memory chain structure. Set the
2193 * INITIAL flag for fresh chains.
2195 bzero(&dummy, sizeof(dummy));
2198 dummy.keybits = keybits;
2199 dummy.data_off = hammer2_getradix(bytes);
2200 dummy.methods = parent->bref.methods;
2201 chain = hammer2_chain_alloc(hmp, trans, &dummy);
2202 hammer2_chain_core_alloc(chain, NULL);
2204 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
2207 * Lock the chain manually, chain_lock will load the chain
2208 * which we do NOT want to do. (note: chain->refs is set
2209 * to 1 by chain_alloc() for us, but lockcnt is not).
2212 ccms_thread_lock(&chain->core->cst, CCMS_STATE_EXCLUSIVE);
2216 * We do NOT set INITIAL here (yet). INITIAL is only
2217 * used for indirect blocks.
2219 * Recalculate bytes to reflect the actual media block
2222 bytes = (hammer2_off_t)1 <<
2223 (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
2224 chain->bytes = bytes;
2227 case HAMMER2_BREF_TYPE_VOLUME:
2228 case HAMMER2_BREF_TYPE_FREEMAP:
2229 panic("hammer2_chain_create: called with volume type");
2231 case HAMMER2_BREF_TYPE_INODE:
2232 KKASSERT(bytes == HAMMER2_INODE_BYTES);
2233 atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED);
2234 chain->data = kmalloc(sizeof(chain->data->ipdata),
2235 hmp->mchain, M_WAITOK | M_ZERO);
2237 case HAMMER2_BREF_TYPE_INDIRECT:
/* BUGFIX: adjacent literals previously concatenated to "tocreate" */
2238 panic("hammer2_chain_create: cannot be used to "
2239 "create indirect block");
2241 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
/* BUGFIX: adjacent literals previously concatenated to "tocreate" */
2242 panic("hammer2_chain_create: cannot be used to "
2243 "create freemap root or node");
2245 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2246 KKASSERT(bytes == sizeof(chain->data->bmdata));
2247 atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED);
2248 chain->data = kmalloc(sizeof(chain->data->bmdata),
2249 hmp->mchain, M_WAITOK | M_ZERO);
2251 case HAMMER2_BREF_TYPE_DATA:
2253 /* leave chain->data NULL */
2254 KKASSERT(chain->data == NULL);
2259 * Potentially update the existing chain's key/keybits.
2261 * Do NOT mess with the current state of the INITIAL flag.
2263 chain->bref.key = key;
2264 chain->bref.keybits = keybits;
2265 KKASSERT(chain->above == NULL);
2269 above = parent->core;
2272 * Locate a free blockref in the parent's array
2274 switch(parent->bref.type) {
2275 case HAMMER2_BREF_TYPE_INODE:
2276 KKASSERT((parent->data->ipdata.op_flags &
2277 HAMMER2_OPFLAG_DIRECTDATA) == 0);
2278 KKASSERT(parent->data != NULL);
2279 base = &parent->data->ipdata.u.blockset.blockref[0];
2280 count = HAMMER2_SET_COUNT;
2282 case HAMMER2_BREF_TYPE_INDIRECT:
2283 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2284 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2287 KKASSERT(parent->data != NULL);
2288 base = &parent->data->npdata[0];
2290 count = parent->bytes / sizeof(hammer2_blockref_t);
2292 case HAMMER2_BREF_TYPE_VOLUME:
2293 KKASSERT(parent->data != NULL);
2294 base = &hmp->voldata.sroot_blockset.blockref[0];
2295 count = HAMMER2_SET_COUNT;
2297 case HAMMER2_BREF_TYPE_FREEMAP:
2298 KKASSERT(parent->data != NULL);
2299 base = &hmp->voldata.freemap_blockset.blockref[0];
2300 count = HAMMER2_SET_COUNT;
2303 panic("hammer2_chain_create: unrecognized blockref type: %d",
2310 * Scan for an unallocated bref, also skipping any slots occupied
2311 * by in-memory chain elements that may not yet have been updated
2312 * in the parent's bref array.
2314 * We don't have to hold the spinlock to save an empty slot as
2315 * new slots can only transition from empty if the parent is
2316 * locked exclusively.
2318 spin_lock(&above->cst.spin);
2319 for (i = 0; i < count; ++i) {
2320 child = hammer2_chain_find_locked(parent, i);
2322 if (child->flags & HAMMER2_CHAIN_DELETED)
2328 if (base[i].type == 0)
2331 spin_unlock(&above->cst.spin);
2334 * If no free blockref could be found we must create an indirect
2335 * block and move a number of blockrefs into it. With the parent
2336 * locked we can safely lock each child in order to move it without
2337 * causing a deadlock.
2339 * This may return the new indirect block or the old parent depending
2340 * on where the key falls. NULL is returned on error.
2343 hammer2_chain_t *nparent;
2345 nparent = hammer2_chain_create_indirect(trans, parent,
2348 if (nparent == NULL) {
2350 hammer2_chain_drop(chain);
2354 if (parent != nparent) {
2355 hammer2_chain_unlock(parent);
2356 parent = *parentp = nparent;
2362 * Link the chain into its parent. Later on we will have to set
2363 * the MOVED bit in situations where we don't mark the new chain
2364 * as being modified.
2366 if (chain->above != NULL)
2367 panic("hammer2: hammer2_chain_create: chain already connected");
2368 KKASSERT(chain->above == NULL);
2369 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
2371 chain->above = above;
2373 spin_lock(&above->cst.spin);
2374 if (RB_INSERT(hammer2_chain_tree, &above->rbtree, chain))
2375 panic("hammer2_chain_create: collision");
2376 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
2377 spin_unlock(&above->cst.spin);
2381 * Mark the newly created chain modified.
2383 * Device buffers are not instantiated for DATA elements
2384 * as these are handled by logical buffers.
2386 * Indirect and freemap node indirect blocks are handled
2387 * by hammer2_chain_create_indirect() and not by this
2390 * Data for all other bref types is expected to be
2391 * instantiated (INODE, LEAF).
2393 switch(chain->bref.type) {
2394 case HAMMER2_BREF_TYPE_DATA:
2395 hammer2_chain_modify(trans, &chain,
2396 HAMMER2_MODIFY_OPTDATA |
2397 HAMMER2_MODIFY_ASSERTNOCOPY);
2399 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2400 case HAMMER2_BREF_TYPE_INODE:
2401 hammer2_chain_modify(trans, &chain,
2402 HAMMER2_MODIFY_ASSERTNOCOPY);
2406 * Remaining types are not supported by this function.
2407 * In particular, INDIRECT and LEAF_NODE types are
2408 * handled by create_indirect().
2410 panic("hammer2_chain_create: bad type: %d",
2417 * When reconnecting a chain we must set MOVED and setsubmod
2418 * so the flush recognizes that it must update the bref in
2421 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2422 hammer2_chain_ref(chain);
2423 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2425 hammer2_chain_setsubmod(trans, chain);
2435 * Replace (*chainp) with a duplicate. The original *chainp is unlocked
2436 * and the replacement will be returned locked. Both the original and the
2437 * new chain will share the same RBTREE (have the same chain->core), with
2438 * the new chain becoming the 'current' chain (meaning it is the first in
2439 * the linked list at core->chain_first).
2441 * If (parent, i) then the new duplicated chain is inserted under the parent
2442 * at the specified index (the parent must not have a ref at that index).
2444 * If (NULL, -1) then the new duplicated chain is not inserted anywhere,
2445 * similar to if it had just been chain_alloc()'d (suitable for passing into
2446 * hammer2_chain_create() after this function returns).
2448 * NOTE! Duplication is used in order to retain the original topology to
2449 * support flush synchronization points. Both the original and the
2450 * new chain will have the same transaction id and thus the operation
2451 * appears atomic w/regards to media flushes.
2453 static void hammer2_chain_dup_fixup(hammer2_chain_t *ochain,
2454 hammer2_chain_t *nchain);
/*
 * Duplicate *chainp onto the same chain core (per the contract in the
 * block comment above): allocate a new chain referencing the same bref,
 * optionally insert it under (parent, i), then migrate MODIFIED/INITIAL
 * state from the old chain to the new one.
 *
 * NOTE(review): this excerpt is missing intermediate lines (braces,
 * break statements); comments below describe only the visible code.
 */
2457 hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t *parent, int i,
2458 hammer2_chain_t **chainp, hammer2_blockref_t *bref)
2460 hammer2_mount_t *hmp;
2461 hammer2_blockref_t *base;
2462 hammer2_chain_t *ochain;
2463 hammer2_chain_t *nchain;
2464 hammer2_chain_t *scan;
2465 hammer2_chain_core_t *above;
2472 * First create a duplicate of the chain structure, associating
2473 * it with the same core, making it the same size, pointing it
2474 * to the same bref (the same media block).
/* Default to duplicating the original chain's own bref when the caller
 * did not supply one. */
2479 bref = &ochain->bref;
2480 nchain = hammer2_chain_alloc(hmp, trans, bref);
2481 hammer2_chain_core_alloc(nchain, ochain->core);
/* Size is encoded as a radix in the low bits of data_off. */
2482 bytes = (hammer2_off_t)1 <<
2483 (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
2484 nchain->bytes = bytes;
2485 nchain->modify_tid = ochain->modify_tid;
/* Lock without resolving data; copy embedded data where needed
 * (see hammer2_chain_dup_fixup()). */
2487 hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER);
2488 hammer2_chain_dup_fixup(ochain, nchain);
2491 * If parent is not NULL, insert into the parent at the requested
2492 * index. The newly duplicated chain must be marked MOVED and
2493 * SUBMODIFIED set in its parent(s).
2495 * Having both chains locked is extremely important for atomicity.
2499 * Locate a free blockref in the parent's array
2501 above = parent->core;
2502 KKASSERT(ccms_thread_lock_owned(&above->cst));
/* Resolve the parent's media blockref array and its element count,
 * which depend on the parent's bref type. */
2504 switch(parent->bref.type) {
2505 case HAMMER2_BREF_TYPE_INODE:
/* Inodes with DIRECTDATA have no blockset to insert into. */
2506 KKASSERT((parent->data->ipdata.op_flags &
2507 HAMMER2_OPFLAG_DIRECTDATA) == 0);
2508 KKASSERT(parent->data != NULL);
2509 base = &parent->data->ipdata.u.blockset.blockref[0];
2510 count = HAMMER2_SET_COUNT;
2512 case HAMMER2_BREF_TYPE_INDIRECT:
2513 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
/* INITIAL parents have no media data instantiated yet. */
2514 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2517 KKASSERT(parent->data != NULL);
2518 base = &parent->data->npdata[0];
2520 count = parent->bytes / sizeof(hammer2_blockref_t);
2522 case HAMMER2_BREF_TYPE_VOLUME:
2523 KKASSERT(parent->data != NULL);
2524 base = &hmp->voldata.sroot_blockset.blockref[0];
2525 count = HAMMER2_SET_COUNT;
2527 case HAMMER2_BREF_TYPE_FREEMAP:
2528 KKASSERT(parent->data != NULL);
2529 base = &hmp->voldata.freemap_blockset.blockref[0];
2530 count = HAMMER2_SET_COUNT;
/* NOTE(review): panic text says "hammer2_chain_create" but this is
 * hammer2_chain_duplicate -- looks like a copy/paste leftover. */
2533 panic("hammer2_chain_create: unrecognized "
2534 "blockref type: %d",
2539 KKASSERT(i >= 0 && i < count);
2541 KKASSERT((nchain->flags & HAMMER2_CHAIN_DELETED) == 0);
2542 KKASSERT(parent->refs > 0);
/* Insert nchain into the parent's RB tree under the core spinlock.
 * The target slot must be empty on-media or occupied only by a
 * DELETED in-memory chain. */
2544 spin_lock(&above->cst.spin);
2545 nchain->above = above;
2547 scan = hammer2_chain_find_locked(parent, i);
2548 KKASSERT(base == NULL || base[i].type == 0 ||
2550 (scan->flags & HAMMER2_CHAIN_DELETED));
2551 if (RB_INSERT(hammer2_chain_tree, &above->rbtree,
2553 panic("hammer2_chain_duplicate: collision");
2555 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_ONRBTREE);
2556 spin_unlock(&above->cst.spin);
/* MOVED carries its own chain ref so the flusher can find nchain. */
2558 if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2559 hammer2_chain_ref(nchain);
2560 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED);
2562 hammer2_chain_setsubmod(trans, nchain);
2566 * We have to unlock ochain to flush any dirty data, asserting the
2567 * case (data == NULL) to catch any extra locks that might have been
2568 * present, then transfer state to nchain.
2570 oflags = ochain->flags;
2571 odata = ochain->data;
2572 hammer2_chain_unlock(ochain);
2573 KKASSERT((ochain->flags & HAMMER2_CHAIN_EMBEDDED) ||
2574 ochain->data == NULL);
2576 if (oflags & HAMMER2_CHAIN_INITIAL)
2577 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL);
2580 * WARNING! We should never resolve DATA to device buffers
2581 * (XXX allow it if the caller did?), and since
2582 * we currently do not have the logical buffer cache
2583 * buffer in-hand to fix its cached physical offset
2584 * we also force the modify code to not COW it. XXX
/* Transfer the MODIFIED state: re-mark nchain modified with flags
 * appropriate to its type, dropping the ref the old MODIFIED state
 * held (the hammer2_chain_drop(nchain) calls below). */
2586 if (oflags & HAMMER2_CHAIN_MODIFIED) {
2587 if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
2588 hammer2_chain_modify(trans, &nchain,
2589 HAMMER2_MODIFY_OPTDATA |
2590 HAMMER2_MODIFY_NOREALLOC |
2591 HAMMER2_MODIFY_ASSERTNOCOPY);
2592 } else if (oflags & HAMMER2_CHAIN_INITIAL) {
2593 hammer2_chain_modify(trans, &nchain,
2594 HAMMER2_MODIFY_OPTDATA |
2595 HAMMER2_MODIFY_ASSERTNOCOPY);
2597 hammer2_chain_modify(trans, &nchain,
2598 HAMMER2_MODIFY_ASSERTNOCOPY);
2600 hammer2_chain_drop(nchain);
2602 if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
2603 hammer2_chain_drop(nchain);
2604 } else if (oflags & HAMMER2_CHAIN_INITIAL) {
2605 hammer2_chain_drop(nchain);
/* Non-DATA, non-INITIAL: resolve data fully, consuming the extra
 * ref via RESOLVE_NOREF. */
2607 hammer2_chain_lock(nchain, HAMMER2_RESOLVE_ALWAYS |
2608 HAMMER2_RESOLVE_NOREF);
2609 hammer2_chain_unlock(nchain);
2612 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2618 * When the chain is in the INITIAL state we must still
2619 * ensure that a block has been assigned so MOVED processing
2620 * works as expected.
2622 KKASSERT (nchain->bref.type != HAMMER2_BREF_TYPE_DATA);
2623 hammer2_chain_modify(trans, &nchain,
2624 HAMMER2_MODIFY_OPTDATA |
2625 HAMMER2_MODIFY_ASSERTNOCOPY);
2628 hammer2_chain_lock(nchain, HAMMER2_RESOLVE_MAYBE |
2629 HAMMER2_RESOLVE_NOREF); /* eat excess ref */
2630 hammer2_chain_unlock(nchain);
2634 * Special in-place delete-duplicate sequence which does not require a
2635 * locked parent. (*chainp) is marked DELETED and atomically replaced
2636 * with a duplicate. Atomicity is at the very-fine spin-lock level in
2637 * order to ensure that lookups do not race us.
/*
 * In-place delete-duplicate: atomically mark *chainp DELETED and replace
 * it with a freshly allocated duplicate on the same core (or a new core
 * when HAMMER2_DELDUP_RECORE is requested).  No locked parent required;
 * atomicity is provided by the core spinlock.
 *
 * NOTE(review): this excerpt is missing intermediate lines (braces,
 * break statements); comments below describe only the visible code.
 */
2640 hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
2643 hammer2_mount_t *hmp;
2644 hammer2_chain_t *ochain;
2645 hammer2_chain_t *nchain;
2646 hammer2_chain_core_t *above;
2652 * First create a duplicate of the chain structure
2656 nchain = hammer2_chain_alloc(hmp, trans, &ochain->bref); /* 1 ref */
/* RECORE gives the duplicate a brand new core instead of sharing
 * ochain's core. */
2657 if (flags & HAMMER2_DELDUP_RECORE)
2658 hammer2_chain_core_alloc(nchain, NULL);
2660 hammer2_chain_core_alloc(nchain, ochain->core);
2661 above = ochain->above;
/* Size is encoded as a radix in the low bits of data_off. */
2663 bytes = (hammer2_off_t)1 <<
2664 (int)(ochain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
2665 nchain->bytes = bytes;
2666 nchain->modify_tid = ochain->modify_tid;
2669 * Lock nchain and insert into ochain's core hierarchy, marking
2670 * ochain DELETED at the same time. Having both chains locked
2671 * is extremely important for atomicity.
2673 hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER);
2674 hammer2_chain_dup_fixup(ochain, nchain);
2675 /* extra ref still present from original allocation */
/* nchain takes over ochain's slot in the parent. */
2677 nchain->index = ochain->index;
/* Under the core spinlock: delete ochain (DELETED + delete_tid +
 * MOVED w/ref) and insert nchain into the RB tree in one atomic step
 * so lookups never observe an intermediate state. */
2679 spin_lock(&above->cst.spin);
2680 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_ONRBTREE);
2681 ochain->delete_tid = trans->sync_tid;
2682 nchain->above = above;
2683 atomic_set_int(&ochain->flags, HAMMER2_CHAIN_DELETED);
2684 if ((ochain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2685 hammer2_chain_ref(ochain);
2686 atomic_set_int(&ochain->flags, HAMMER2_CHAIN_MOVED);
2688 if (RB_INSERT(hammer2_chain_tree, &above->rbtree, nchain)) {
2689 panic("hammer2_chain_delete_duplicate: collision");
2691 spin_unlock(&above->cst.spin);
2694 * We have to unlock ochain to flush any dirty data, asserting the
2695 * case (data == NULL) to catch any extra locks that might have been
2696 * present, then transfer state to nchain.
2698 oflags = ochain->flags;
2699 odata = ochain->data;
2700 hammer2_chain_unlock(ochain); /* replacing ochain */
2701 KKASSERT(ochain->bref.type == HAMMER2_BREF_TYPE_INODE ||
2702 ochain->data == NULL);
2704 if (oflags & HAMMER2_CHAIN_INITIAL)
2705 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL);
2708 * WARNING! We should never resolve DATA to device buffers
2709 * (XXX allow it if the caller did?), and since
2710 * we currently do not have the logical buffer cache
2711 * buffer in-hand to fix its cached physical offset
2712 * we also force the modify code to not COW it. XXX
/* Transfer MODIFIED state to nchain, mirroring the logic in
 * hammer2_chain_duplicate(); the drops consume the extra alloc ref. */
2714 if (oflags & HAMMER2_CHAIN_MODIFIED) {
2715 if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
2716 hammer2_chain_modify(trans, &nchain,
2717 HAMMER2_MODIFY_OPTDATA |
2718 HAMMER2_MODIFY_NOREALLOC |
2719 HAMMER2_MODIFY_ASSERTNOCOPY);
2720 } else if (oflags & HAMMER2_CHAIN_INITIAL) {
2721 hammer2_chain_modify(trans, &nchain,
2722 HAMMER2_MODIFY_OPTDATA |
2723 HAMMER2_MODIFY_ASSERTNOCOPY);
2725 hammer2_chain_modify(trans, &nchain,
2726 HAMMER2_MODIFY_ASSERTNOCOPY);
2728 hammer2_chain_drop(nchain);
2730 if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
2731 hammer2_chain_drop(nchain);
2732 } else if (oflags & HAMMER2_CHAIN_INITIAL) {
2733 hammer2_chain_drop(nchain);
2735 hammer2_chain_lock(nchain, HAMMER2_RESOLVE_ALWAYS |
2736 HAMMER2_RESOLVE_NOREF);
2737 hammer2_chain_unlock(nchain);
2742 * Unconditionally set the MOVED and SUBMODIFIED bit to force
2743 * update of parent bref and indirect blockrefs during flush.
2745 if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2746 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED);
2747 hammer2_chain_ref(nchain);
2749 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2750 hammer2_chain_setsubmod(trans, nchain);
2755 * Helper function to fixup inodes. The caller procedure stack may hold
2756 * multiple locks on ochain if it represents an inode, preventing our
2757 * unlock from retiring its state to the buffer cache.
2759 * In this situation any attempt to access the buffer cache could result
2760 * either in stale data or a deadlock. Work around the problem by copying
2761 * the embedded data directly.
/*
 * Copy ochain's embedded data directly into a private kmalloc'd buffer
 * on nchain (marking it EMBEDDED) instead of going through the buffer
 * cache -- callers may still hold locks on ochain (see comment above),
 * so touching the buffer cache here could deadlock or read stale data.
 * Only INODE and FREEMAP_LEAF brefs carry embedded data to copy.
 */
2765 hammer2_chain_dup_fixup(hammer2_chain_t *ochain, hammer2_chain_t *nchain)
/* Nothing to copy if the original has no data instantiated. */
2767 if (ochain->data == NULL)
2769 switch(ochain->bref.type) {
2770 case HAMMER2_BREF_TYPE_INODE:
2771 KKASSERT(nchain->data == NULL);
2772 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_EMBEDDED);
2773 nchain->data = kmalloc(sizeof(nchain->data->ipdata),
2774 ochain->hmp->mchain, M_WAITOK | M_ZERO);
/* struct assignment copies the whole inode data block. */
2775 nchain->data->ipdata = ochain->data->ipdata;
2777 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2778 KKASSERT(nchain->data == NULL);
2779 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_EMBEDDED);
2780 nchain->data = kmalloc(sizeof(nchain->data->bmdata),
2781 ochain->hmp->mchain, M_WAITOK | M_ZERO);
2782 bcopy(ochain->data->bmdata,
2783 nchain->data->bmdata,
2784 sizeof(nchain->data->bmdata));
2792 * Create a snapshot of the specified {parent, chain} with the specified
2795 * (a) We create a duplicate connected to the super-root as the specified
2798 * (b) We issue a restricted flush using the current transaction on the
2801 * (c) We disconnect and reallocate the duplicate's core.
/*
 * Create a named snapshot of ip under the super-root: duplicate the
 * inode chain disconnected, link it in under a directory-hash key,
 * stamp the new inode data (name, PFS type SNAPSHOT, fresh fsid/clid),
 * then perform a restricted flush of the snapshot chain.
 *
 * NOTE(review): this excerpt is missing intermediate lines (braces,
 * break statements, error handling); comments describe visible code only.
 */
2804 hammer2_chain_snapshot(hammer2_trans_t *trans, hammer2_inode_t *ip,
2805 hammer2_ioc_pfs_t *pfs)
2807 hammer2_cluster_t *cluster;
2808 hammer2_mount_t *hmp;
2809 hammer2_chain_t *chain;
2810 hammer2_chain_t *nchain;
2811 hammer2_chain_t *parent;
2812 hammer2_inode_data_t *ipdata;
/* Snapshot name hashes to the directory-entry key in the super-root. */
2817 name_len = strlen(pfs->name);
2818 lhc = hammer2_dirhash(pfs->name, name_len);
2819 cluster = ip->pmp->mount_cluster;
2820 hmp = ip->chain->hmp;
2821 KKASSERT(hmp == cluster->hmp); /* XXX */
2824 * Create disconnected duplicate
2826 KKASSERT((trans->flags & HAMMER2_TRANS_RESTRICTED) == 0);
2828 hammer2_chain_lock(nchain, HAMMER2_RESOLVE_MAYBE);
/* (NULL, -1) duplicates without inserting anywhere -- see
 * hammer2_chain_duplicate()'s contract. */
2829 hammer2_chain_duplicate(trans, NULL, -1, &nchain, NULL);
2830 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_RECYCLE |
2831 HAMMER2_CHAIN_SNAPSHOT);
2834 * Create named entry in the super-root.
2836 parent = hammer2_chain_lookup_init(hmp->schain, 0);
/* Probe for a free hash key; the low bits of lhc are bumped on
 * collision until the LOMASK range is exhausted. */
2838 while (error == 0) {
2839 chain = hammer2_chain_lookup(&parent, lhc, lhc, 0);
2842 if ((lhc & HAMMER2_DIRHASH_LOMASK) == HAMMER2_DIRHASH_LOMASK)
2844 hammer2_chain_unlock(chain);
2848 hammer2_chain_create(trans, &parent, &nchain, lhc, 0,
2849 HAMMER2_BREF_TYPE_INODE,
2850 HAMMER2_INODE_BYTES);
2851 hammer2_chain_modify(trans, &nchain, HAMMER2_MODIFY_ASSERTNOCOPY);
2852 hammer2_chain_lookup_done(parent);
2853 parent = NULL; /* safety */
/* Fill in the snapshot inode's name and key. */
2858 ipdata = &nchain->data->ipdata;
2859 ipdata->name_key = lhc;
2860 ipdata->name_len = name_len;
2861 ksnprintf(ipdata->filename, sizeof(ipdata->filename), "%s", pfs->name);
2864 * Set PFS type, generate a unique filesystem id, and generate
2865 * a cluster id. Use the same clid when snapshotting a PFS root,
2866 * which theoretically allows the snapshot to be used as part of
2867 * the same cluster (perhaps as a cache).
2869 ipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
2870 kern_uuidgen(&ipdata->pfs_fsid, 1);
2871 if (ip->chain == cluster->rchain)
2872 ipdata->pfs_clid = ip->chain->data->ipdata.pfs_clid;
2874 kern_uuidgen(&ipdata->pfs_clid, 1);
2877 * Issue a restricted flush of the snapshot. This is a synchronous
/* NOTE(review): the SNAPSHOTA/SNAPSHOTB kprintfs and the 4-second
 * tsleep look like leftover debugging instrumentation -- confirm
 * before shipping. */
2880 trans->flags |= HAMMER2_TRANS_RESTRICTED;
2881 kprintf("SNAPSHOTA\n");
2882 tsleep(trans, 0, "snapslp", hz*4);
2883 kprintf("SNAPSHOTB\n");
2884 hammer2_chain_flush(trans, nchain);
2885 trans->flags &= ~HAMMER2_TRANS_RESTRICTED;
2889 * Remove the link b/c nchain is a snapshot and snapshots don't
2890 * follow CHAIN_DELETED semantics ?
/* Unhook nchain from the duplication link chain and drop the
 * DUPTARGET ref it held. */
2895 KKASSERT(chain->duplink == nchain);
2896 KKASSERT(chain->core == nchain->core);
2897 KKASSERT(nchain->refs >= 2);
2898 chain->duplink = nchain->duplink;
2899 atomic_clear_int(&nchain->flags, HAMMER2_CHAIN_DUPTARGET);
2900 hammer2_chain_drop(nchain);
2903 kprintf("snapshot %s nchain->refs %d nchain->flags %08x\n",
2904 pfs->name, nchain->refs, nchain->flags);
2905 hammer2_chain_unlock(nchain);
2911 * Create an indirect block that covers one or more of the elements in the
2912 * current parent. Either returns the existing parent with no locking or
2913 * ref changes or returns the new indirect block locked and referenced
2914 * and leaving the original parent lock/ref intact as well.
2916 * If an error occurs, NULL is returned and *errorp is set to the error.
2918 * The returned chain depends on where the specified key falls.
2920 * The key/keybits for the indirect mode only needs to follow three rules:
2922 * (1) That all elements underneath it fit within its key space and
2924 * (2) That all elements outside it are outside its key space.
2926 * (3) When creating the new indirect block any elements in the current
2927 * parent that fit within the new indirect block's keyspace must be
2928 * moved into the new indirect block.
2930 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
2931 * keyspace than the current parent, but lookup/iteration rules will
2932 * ensure (and must ensure) that rule (2) for all parents leading up
2933 * to the nearest inode or the root volume header is adhered to. This
2934 * is accomplished by always recursing through matching keyspaces in
2935 * the hammer2_chain_lookup() and hammer2_chain_next() API.
2937 * The current implementation calculates the current worst-case keyspace by
2938 * iterating the current parent and then divides it into two halves, choosing
2939 * whichever half has the most elements (not necessarily the half containing
2940 * the requested key).
2942 * We can also opt to use the half with the least number of elements. This
2943 * causes lower-numbered keys (aka logical file offsets) to recurse through
2944 * fewer indirect blocks and higher-numbered keys to recurse through more.
2945 * This also has the risk of not moving enough elements to the new indirect
2946 * block and being forced to create several indirect blocks before the element
2949 * Must be called with an exclusively locked parent.
2951 static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
2952 hammer2_key_t *keyp, int keybits,
2953 hammer2_blockref_t *base, int count);
2954 static int hammer2_chain_indkey_normal(hammer2_chain_t *parent,
2955 hammer2_key_t *keyp, int keybits,
2956 hammer2_blockref_t *base, int count);
/*
 * Create an indirect block under a full parent (see the large block
 * comment above for the key/keybits rules): pick a key range, allocate
 * the indirect chain, migrate matching children into it via
 * delete+duplicate, then insert the indirect block into the freed slot.
 * Returns the new indirect chain or the original parent depending on
 * where create_key falls; NULL with *errorp set on error.
 *
 * NOTE(review): this excerpt is missing intermediate lines (braces,
 * break statements); comments below describe only the visible code.
 */
2959 hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
2960 hammer2_key_t create_key, int create_bits,
2961 int for_type, int *errorp)
2963 hammer2_mount_t *hmp;
2964 hammer2_chain_core_t *above;
2965 hammer2_chain_core_t *icore;
2966 hammer2_blockref_t *base;
2967 hammer2_blockref_t *bref;
2968 hammer2_chain_t *chain;
2969 hammer2_chain_t *child;
2970 hammer2_chain_t *ichain;
2971 hammer2_chain_t dummy;
2972 hammer2_key_t key = create_key;
2973 int keybits = create_bits;
2979 * Calculate the base blockref pointer or NULL if the chain
2980 * is known to be empty. We need to calculate the array count
2981 * for RB lookups either way.
2985 KKASSERT(ccms_thread_lock_owned(&parent->core->cst));
2986 above = parent->core;
2988 /*hammer2_chain_modify(trans, &parent, HAMMER2_MODIFY_OPTDATA);*/
/* INITIAL parents have no instantiated media array: count only. */
2989 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2992 switch(parent->bref.type) {
2993 case HAMMER2_BREF_TYPE_INODE:
2994 count = HAMMER2_SET_COUNT;
2996 case HAMMER2_BREF_TYPE_INDIRECT:
2997 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2998 count = parent->bytes / sizeof(hammer2_blockref_t);
3000 case HAMMER2_BREF_TYPE_VOLUME:
3001 count = HAMMER2_SET_COUNT;
3003 case HAMMER2_BREF_TYPE_FREEMAP:
3004 count = HAMMER2_SET_COUNT;
3007 panic("hammer2_chain_create_indirect: "
3008 "unrecognized blockref type: %d",
/* Non-INITIAL: resolve both base array pointer and count. */
3014 switch(parent->bref.type) {
3015 case HAMMER2_BREF_TYPE_INODE:
3016 base = &parent->data->ipdata.u.blockset.blockref[0];
3017 count = HAMMER2_SET_COUNT;
3019 case HAMMER2_BREF_TYPE_INDIRECT:
3020 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3021 base = &parent->data->npdata[0];
3022 count = parent->bytes / sizeof(hammer2_blockref_t);
3024 case HAMMER2_BREF_TYPE_VOLUME:
3025 base = &hmp->voldata.sroot_blockset.blockref[0];
3026 count = HAMMER2_SET_COUNT;
3028 case HAMMER2_BREF_TYPE_FREEMAP:
3029 base = &hmp->voldata.freemap_blockset.blockref[0];
3030 count = HAMMER2_SET_COUNT;
3033 panic("hammer2_chain_create_indirect: "
3034 "unrecognized blockref type: %d",
3042 * dummy used in later chain allocation (no longer used for lookups).
3044 bzero(&dummy, sizeof(dummy));
3045 dummy.delete_tid = HAMMER2_MAX_TID;
3048 * When creating an indirect block for a freemap node or leaf
3049 * the key/keybits must be fitted to static radix levels because
3050 * particular radix levels use particular reserved blocks in the
3053 * This routine calculates the key/radix of the indirect block
3054 * we need to create, and whether it is on the high-side or the
3057 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
3058 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
3059 keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
3062 keybits = hammer2_chain_indkey_normal(parent, &key, keybits,
3067 * Normalize the key for the radix being represented, keeping the
3068 * high bits and throwing away the low bits.
3070 key &= ~(((hammer2_key_t)1 << keybits) - 1);
3073 * How big should our new indirect block be? It has to be at least
3074 * as large as its parent.
3076 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
3077 nbytes = HAMMER2_IND_BYTES_MIN;
3079 nbytes = HAMMER2_IND_BYTES_MAX;
3080 if (nbytes < count * sizeof(hammer2_blockref_t))
3081 nbytes = count * sizeof(hammer2_blockref_t);
3084 * Ok, create our new indirect block
3086 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
3087 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
3088 dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
3090 dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
3092 dummy.bref.key = key;
3093 dummy.bref.keybits = keybits;
3094 dummy.bref.data_off = hammer2_getradix(nbytes);
3095 dummy.bref.methods = parent->bref.methods;
3097 ichain = hammer2_chain_alloc(hmp, trans, &dummy.bref);
3098 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
3099 hammer2_chain_core_alloc(ichain, NULL);
3100 icore = ichain->core;
3101 hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
3102 hammer2_chain_drop(ichain); /* excess ref from alloc */
3105 * We have to mark it modified to allocate its block, but use
3106 * OPTDATA to allow it to remain in the INITIAL state. Otherwise
3107 * it won't be acted upon by the flush code.
3109 * XXX leave the node unmodified, depend on the SUBMODIFIED
3110 * flush to assign and modify parent blocks.
3112 hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA);
3115 * Iterate the original parent and move the matching brefs into
3116 * the new indirect block.
3118 * At the same time locate an empty slot (or what will become an
3119 * empty slot) and assign the new indirect block to that slot.
3121 * XXX handle flushes.
3123 spin_lock(&above->cst.spin);
3124 for (i = 0; i < count; ++i) {
3126 * For keying purposes access the bref from the media or
3127 * from our in-memory cache. In cases where the in-memory
3128 * cache overrides the media the keyrefs will be the same
3129 * anyway so we can avoid checking the cache when the media
3132 child = hammer2_chain_find_locked(parent, i);
/* DELETED children leave a usable slot for ichain. */
3134 if (child->flags & HAMMER2_CHAIN_DELETED) {
3135 if (ichain->index < 0)
3139 bref = &child->bref;
3140 } else if (base && base[i].type) {
3143 if (ichain->index < 0)
3149 * Skip keys that are not within the key/radix of the new
3150 * indirect block. They stay in the parent.
3152 if ((~(((hammer2_key_t)1 << keybits) - 1) &
3153 (key ^ bref->key)) != 0) {
3158 * This element is being moved from the parent, its slot
3159 * is available for our new indirect block.
3161 if (ichain->index < 0)
3165 * Load the new indirect block by acquiring or allocating
3166 * the related chain entries, then move them to the new
3167 * parent (ichain) by deleting them from their old location
3168 * and inserting a duplicate of the chain and any modified
3169 * sub-chain in the new location.
3171 * We must set MOVED in the chain being duplicated and
3172 * SUBMODIFIED in the parent(s) so the flush code knows
3173 * what is going on. The latter is done after the loop.
3175 * WARNING! above->cst.spin must be held when parent is
3176 * modified, even though we own the full blown lock,
3177 * to deal with setsubmod and rename races.
3178 * (XXX remove this req).
/* Drop the spinlock around the blocking get/delete/duplicate
 * sequence, then re-acquire to continue the scan. */
3180 spin_unlock(&above->cst.spin);
3181 chain = hammer2_chain_get(parent, i, HAMMER2_LOOKUP_NODATA);
3182 hammer2_chain_delete(trans, chain);
3183 hammer2_chain_duplicate(trans, ichain, i, &chain, NULL);
3184 hammer2_chain_unlock(chain);
3185 KKASSERT(parent->refs > 0);
3187 spin_lock(&above->cst.spin);
3189 spin_unlock(&above->cst.spin);
3192 * Insert the new indirect block into the parent now that we've
3193 * cleared out some entries in the parent. We calculated a good
3194 * insertion index in the loop above (ichain->index).
3196 * We don't have to set MOVED here because we mark ichain modified
3197 * down below (so the normal modified -> flush -> set-moved sequence
3200 * The insertion shouldn't race as this is a completely new block
3201 * and the parent is locked.
3203 if (ichain->index < 0)
3204 kprintf("indirect parent %p count %d key %016jx/%d\n",
3205 parent, count, (intmax_t)key, keybits);
3206 KKASSERT(ichain->index >= 0);
3207 KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
3208 spin_lock(&above->cst.spin);
3209 if (RB_INSERT(hammer2_chain_tree, &above->rbtree, ichain))
3210 panic("hammer2_chain_create_indirect: ichain insertion");
3211 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_ONRBTREE);
3212 ichain->above = above;
3213 spin_unlock(&above->cst.spin);
3216 * Mark the new indirect block modified after insertion, which
3217 * will propagate up through parent all the way to the root and
3218 * also allocate the physical block in ichain for our caller,
3219 * and assign ichain->data to a pre-zero'd space (because there
3220 * is not prior data to copy into it).
3222 * We have to set SUBMODIFIED in ichain's flags manually so the
3223 * flusher knows it has to recurse through it to get to all of
3224 * our moved blocks, then call setsubmod() to set the bit
3227 /*hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA);*/
3228 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
3229 hammer2_chain_setsubmod(trans, ichain);
3232 * Figure out what to return.
/* If create_key falls outside the new indirect block's keyspace,
 * hand back the original parent; otherwise hand back ichain. */
3234 if (~(((hammer2_key_t)1 << keybits) - 1) &
3235 (create_key ^ key)) {
3237 * Key being created is outside the key range,
3238 * return the original parent.
3240 hammer2_chain_unlock(ichain);
3243 * Otherwise its in the range, return the new parent.
3244 * (leave both the new and old parent locked).
3253 * Calculate the keybits and highside/lowside of the freemap node the
3254 * caller is creating.
3256 * This routine will specify the next higher-level freemap key/radix
3257 * representing the lowest-ordered set. By doing so, eventually all
3258 * low-ordered sets will be moved one level down.
3260 * We have to be careful here because the freemap reserves a limited
3261 * number of blocks for a limited number of levels. So we can't just
3262 * push indiscriminately.
/*
 * Compute the key/keybits for a freemap indirect node.  Freemap levels
 * are fixed radix levels (reserved blocks), so rather than splitting a
 * dynamic keyspace this scans for the lowest-ordered key and returns
 * the next-higher static freemap level radix.  *keyp is updated; the
 * chosen keybits is the return value.
 */
3265 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
3266 int keybits, hammer2_blockref_t *base, int count)
3268 hammer2_chain_core_t *above;
3269 hammer2_chain_t *child;
3270 hammer2_blockref_t *bref;
3277 above = parent->core;
3283 * Calculate the range of keys in the array being careful to skip
3284 * slots which are overridden with a deletion.
3286 spin_lock(&above->cst.spin);
3287 for (i = 0; i < count; ++i) {
/* In-memory chain overrides the media bref for this slot. */
3288 child = hammer2_chain_find_locked(parent, i);
3290 if (child->flags & HAMMER2_CHAIN_DELETED)
3292 bref = &child->bref;
3293 } else if (base && base[i].type) {
/* Track the smallest keybits, and within equal keybits the
 * smallest key (lowest-ordered set). */
3299 if (keybits > bref->keybits) {
3301 keybits = bref->keybits;
3302 } else if (keybits == bref->keybits && bref->key < key) {
3306 spin_unlock(&above->cst.spin);
3309 * Return the keybits for a higher-level FREEMAP_NODE covering
/* Step up exactly one static freemap level; LEVEL4 cannot be
 * exceeded. */
3313 case HAMMER2_FREEMAP_LEVEL0_RADIX:
3314 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
3316 case HAMMER2_FREEMAP_LEVEL1_RADIX:
3317 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
3319 case HAMMER2_FREEMAP_LEVEL2_RADIX:
3320 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
3322 case HAMMER2_FREEMAP_LEVEL3_RADIX:
3323 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
3325 case HAMMER2_FREEMAP_LEVEL4_RADIX:
3326 panic("hammer2_chain_indkey_freemap: level too high");
3329 panic("hammer2_chain_indkey_freemap: bad radix");
3338 * Calculate the keybits and highside/lowside of the indirect block the
3339 * caller is creating.
/*
 * Compute the key/keybits for a normal (non-freemap) indirect block:
 * scan the parent's live brefs to find the full key range, then cut it
 * in half and pick the half per the hammer2_indirect_optimize policy.
 * *keyp is updated; the chosen keybits is the return value.
 */
3342 hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp,
3343 int keybits, hammer2_blockref_t *base, int count)
3345 hammer2_chain_core_t *above;
3346 hammer2_chain_t *child;
3347 hammer2_blockref_t *bref;
3355 above = parent->core;
3360 * Calculate the range of keys in the array being careful to skip
3361 * slots which are overridden with a deletion. Once the scan
3362 * completes we will cut the key range in half and shift half the
3363 * range into the new indirect block.
3365 spin_lock(&above->cst.spin);
3366 for (i = 0; i < count; ++i) {
/* In-memory chain overrides the media bref for this slot. */
3367 child = hammer2_chain_find_locked(parent, i);
3369 if (child->flags & HAMMER2_CHAIN_DELETED)
3371 bref = &child->bref;
3372 } else if (base && base[i].type) {
3379 * Expand our calculated key range (key, keybits) to fit
3380 * the scanned key. nkeybits represents the full range
3381 * that we will later cut in half (two halves @ nkeybits - 1).
3384 if (nkeybits < bref->keybits) {
/* Sanity: keybits beyond 64 indicates a corrupt bref. */
3385 if (bref->keybits > 64) {
3386 kprintf("bad bref index %d chain %p bref %p\n",
3390 nkeybits = bref->keybits;
/* Widen until the scanned key falls inside (key, nkeybits). */
3392 while (nkeybits < 64 &&
3393 (~(((hammer2_key_t)1 << nkeybits) - 1) &
3394 (key ^ bref->key)) != 0) {
3399 * If the new key range is larger we have to determine
3400 * which side of the new key range the existing keys fall
3401 * under by checking the high bit, then collapsing the
3402 * locount into the hicount or vice-versa.
3404 if (keybits != nkeybits) {
3405 if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
3416 * The newly scanned key will be in the lower half or the
3417 * higher half of the (new) key range.
3419 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
3424 spin_unlock(&above->cst.spin);
3425 bref = NULL; /* now invalid (safety) */
3428 * Adjust keybits to represent half of the full range calculated
3429 * above (radix 63 max)
3434 * Select whichever half contains the most elements. Theoretically
3435 * we can select either side as long as it contains at least one
3436 * element (in order to ensure that a free slot is present to hold
3437 * the indirect block).
3439 if (hammer2_indirect_optimize) {
3441 * Insert node for least number of keys, this will arrange
3442 * the first few blocks of a large file or the first few
3443 * inodes in a directory with fewer indirect blocks when
3446 if (hicount < locount && hicount != 0)
3447 key |= (hammer2_key_t)1 << keybits;
3449 key &= ~(hammer2_key_t)1 << keybits;
3452 * Insert node for most number of keys, best for heavily
/* Default policy: take the denser half. */
3455 if (hicount > locount)
3456 key |= (hammer2_key_t)1 << keybits;
3458 key &= ~(hammer2_key_t)1 << keybits;
3466 * Sets CHAIN_DELETED and CHAIN_MOVED in the chain being deleted and
3467 * set chain->delete_tid.
3469 * This function does NOT generate a modification to the parent. It
3470 * would be nearly impossible to figure out which parent to modify anyway.
3471 * Such modifications are handled by the flush code and are properly merged
3472 * using the flush synchronization point.
3474 * The find/get code will properly overload the RBTREE check on top of
3475 * the bref check to detect deleted entries.
3477 * This function is NOT recursive. Any entity already pushed into the
3478 * chain (such as an inode) may still need visibility into its contents,
3479 * as well as the ability to read and modify the contents. For example,
3480 * for an unlinked file which is still open.
3482 * NOTE: This function does NOT set chain->modify_tid, allowing future
3483 * code to distinguish between live and deleted chains by testing
3486 * NOTE: Deletions normally do not occur in the middle of a duplication
3487 * chain but we use a trick for hardlink migration that refactors
3488 * the originating inode without deleting it, so we make no assumptions
/*
 * Mark a chain deleted (see the contract in the block comment above):
 * set DELETED + MOVED and record delete_tid.  Non-recursive; does not
 * modify the parent and does not touch modify_tid.
 */
3492 hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *chain)
3494 KKASSERT(ccms_thread_lock_owned(&chain->core->cst));
3497 * Nothing to do if already marked.
3499 if (chain->flags & HAMMER2_CHAIN_DELETED)
3503 * We must set MOVED along with DELETED for the flush code to
3504 * recognize the operation and properly disconnect the chain
3507 * The setting of DELETED causes finds, lookups, and _next iterations
3508 * to no longer recognize the chain. RB_SCAN()s will still have
3509 * visibility (needed for flush serialization points).
3511 * We need the spinlock on the core whos RBTREE contains chain
3512 * to protect against races.
3514 spin_lock(&chain->above->cst.spin);
3515 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
/* MOVED carries its own chain ref for the flusher. */
3516 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
3517 hammer2_chain_ref(chain);
3518 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
3520 chain->delete_tid = trans->sync_tid;
3521 spin_unlock(&chain->above->cst.spin);
3522 hammer2_chain_setsubmod(trans, chain);
/*
 * Brief wait on a chain: sleep up to one tick on the chain address
 * (wakeup presumably issued elsewhere when the chain settles --
 * TODO confirm against the flush code).
 */
3526 hammer2_chain_wait(hammer2_chain_t *chain)
3528 tsleep(chain, 0, "chnflw", 1);
3533 adjreadcounter(hammer2_blockref_t *bref, size_t bytes)
3537 switch(bref->type) {
3538 case HAMMER2_BREF_TYPE_DATA:
3539 counterp = &hammer2_iod_file_read;
3541 case HAMMER2_BREF_TYPE_INODE:
3542 counterp = &hammer2_iod_meta_read;
3544 case HAMMER2_BREF_TYPE_INDIRECT:
3545 counterp = &hammer2_iod_indr_read;
3547 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3548 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
3549 counterp = &hammer2_iod_fmap_read;
3552 counterp = &hammer2_iod_volu_read;