2 * Copyright (c) 2011-2013 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
18 * 3. Neither the name of The DragonFly Project nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific, prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * This subsystem implements most of the core support functions for
37 * the hammer2_chain and hammer2_chain_core structures.
39 * Chains represent the filesystem media topology in-memory. Any given
40 * chain can represent an inode, indirect block, data, or other types
43 * This module provides APIs for direct and indirect block searches,
44 * iterations, recursions, creation, deletion, replication, and snapshot
45 * views (used by the flush and snapshot code).
47 * Generally speaking any modification made to a chain must propagate all
48 * the way back to the volume header, issuing copy-on-write updates to the
49 * blockref tables all the way up. Any chain except the volume header itself
50 * can be flushed to disk at any time, in any order. None of it matters
51 * until we get to the point where we want to synchronize the volume header
52 * (see the flush code).
54 * The chain structure supports snapshot views in time, which are primarily
55 * used until the related data and meta-data is flushed to allow the
56 * filesystem to make snapshots without requiring it to first flush,
57 * and to allow the filesystem flush and modify the filesystem concurrently
58 * with minimal or no stalls.
60 #include <sys/cdefs.h>
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/types.h>
65 #include <sys/kern_syscall.h>
/*
 * File-scope declarations: an indirect-block tuning knob, the forward
 * declaration of the indirect-block creation helper, and RB-tree code
 * generation for the chain tree (ordered by hammer2_chain_cmp).
 *
 * NOTE(review): every line in this file carries a stray leading line
 * number and the numbering has gaps -- this appears to be a corrupted
 * partial extraction of the original source.  Code is left byte-identical
 * here; restore from the pristine file before attempting to build.
 */
70 static int hammer2_indirect_optimize; /* XXX SYSCTL */
72 static hammer2_chain_t *hammer2_chain_create_indirect(
73 hammer2_trans_t *trans, hammer2_chain_t *parent,
74 hammer2_key_t key, int keybits, int for_type, int *errorp);
77 * We use a red-black tree to guarantee safe lookups under shared locks.
79 * Chains can be overloaded onto the same index, creating a different
80 * view of a blockref table based on a transaction id. The RBTREE
81 * deconflicts the view by sub-sorting on delete_tid.
83 * NOTE: Any 'current' chain which is not yet deleted will have a
84 * delete_tid of HAMMER2_MAX_TID (0xFFF....FFFLLU).
/* Generate the RB-tree implementation for the chain tree. */
86 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
89 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
91 if (chain1->index < chain2->index)
93 if (chain1->index > chain2->index)
95 if (chain1->delete_tid < chain2->delete_tid)
97 if (chain1->delete_tid > chain2->delete_tid)
103 * Recursively set the SUBMODIFIED flag up to the root starting at chain's
104 * parent. SUBMODIFIED is not set in chain itself.
106 * This function only operates on current-time transactions and is not
107 * used during flushes. Instead, the flush code manages the flag itself.
110 hammer2_chain_setsubmod(hammer2_trans_t *trans, hammer2_chain_t *chain)
112 hammer2_chain_core_t *above;
114 if (trans->flags & HAMMER2_TRANS_ISFLUSH)
116 while ((above = chain->above) != NULL) {
117 spin_lock(&above->cst.spin);
118 chain = above->first_parent;
119 while (hammer2_chain_refactor_test(chain, 1))
120 chain = chain->next_parent;
121 atomic_set_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
122 spin_unlock(&above->cst.spin);
127 * Allocate a new disconnected chain element representing the specified
128 * bref. chain->refs is set to 1 and the passed bref is copied to
129 * chain->bref. chain->bytes is derived from the bref.
131 * chain->core is NOT allocated and the media data and bp pointers are left
132 * NULL. The caller must call chain_core_alloc() to allocate or associate
133 * a core with the chain.
135 * NOTE: Returns a referenced but unlocked (because there is no core) chain.
/*
 * NOTE(review): fragmentary block -- the embedded line numbers jump
 * (140, 143-147, 154, 157, 159-166, 169, 172, 174+ are missing), so
 * the function braces, the switch header, break statements, and the
 * field assignments that copy the bref are absent from this extraction.
 * Left byte-identical; restore from the pristine source.
 */
138 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_trans_t *trans,
139 hammer2_blockref_t *bref)
141 hammer2_chain_t *chain;
/* Size of the media element, encoded as a radix in the low bits of data_off. */
142 u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
145 * Construct the appropriate system structure.
148 case HAMMER2_BREF_TYPE_INODE:
149 case HAMMER2_BREF_TYPE_INDIRECT:
150 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
151 case HAMMER2_BREF_TYPE_DATA:
152 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
153 chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
/* Volume/freemap roots are embedded in the mount and are never allocated here. */
155 case HAMMER2_BREF_TYPE_VOLUME:
156 case HAMMER2_BREF_TYPE_FREEMAP:
158 panic("hammer2_chain_alloc volume type illegal for op");
161 panic("hammer2_chain_alloc: unrecognized blockref type: %d",
167 chain->index = -1; /* not yet assigned */
168 chain->bytes = bytes;
170 chain->flags = HAMMER2_CHAIN_ALLOCATED;
/* Not yet deleted: delete_tid is the maximal sentinel (see RB-tree note). */
171 chain->delete_tid = HAMMER2_MAX_TID;
173 chain->modify_tid = trans->sync_tid;
179 * Associate an existing core with the chain or allocate a new core.
181 * The core is not locked. No additional refs on the chain are made.
/*
 * NOTE(review): fragmentary block -- line numbers jump (185, 187,
 * 190-191, 193, 195-196, 199, 201, 205, 207, 209, 211, 213+ missing),
 * so the function braces, the core==NULL/else split, and the scan-loop
 * header are absent.  Left byte-identical.
 */
184 hammer2_chain_core_alloc(hammer2_chain_t *chain, hammer2_chain_core_t *core)
186 hammer2_chain_t **scanp;
188 KKASSERT(chain->core == NULL);
189 KKASSERT(chain->next_parent == NULL);
/* Allocate-a-new-core path: fresh rbtree, cst, and first_parent = chain. */
192 core = kmalloc(sizeof(*core), chain->hmp->mchain,
194 RB_INIT(&core->rbtree);
197 ccms_cst_init(&core->cst, chain);
198 core->first_parent = chain;
/* Shared-core path: bump sharecnt and append chain to the parent list. */
200 atomic_add_int(&core->sharecnt, 1);
202 spin_lock(&core->cst.spin);
203 if (core->first_parent == NULL) {
204 core->first_parent = chain;
206 scanp = &core->first_parent;
208 scanp = &(*scanp)->next_parent;
210 hammer2_chain_ref(chain); /* next_parent link */
212 spin_unlock(&core->cst.spin);
217 * Add a reference to a chain element, preventing its destruction.
220 hammer2_chain_ref(hammer2_chain_t *chain)
222 atomic_add_int(&chain->refs, 1);
226 * Drop the caller's reference to the chain. When the ref count drops to
227 * zero this function will disassociate the chain from its parent and
/* (typo fixed in comment: "recursely" -> "recursively") */
228 * deallocate it, then recursively drop the parent using the implied ref
229 * from the chain's chain->parent.
231 * WARNING! Just because we are able to deallocate a chain doesn't mean
232 * that chain->core->rbtree is empty. There can still be a sharecnt
233 * on chain->core and RBTREE entries that refer to different parents.
/* Forward declaration: handler for the 1->0 refs transition (defined below). */
235 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain);
/*
 * NOTE(review): fragmentary block -- line numbers jump (236-243, 245,
 * 247, 249-256, 258, 260, 262+ missing), so the return type, braces,
 * the 'need'/'refs' computations, and the retry-loop structure are
 * absent from this extraction.  Left byte-identical.
 */
238 hammer2_chain_drop(hammer2_chain_t *chain)
/* MOVED and MODIFIED each hold an implied ref, accounted for in 'need'. */
244 if (chain->flags & HAMMER2_CHAIN_MOVED)
246 if (chain->flags & HAMMER2_CHAIN_MODIFIED)
248 KKASSERT(chain->refs > need);
/* 1->0 transition: delegate to lastdrop, which may return a chain to recurse on. */
257 chain = hammer2_chain_lastdrop(chain);
259 if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
261 /* retry the same chain */
267 * Safe handling of the 1->0 transition on chain. Returns a chain for
268 * recursive drop or NULL, possibly returning the same chain of the atomic
271 * The cst spinlock is allowed nest child-to-parent (not parent-to-child).
/*
 * NOTE(review): fragmentary block -- many original lines are missing
 * (gaps at 269-270, 272-274, 276, 282-283, 286, 293-294, 297-306,
 * 309, 315, 317-321, 323, 327-329, 331, 335, 340, 344-345, 347, 351,
 * 356-358, 361-362, 365, 369-372, 375-376, 381, 384-385, 387, 389-390,
 * 395, 397, 399-401, 403, 406, 410-411, 413, 415-418, 420-422, 424-425,
 * 429, 432+), so braces, early returns, else-arms and the final rdrop
 * selection are absent.  The lock-ordering logic is too intricate to
 * reconstruct safely; left byte-identical.
 */
275 hammer2_chain_lastdrop(hammer2_chain_t *chain)
277 hammer2_mount_t *hmp;
278 hammer2_chain_core_t *above;
279 hammer2_chain_core_t *core;
280 hammer2_chain_t *rdrop1;
281 hammer2_chain_t *rdrop2;
284 * Spinlock the core and check to see if it is empty. If it is
285 * not empty we leave chain intact with refs == 0.
287 if ((core = chain->core) != NULL) {
288 spin_lock(&core->cst.spin);
289 if (RB_ROOT(&core->rbtree)) {
290 if (atomic_cmpset_int(&chain->refs, 1, 0)) {
291 /* 1->0 transition successful */
292 spin_unlock(&core->cst.spin);
295 /* 1->0 transition failed, retry */
296 spin_unlock(&core->cst.spin);
307 * Spinlock the parent and try to drop the last ref. On success
308 * remove chain from its parent.
310 if ((above = chain->above) != NULL) {
311 spin_lock(&above->cst.spin);
312 if (!atomic_cmpset_int(&chain->refs, 1, 0)) {
313 /* 1->0 transition failed */
314 spin_unlock(&above->cst.spin);
316 spin_unlock(&core->cst.spin);
322 * 1->0 transition successful
324 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
325 RB_REMOVE(hammer2_chain_tree, &above->rbtree, chain);
326 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
330 * Calculate a chain to return for a recursive drop.
332 * XXX this needs help, we have a potential deep-recursion
333 * problem which we try to address but sometimes we wind up
334 * with two elements that have to be dropped.
336 * If the chain has an associated core with refs at 0
337 * the chain must be the first in the core's linked list
338 * by definition, and we will recursively drop the ref
339 * implied by the chain->next_parent field.
341 * Otherwise if the rbtree containing chain is empty we try
342 * to recursively drop our parent (only the first one could
343 * possibly have refs == 0 since the rest are linked via
346 * Otherwise we try to recursively drop a sibling.
348 if (chain->next_parent) {
349 KKASSERT(core != NULL);
350 rdrop1 = chain->next_parent;
352 if (RB_EMPTY(&above->rbtree)) {
353 rdrop2 = above->first_parent;
354 if (rdrop2 == NULL || rdrop2->refs ||
355 atomic_cmpset_int(&rdrop2->refs, 0, 1) == 0) {
359 rdrop2 = RB_ROOT(&above->rbtree);
360 if (atomic_cmpset_int(&rdrop2->refs, 0, 1) == 0)
363 spin_unlock(&above->cst.spin);
364 above = NULL; /* safety */
366 if (chain->next_parent) {
367 KKASSERT(core != NULL);
368 rdrop1 = chain->next_parent;
373 * We still have the core spinlock (if core is non-NULL). The
374 * above spinlock is gone.
377 KKASSERT(core->first_parent == chain);
378 if (chain->next_parent) {
379 /* parent should already be set */
380 KKASSERT(rdrop1 == chain->next_parent);
382 core->first_parent = chain->next_parent;
383 chain->next_parent = NULL;
386 if (atomic_fetchadd_int(&core->sharecnt, -1) == 1) {
388 * On the 1->0 transition of core we can destroy
391 spin_unlock(&core->cst.spin);
392 KKASSERT(core->cst.count == 0);
393 KKASSERT(core->cst.upgrade == 0);
394 kfree(core, hmp->mchain);
396 spin_unlock(&core->cst.spin);
398 core = NULL; /* safety */
402 * All spin locks are gone, finish freeing stuff.
404 KKASSERT((chain->flags & (HAMMER2_CHAIN_MOVED |
405 HAMMER2_CHAIN_MODIFIED)) == 0);
407 switch(chain->bref.type) {
/* Volume/freemap roots are embedded in the mount; never freed here. */
408 case HAMMER2_BREF_TYPE_VOLUME:
409 case HAMMER2_BREF_TYPE_FREEMAP:
/* Inode data is a separate embedded-copy allocation; release it. */
412 case HAMMER2_BREF_TYPE_INODE:
414 kfree(chain->data, hmp->minode);
419 KKASSERT(chain->data == NULL);
423 KKASSERT(chain->bp == NULL);
426 if (chain->flags & HAMMER2_CHAIN_ALLOCATED) {
427 chain->flags &= ~HAMMER2_CHAIN_ALLOCATED;
428 kfree(chain, hmp->mchain);
430 if (rdrop1 && rdrop2) {
431 hammer2_chain_drop(rdrop1);
440 * Ref and lock a chain element, acquiring its data with I/O if necessary,
441 * and specify how you would like the data to be resolved.
443 * Returns 0 on success or an error code if the data could not be acquired.
444 * The chain element is locked on return regardless of whether an error
447 * The lock is allowed to recurse, multiple locking ops will aggregate
448 * the requested resolve types. Once data is assigned it will not be
449 * removed until the last unlock.
451 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
452 * (typically used to avoid device/logical buffer
455 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
456 * the INITIAL-create state (indirect blocks only).
458 * Do not resolve data elements for DATA chains.
459 * (typically used to avoid device/logical buffer
462 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
464 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
465 * it will be locked exclusive.
467 * NOTE: Embedded elements (volume header, inodes) are always resolved
470 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
471 * element will instantiate and zero its buffer, and flush it on
474 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
475 * so as not to instantiate a device buffer, which could alias against
476 * a logical file buffer. However, if ALWAYS is specified the
477 * device buffer will be instantiated anyway.
479 * WARNING! If data must be fetched a shared lock will temporarily be
480 * upgraded to exclusive. However, a deadlock can occur if
481 * the caller owns more than one shared lock.
/*
 * NOTE(review): fragmentary block -- large line-number gaps (482-483,
 * 485, 489-497, 499, 503-504, 506-507, 509-510, 513, 515-516, 518-523,
 * 525, 528, 531, 533-534, 536-537, 539-540, 542-545, 548-549, 551,
 * 553-556, 561, 565, 567-570, 577-578, 581, 585, 589-590, 592-594,
 * 597-598, 600-603, 607, 613-615, 618-619, 623, 627, 629, 631-632,
 * 637-640, 642, 644-645, 650-652, 657-658, 660, 662-665, 668-670,
 * 673+), so local declarations, braces, return statements, break
 * statements and goto labels are absent.  Left byte-identical.
 */
484 hammer2_chain_lock(hammer2_chain_t *chain, int how)
486 hammer2_mount_t *hmp;
487 hammer2_chain_core_t *core;
488 hammer2_blockref_t *bref;
498 * Ref and lock the element. Recursive locks are allowed.
500 if ((how & HAMMER2_RESOLVE_NOREF) == 0)
501 hammer2_chain_ref(chain);
502 atomic_add_int(&chain->lockcnt, 1);
505 KKASSERT(hmp != NULL);
508 * Get the appropriate lock.
511 if (how & HAMMER2_RESOLVE_SHARED)
512 ccms_thread_lock(&core->cst, CCMS_STATE_SHARED);
514 ccms_thread_lock(&core->cst, CCMS_STATE_EXCLUSIVE);
517 * If we already have a valid data pointer no further action is
524 * Do we have to resolve the data?
526 switch(how & HAMMER2_RESOLVE_MASK) {
527 case HAMMER2_RESOLVE_NEVER:
529 case HAMMER2_RESOLVE_MAYBE:
530 if (chain->flags & HAMMER2_CHAIN_INITIAL)
532 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
535 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
538 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
541 case HAMMER2_RESOLVE_ALWAYS:
546 * Upgrade to an exclusive lock so we can safely manipulate the
547 * buffer cache. If another thread got to it before us we
550 ostate = ccms_thread_lock_upgrade(&core->cst);
552 ccms_thread_lock_downgrade(&core->cst, ostate);
557 * We must resolve to a device buffer, either by issuing I/O or
558 * by creating a zero-fill element. We do not mark the buffer
559 * dirty when creating a zero-fill element (the hammer2_chain_modify()
560 * API must still be used to do that).
562 * The device buffer is variable-sized in powers of 2 down
563 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage
564 * chunk always contains buffers of the same size. (XXX)
566 * The minimum physical IO size may be larger than the variable
/* Compute physical block base (pbase), cluster EOF, and intra-buffer offset. */
571 if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
572 bbytes = HAMMER2_MINIOSIZE;
573 pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
574 peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
575 boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
576 KKASSERT(pbase != 0);
579 * The getblk() optimization can only be used on newly created
580 * elements if the physical block size matches the request.
582 if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
583 chain->bytes == bbytes) {
584 chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
586 } else if (hammer2_cluster_enable) {
587 error = cluster_read(hmp->devvp, peof, pbase, bbytes,
588 HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
591 error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
595 kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
596 (intmax_t)pbase, error);
599 ccms_thread_lock_downgrade(&core->cst, ostate);
604 * Zero the data area if the chain is in the INITIAL-create state.
605 * Mark the buffer for bdwrite(). This clears the INITIAL state
606 * but does not mark the chain modified.
608 bdata = (char *)chain->bp->b_data + boff;
609 if (chain->flags & HAMMER2_CHAIN_INITIAL) {
610 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
611 bzero(bdata, chain->bytes);
612 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
616 * Setup the data pointer, either pointing it to an embedded data
617 * structure and copying the data from the buffer, or pointing it
620 * The buffer is not retained when copying to an embedded data
621 * structure in order to avoid potential deadlocks or recursions
622 * on the same physical buffer.
624 switch (bref->type) {
625 case HAMMER2_BREF_TYPE_VOLUME:
626 case HAMMER2_BREF_TYPE_FREEMAP:
628 * Copy data from bp to embedded buffer
630 panic("hammer2_chain_lock: called on unresolved volume header");
633 KKASSERT(pbase == 0);
634 KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
635 bcopy(bdata, &hmp->voldata, chain->bytes);
636 chain->data = (void *)&hmp->voldata;
641 case HAMMER2_BREF_TYPE_INODE:
643 * Copy data from bp to embedded buffer, do not retain the
646 KKASSERT(chain->bytes == sizeof(chain->data->ipdata));
647 chain->data = kmalloc(sizeof(chain->data->ipdata),
648 hmp->minode, M_WAITOK | M_ZERO);
649 bcopy(bdata, &chain->data->ipdata, chain->bytes);
653 case HAMMER2_BREF_TYPE_INDIRECT:
654 case HAMMER2_BREF_TYPE_DATA:
655 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
656 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
659 * Point data at the device buffer and leave bp intact.
661 chain->data = (void *)bdata;
666 * Make sure the bp is not specifically owned by this thread before
667 * restoring to a possibly shared lock, so another hammer2 thread
671 BUF_KERNPROC(chain->bp);
672 ccms_thread_lock_downgrade(&core->cst, ostate);
677 * Unlock and deref a chain element.
679 * On the last lock release any non-embedded data (chain->bp) will be
/*
 * NOTE(review): fragmentary block -- line-number gaps (678, 680-682,
 * 684, 686-690, 693, 696, 698-699, 702-703, 708-710, 712-717, 723,
 * 728, 733-736, 738, 740-741, 744, 749-754, 756, 761, 764, 767, 771-772,
 * 774-777, 781, 784, 787, 791-792, 794-801, 805, 807-808, 811-812,
 * 817-818, 826, 828-830, 835-836, 838-841, 844+), so local declarations,
 * braces, returns, break statements, bdwrite()/brelse() calls and the
 * counter accumulation are absent.  Left byte-identical.
 */
683 hammer2_chain_unlock(hammer2_chain_t *chain)
685 hammer2_chain_core_t *core = chain->core;
691 * The core->cst lock can be shared across several chains so we
692 * need to track the per-chain lockcnt separately.
694 * If multiple locks are present (or being attempted) on this
695 * particular chain we can just unlock, drop refs, and return.
697 * Otherwise fall-through on the 1->0 transition.
700 lockcnt = chain->lockcnt;
701 KKASSERT(lockcnt > 0);
704 if (atomic_cmpset_int(&chain->lockcnt,
705 lockcnt, lockcnt - 1)) {
706 ccms_thread_unlock(&core->cst);
707 hammer2_chain_drop(chain);
711 if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
718 * On the 1->0 transition we upgrade the core lock (if necessary)
719 * to exclusive for terminal processing. If after upgrading we find
720 * that lockcnt is non-zero, another thread is racing us and will
721 * handle the unload for us later on, so just cleanup and return
722 * leaving the data/bp intact
724 * Otherwise if lockcnt is still 0 it is possible for it to become
725 * non-zero and race, but since we hold the core->cst lock
726 * exclusively all that will happen is that the chain will be
727 * reloaded after we unload it.
729 ostate = ccms_thread_lock_upgrade(&core->cst);
730 if (chain->lockcnt) {
731 ccms_thread_unlock_upgraded(&core->cst, ostate);
732 hammer2_chain_drop(chain);
737 * Shortcut the case if the data is embedded or not resolved.
739 * Do NOT NULL out chain->data (e.g. inode data), it might be
742 * The DIRTYBP flag is non-applicable in this situation and can
743 * be cleared to keep the flags state clean.
745 if (chain->bp == NULL) {
746 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
747 ccms_thread_unlock_upgraded(&core->cst, ostate);
748 hammer2_chain_drop(chain);
/* Statistics: pick the I/O counter by clean/async-flush/dirty and bref type. */
755 if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
757 } else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
758 switch(chain->bref.type) {
759 case HAMMER2_BREF_TYPE_DATA:
760 counterp = &hammer2_ioa_file_write;
762 case HAMMER2_BREF_TYPE_INODE:
763 counterp = &hammer2_ioa_meta_write;
765 case HAMMER2_BREF_TYPE_INDIRECT:
766 counterp = &hammer2_ioa_indr_write;
768 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
769 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
770 counterp = &hammer2_ioa_fmap_write;
773 counterp = &hammer2_ioa_volu_write;
778 switch(chain->bref.type) {
779 case HAMMER2_BREF_TYPE_DATA:
780 counterp = &hammer2_iod_file_write;
782 case HAMMER2_BREF_TYPE_INODE:
783 counterp = &hammer2_iod_meta_write;
785 case HAMMER2_BREF_TYPE_INDIRECT:
786 counterp = &hammer2_iod_indr_write;
788 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
789 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
790 counterp = &hammer2_iod_fmap_write;
793 counterp = &hammer2_iod_volu_write;
802 * If a device buffer was used for data be sure to destroy the
803 * buffer when we are done to avoid aliases (XXX what about the
804 * underlying VM pages?).
806 * NOTE: Freemap leaf's use reserved blocks and thus no aliasing
809 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
810 chain->bp->b_flags |= B_RELBUF;
813 * The DIRTYBP flag tracks whether we have to bdwrite() the buffer
814 * or not. The flag will get re-set when chain_modify() is called,
815 * even if MODIFIED is already set, allowing the OS to retire the
816 * buffer independent of a hammer2 flush.
819 if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
820 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
821 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
822 atomic_clear_int(&chain->flags,
823 HAMMER2_CHAIN_IOFLUSH);
824 chain->bp->b_flags |= B_RELBUF;
825 cluster_awrite(chain->bp);
827 chain->bp->b_flags |= B_CLUSTEROK;
831 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
832 atomic_clear_int(&chain->flags,
833 HAMMER2_CHAIN_IOFLUSH);
834 chain->bp->b_flags |= B_RELBUF;
837 /* bp might still be dirty */
842 ccms_thread_unlock_upgraded(&core->cst, ostate);
843 hammer2_chain_drop(chain);
847 * Resize the chain's physical storage allocation in-place. This may
848 * replace the passed-in chain with a new chain.
850 * Chains can be resized smaller without reallocating the storage.
851 * Resizing larger will reallocate the storage.
853 * Must be passed an exclusively locked parent and chain, returns a new
854 * exclusively locked chain at the same index and unlocks the old chain.
855 * Flushes the buffer if necessary.
857 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
858 * to avoid instantiating a device buffer that conflicts with the vnode
859 * data buffer. That is, the passed-in bp is a logical buffer, whereas
860 * any chain-oriented bp would be a device buffer.
862 * XXX flags currently ignored, uses chain->bp to detect data/no-data.
863 * XXX return error if cannot resize.
/*
 * NOTE(review): fragmentary block -- line-number gaps (864-865, 867,
 * 870, 873-879, 882, 886-887, 889, 893-895, 899, 902, 905-906, 908-909,
 * 914, 918, 922-924, 927, 931-932, 934, 939-940, 942-943, 945-946, 949,
 * 953, 955+), so local declarations, braces, the early return for a
 * size match, and the function tail are absent.  Left byte-identical.
 */
866 hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
868 hammer2_chain_t *parent, hammer2_chain_t **chainp,
869 int nradix, int flags)
871 hammer2_mount_t *hmp = trans->hmp;
872 hammer2_chain_t *chain = *chainp;
880 * Only data and indirect blocks can be resized for now.
881 * (The volu root, inodes, and freemap elements use a fixed size).
883 KKASSERT(chain != &hmp->vchain);
884 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
885 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
888 * Nothing to do if the element is already the proper size
890 obytes = chain->bytes;
/* New size is expressed as a power-of-two radix. */
891 nbytes = 1U << nradix;
892 if (obytes == nbytes)
896 * Delete the old chain and duplicate it at the same (parent, index),
897 * returning a new chain. This allows the old chain to still be
898 * used by the flush code. Duplication occurs in-place.
900 * The parent does not have to be locked for the delete/duplicate call,
901 * but is in this particular code path.
903 * NOTE: If we are not crossing a synchronization point the
904 * duplication code will simply reuse the existing chain
907 hammer2_chain_delete_duplicate(trans, &chain, 0);
910 * Set MODIFIED and add a chain ref to prevent destruction. Both
911 * modified flags share the same ref. (duplicated chains do not
912 * start out MODIFIED unless possibly if the duplication code
913 * decided to reuse the existing chain as-is).
915 * If the chain is already marked MODIFIED then we can safely
916 * return the previous allocation to the pool without having to
917 * worry about snapshots. XXX check flush synchronization.
919 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
920 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
921 hammer2_chain_ref(chain);
925 * Relocate the block, even if making it smaller (because different
926 * block sizes may be in different regions).
928 hammer2_freemap_alloc(trans, &chain->bref, nbytes);
929 chain->bytes = nbytes;
930 /*ip->delta_dcount += (ssize_t)(nbytes - obytes);*/ /* XXX atomic */
933 * The device buffer may be larger than the allocation size.
935 if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
936 bbytes = HAMMER2_MINIOSIZE;
937 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
938 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
941 * For now just support it on DATA chains (and not on indirect
944 KKASSERT(chain->bp == NULL);
947 * Make sure the chain is marked MOVED and SUBMOD is set in the
948 * parent(s) so the adjustments are picked up by flush.
950 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
951 hammer2_chain_ref(chain);
952 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
954 hammer2_chain_setsubmod(trans, chain);
959 * Set a chain modified, making it read-write and duplicating it if necessary.
960 * This function will assign a new physical block to the chain if necessary
962 * Duplication of already-modified chains is possible when the modification
963 * crosses a flush synchronization boundary.
965 * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
966 * level or the COW operation will not work.
968 * Data blocks - The chain is usually locked RESOLVE_NEVER so as not to
969 * run the data through the device buffers.
971 * This function may return a different chain than was passed, in which case
972 * the old chain will be unlocked and the new chain will be locked.
974 * ip->chain may be adjusted by hammer2_chain_modify_ip().
976 hammer2_inode_data_t *
977 hammer2_chain_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
978 hammer2_chain_t **chainp, int flags)
980 atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
981 hammer2_chain_modify(trans, chainp, flags);
982 if (ip->chain != *chainp)
983 hammer2_inode_repoint(ip, NULL, *chainp);
984 return(&ip->chain->data->ipdata);
/*
 * hammer2_chain_modify() -- make a chain writable (COW), allocating new
 * storage and possibly delete-duplicating the chain when a modification
 * crosses a flush synchronization boundary (see header comment above
 * hammer2_chain_modify_ip()).
 *
 * NOTE(review): fragmentary block -- line-number gaps (989-990, 993,
 * 995-1002, 1005-1006, 1011-1013, 1016, 1021-1023, 1028, 1030, 1036,
 * 1041, 1043-1047, 1050, 1053, 1056, 1058-1060, 1063, 1065-1067, 1072-
 * 1075, 1079, 1082-1083, 1086, 1090-1092, 1095, 1097-1098, 1100-1101,
 * 1103-1104, 1109, 1112-1114, 1118, 1120-1122, 1125, 1129, 1132-1137,
 * 1140, 1142-1143, 1145, 1150, 1152-1153, 1155, 1160, 1162, 1164-1165,
 * 1167, 1172-1173, 1175, 1178, 1180, 1183-1184, 1187, 1189-1190, 1195-
 * 1196, 1200, 1203-1204, 1207, 1209-1210, 1212, 1214-1215, 1217-1221,
 * 1224-1225, 1227-1231), so local declarations, braces, returns, break
 * statements, goto labels (skipxx etc.) and several statements are
 * absent.  The COW/flush-boundary logic is too intricate to reconstruct
 * safely; left byte-identical.
 */
988 hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
991 hammer2_mount_t *hmp = trans->hmp;
992 hammer2_chain_t *chain;
994 hammer2_tid_t flush_tid;
1003 * Data must be resolved if already assigned unless explicitly
1004 * flagged otherwise.
1007 if (chain->data == NULL && (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
1008 (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
1009 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1010 hammer2_chain_unlock(chain);
1014 * data is not optional for freemap chains (we must always be sure
1015 * to copy the data on COW storage allocations).
1017 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
1018 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
1019 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
1020 (flags & HAMMER2_MODIFY_OPTDATA) == 0);
1024 * If the chain is already marked MODIFIED we can usually just
1025 * return. However, if a modified chain is modified again in
1026 * a synchronization-point-crossing manner we have to issue a
1027 * delete/duplicate on the chain to avoid flush interference.
1029 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
1031 * Which flush_tid do we need to check? If the chain is
1032 * related to the freemap we have to use the freemap flush
1033 * tid (free_flush_tid), otherwise we use the normal filesystem
1034 * flush tid (topo_flush_tid). The two flush domains are
1035 * almost completely independent of each other.
1037 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
1038 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
1039 flush_tid = hmp->topo_flush_tid; /* XXX */
1040 goto skipxx; /* XXX */
1042 flush_tid = hmp->topo_flush_tid;
1048 if (chain->modify_tid <= flush_tid &&
1049 trans->sync_tid > flush_tid) {
1051 * Modifications cross synchronization point,
1052 * requires delete-duplicate.
1054 KKASSERT((flags & HAMMER2_MODIFY_ASSERTNOCOPY) == 0);
1055 hammer2_chain_delete_duplicate(trans, chainp, 0);
1057 /* fall through using duplicate */
1061 * Quick return path, set DIRTYBP to ensure that
1062 * the later retirement of bp will write it out.
1064 * quick return path also needs the modify_tid
1068 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
1069 if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
1070 chain->bref.modify_tid = trans->sync_tid;
1071 chain->modify_tid = trans->sync_tid;
1076 * modify_tid is only update for primary modifications, not for
1077 * propagated brefs. mirror_tid will be updated regardless during
1078 * the flush, no need to set it here.
1080 if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
1081 chain->bref.modify_tid = trans->sync_tid;
1084 * Set MODIFIED and add a chain ref to prevent destruction. Both
1085 * modified flags share the same ref.
1087 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
1088 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
1089 hammer2_chain_ref(chain);
1093 * Adjust chain->modify_tid so the flusher knows when the
1094 * modification occurred.
1096 chain->modify_tid = trans->sync_tid;
1099 * The modification or re-modification requires an allocation and
1102 * We normally always allocate new storage here. If storage exists
1103 * and MODIFY_NOREALLOC is passed in, we do not allocate new storage.
1105 if (chain != &hmp->vchain &&
1106 chain != &hmp->fchain &&
1107 ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
1108 (flags & HAMMER2_MODIFY_NOREALLOC) == 0)
1110 hammer2_freemap_alloc(trans, &chain->bref, chain->bytes);
1111 /* XXX failed allocation */
1115 * Do not COW if OPTDATA is set. INITIAL flag remains unchanged.
1116 * (OPTDATA does not prevent [re]allocation of storage, only the
1117 * related copy-on-write op).
1119 if (flags & HAMMER2_MODIFY_OPTDATA)
1123 * Clearing the INITIAL flag (for indirect blocks) indicates that
1124 * we've processed the uninitialized storage allocation.
1126 * If this flag is already clear we are likely in a copy-on-write
1127 * situation but we have to be sure NOT to bzero the storage if
1128 * no data is present.
1130 if (chain->flags & HAMMER2_CHAIN_INITIAL) {
1131 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1138 * We currently should never instantiate a device buffer for a
1139 * file data chain. (We definitely can for a freemap chain).
1141 KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
1144 * Instantiate data buffer and possibly execute COW operation
1146 switch(chain->bref.type) {
1147 case HAMMER2_BREF_TYPE_VOLUME:
1148 case HAMMER2_BREF_TYPE_FREEMAP:
1149 case HAMMER2_BREF_TYPE_INODE:
1151 * The data is embedded, no copy-on-write operation is
1154 KKASSERT(chain->bp == NULL);
1156 case HAMMER2_BREF_TYPE_DATA:
1157 case HAMMER2_BREF_TYPE_INDIRECT:
1158 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1159 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1161 * Perform the copy-on-write operation
1163 KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);
1166 * The device buffer may be larger than the allocation size.
1168 if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
1169 bbytes = HAMMER2_MINIOSIZE;
1170 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
1171 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
1174 * Buffer aliasing is possible, check for the case.
1176 * The getblk() optimization can only be used if the
1177 * physical block size matches the request.
1179 if (chain->bp && chain->bp->b_loffset == pbase) {
1181 } else if (chain->bytes == bbytes) {
1182 nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
1185 error = bread(hmp->devvp, pbase, bbytes, &nbp);
1186 KKASSERT(error == 0);
1188 bdata = (char *)nbp->b_data + boff;
1191 * Copy or zero-fill on write depending on whether
1192 * chain->data exists or not. Retire the existing bp
1193 * based on the DIRTYBP flag. Set the DIRTYBP flag to
1194 * indicate that retirement of nbp should use bdwrite().
1197 KKASSERT(chain->bp != NULL);
1198 if (chain->data != bdata) {
1199 bcopy(chain->data, bdata, chain->bytes);
1201 } else if (wasinitial) {
1202 bzero(bdata, chain->bytes);
1205 * We have a problem. We were asked to COW but
1206 * we don't have any data to COW with!
1208 panic("hammer2_chain_modify: having a COW %p\n",
1211 if (chain->bp != nbp) {
1213 if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
1216 chain->bp->b_flags |= B_RELBUF;
1222 chain->data = bdata;
1223 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
1226 panic("hammer2_chain_modify: illegal non-embedded type %d",
1232 hammer2_chain_setsubmod(trans, chain);
1236 * Mark the volume as having been modified. This short-cut version
1237 * does not have to lock the volume's chain, which allows the ioctl
1238 * code to make adjustments to connections without deadlocking. XXX
1240 * No ref is made on vchain when flagging it MODIFIED.
1243 hammer2_modify_volume(hammer2_mount_t *hmp)
/*
 * Acquire and immediately release the volume-data lock.
 * NOTE(review): presumably the '1' argument to hammer2_voldata_unlock()
 * is what flags the volume data as modified -- confirm against that
 * function's definition.
 */
1245 hammer2_voldata_lock(hmp);
1246 hammer2_voldata_unlock(hmp, 1);
1250 * Locate an in-memory chain. The parent must be locked. The in-memory
1251 * chain is returned with a reference and without a lock, or NULL
1254 * This function returns the chain at the specified index with the highest
1255 * delete_tid. The caller must check whether the chain is flagged
1256 * CHAIN_DELETED or not. However, because chain iterations can be removed
1257 * from memory we must ALSO check that DELETED chains are not flushed. A
1258 * DELETED chain which has been flushed must be ignored (the caller must
1259 * check the parent's blockref array).
1261 * NOTE: If no chain is found the caller usually must check the on-media
1262 * array to determine if a blockref exists at the index.
/*
 * Scan state shared by hammer2_chain_find_cmp() and
 * hammer2_chain_find_callback(): 'best' accumulates the winning chain,
 * 'delete_tid' the highest delete_tid seen so far.
 * NOTE(review): the cmp/callback helpers below also dereference
 * info->index, so this structure carries an index field not visible in
 * this fragment.
 */
1264 struct hammer2_chain_find_info {
1265 hammer2_chain_t *best;
1266 hammer2_tid_t delete_tid;
/*
 * RB_SCAN comparison helper: restricts the scan to children whose
 * index equals the index recorded in the hammer2_chain_find_info.
 * NOTE(review): the return statements for the <, >, and == cases are
 * not visible in this fragment.
 */
1272 hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
1274 struct hammer2_chain_find_info *info = data;
1276 if (child->index < info->index)
1278 if (child->index > info->index)
/*
 * RB_SCAN callback: among the children accepted by the comparator,
 * remember the highest delete_tid encountered.  NOTE(review): the
 * assignment of info->best presumably occurs inside this if-block on a
 * line not visible in this fragment -- verify against the full file.
 */
1285 hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
1287 struct hammer2_chain_find_info *info = data;
1289 if (info->delete_tid < child->delete_tid) {
1290 info->delete_tid = child->delete_tid;
/*
 * Core of the in-memory chain lookup: scan the parent's RB tree for
 * children at the given index, selecting the one with the highest
 * delete_tid (see struct hammer2_chain_find_info above).  Caller must
 * hold the parent's core spinlock.
 */
1298 hammer2_chain_find_locked(hammer2_chain_t *parent, int index)
1300 struct hammer2_chain_find_info info;
1301 hammer2_chain_t *child;
/* delete_tid starts at 0 so any child at the index beats it */
1304 info.delete_tid = 0;
1307 RB_SCAN(hammer2_chain_tree, &parent->core->rbtree,
1308 hammer2_chain_find_cmp, hammer2_chain_find_callback,
/*
 * Locking wrapper around hammer2_chain_find_locked(): acquires the
 * core spinlock for the scan and takes a ref on the result (if any)
 * before releasing the spinlock, so the returned chain cannot be
 * destroyed out from under the caller.
 */
1316 hammer2_chain_find(hammer2_chain_t *parent, int index)
1318 hammer2_chain_t *child;
1320 spin_lock(&parent->core->cst.spin);
1321 child = hammer2_chain_find_locked(parent, index);
/* ref while still under the spinlock to prevent a destruction race */
1323 hammer2_chain_ref(child);
1324 spin_unlock(&parent->core->cst.spin);
1330 * Return a locked chain structure with all associated data acquired.
1331 * (if LOOKUP_NOLOCK is requested the returned chain is only referenced).
1333 * Caller must hold the parent locked shared or exclusive since we may
1334 * need the parent's bref array to find our block.
1336 * The returned child is locked as requested. If NOLOCK, the returned
1337 * child is still at least referenced.
/*
 * Return the chain at (parent, index), either from the in-memory RB
 * tree cache or by allocating a fresh chain from the parent's on-media
 * blockref array.  Lock/ref behavior is governed by the LOOKUP_* flags
 * (see the comment block preceding this function).
 */
1340 hammer2_chain_get(hammer2_chain_t *parent, int index, int flags)
1342 hammer2_blockref_t *bref;
1343 hammer2_mount_t *hmp = parent->hmp;
1344 hammer2_chain_core_t *above = parent->core;
1345 hammer2_chain_t *chain;
1346 hammer2_chain_t dummy;
1350 * Figure out how to lock. MAYBE can be used to optimize
1351 * the initial-create state for indirect blocks.
1353 if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
1354 how = HAMMER2_RESOLVE_NEVER;
1356 how = HAMMER2_RESOLVE_MAYBE;
1357 if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
1358 how |= HAMMER2_RESOLVE_SHARED;
1362 * First see if we have a (possibly modified) chain element cached
1363 * for this (parent, index). Acquire the data if necessary.
1365 * If chain->data is non-NULL the chain should already be marked
/*
 * The dummy key uses MAX_TID so RB_FIND lands on the live (highest
 * delete_tid) element at this index.
 */
1369 dummy.index = index;
1370 dummy.delete_tid = HAMMER2_MAX_TID;
1371 spin_lock(&above->cst.spin);
1372 chain = RB_FIND(hammer2_chain_tree, &above->rbtree, &dummy);
1374 hammer2_chain_ref(chain);
1375 spin_unlock(&above->cst.spin);
1376 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1377 hammer2_chain_lock(chain, how | HAMMER2_RESOLVE_NOREF);
1380 spin_unlock(&above->cst.spin);
1383 * The parent chain must not be in the INITIAL state.
1385 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1386 panic("hammer2_chain_get: Missing bref(1)");
1391 * No RBTREE entry found, lookup the bref and issue I/O (switch on
1392 * the parent's bref to determine where and how big the array is).
1394 switch(parent->bref.type) {
1395 case HAMMER2_BREF_TYPE_INODE:
1396 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1397 bref = &parent->data->ipdata.u.blockset.blockref[index];
1399 case HAMMER2_BREF_TYPE_INDIRECT:
1400 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1401 KKASSERT(parent->data != NULL);
1402 KKASSERT(index >= 0 &&
1403 index < parent->bytes / sizeof(hammer2_blockref_t));
1404 bref = &parent->data->npdata.blockref[index];
1406 case HAMMER2_BREF_TYPE_VOLUME:
1407 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1408 bref = &hmp->voldata.sroot_blockset.blockref[index];
1410 case HAMMER2_BREF_TYPE_FREEMAP:
1411 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1412 bref = &hmp->voldata.freemap_blockset.blockref[index];
1416 panic("hammer2_chain_get: unrecognized blockref type: %d",
1419 if (bref->type == 0) {
1420 panic("hammer2_chain_get: Missing bref(2)");
1425 * Allocate a chain structure representing the existing media
1426 * entry. Resulting chain has one ref and is not locked.
1428 * The locking operation we do later will issue I/O to read it.
1430 chain = hammer2_chain_alloc(hmp, NULL, bref);
1431 hammer2_chain_core_alloc(chain, NULL); /* ref'd chain returned */
1434 * Link the chain into its parent. A spinlock is required to safely
1435 * access the RBTREE, and it is possible to collide with another
1436 * hammer2_chain_get() operation because the caller might only hold
1437 * a shared lock on the parent.
1439 KKASSERT(parent->refs > 0);
1440 spin_lock(&above->cst.spin);
1441 chain->above = above;
1442 chain->index = index;
/* insertion collision: another thread won the race; drop ours */
1443 if (RB_INSERT(hammer2_chain_tree, &above->rbtree, chain)) {
1444 chain->above = NULL;
1446 spin_unlock(&above->cst.spin);
1447 hammer2_chain_drop(chain);
1450 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
1451 spin_unlock(&above->cst.spin);
1454 * Our new chain is referenced but NOT locked. Lock the chain
1455 * below. The locking operation also resolves its data.
1457 * If NOLOCK is set the release will release the one-and-only lock.
1459 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1460 hammer2_chain_lock(chain, how); /* recursive lock */
1461 hammer2_chain_drop(chain); /* excess ref */
1467 * Lookup initialization/completion API
/*
 * Lock the parent for a lookup sequence.  SHARED requests a shared
 * lock; either way data resolution is forced with RESOLVE_ALWAYS.
 */
1470 hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
1472 if (flags & HAMMER2_LOOKUP_SHARED) {
1473 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
1474 HAMMER2_RESOLVE_SHARED);
1476 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
/*
 * Terminate a lookup sequence begun with hammer2_chain_lookup_init(),
 * releasing the parent's lock.
 */
1482 hammer2_chain_lookup_done(hammer2_chain_t *parent)
1485 hammer2_chain_unlock(parent);
/*
 * Replace (*parentp) with its parent: locate a stable parent via the
 * core's first_parent list (skipping entries still being refactored),
 * ref it under the spinlock, then swap locks from the old parent to
 * the new one.  NOTE(review): the assignment of oparent from *parentp
 * presumably occurs on a line not visible in this fragment.
 */
1490 hammer2_chain_getparent(hammer2_chain_t **parentp, int how)
1492 hammer2_chain_t *oparent;
1493 hammer2_chain_t *nparent;
1494 hammer2_chain_core_t *above;
1497 above = oparent->above;
1499 spin_lock(&above->cst.spin);
1500 nparent = above->first_parent;
/* skip parents currently undergoing refactoring */
1501 while (hammer2_chain_refactor_test(nparent, 1))
1502 nparent = nparent->next_parent;
1503 hammer2_chain_ref(nparent); /* protect nparent, use in lock */
1504 spin_unlock(&above->cst.spin);
1506 hammer2_chain_unlock(oparent);
1507 hammer2_chain_lock(nparent, how | HAMMER2_RESOLVE_NOREF);
1514 * Locate any key between key_beg and key_end inclusive. (*parentp)
1515 * typically points to an inode but can also point to a related indirect
1516 * block and this function will recurse upwards and find the inode again.
1518 * WARNING! THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER! ANY KEY
1519 * WITHIN THE RANGE CAN BE RETURNED. HOWEVER, AN ITERATION
1520 * WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN
1521 * AND ALL IN-RANGE KEYS WILL EVENTUALLY BE RETURNED (NOT
1522 * NECESSARILY IN ORDER).
1524 * (*parentp) must be exclusively locked and referenced and can be an inode
1525 * or an existing indirect block within the inode.
1527 * On return (*parentp) will be modified to point at the deepest parent chain
1528 * element encountered during the search, as a helper for an insertion or
1529 * deletion. The new (*parentp) will be locked and referenced and the old
1530 * will be unlocked and dereferenced (no change if they are both the same).
1532 * The matching chain will be returned exclusively locked. If NOLOCK is
1533 * requested the chain will be returned only referenced.
1535 * NULL is returned if no match was found, but (*parentp) will still
1536 * potentially be adjusted.
1538 * This function will also recurse up the chain if the key is not within the
1539 * current parent's range. (*parentp) can never be set to NULL. An iteration
1540 * can simply allow (*parentp) to float inside the loop.
1542 * NOTE! chain->data is not always resolved. By default it will not be
1543 * resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF. Use
1544 * HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/
1545 * BREF_TYPE_DATA as the device buffer can alias the logical file
/*
 * Find any chain overlapping [key_beg, key_end] under (*parentp),
 * recursing up to an enclosing parent and down through indirect
 * blocks as needed.  See the preceding comment block for the full
 * contract (ordering caveats, (*parentp) adjustment, NOLOCK).
 */
1549 hammer2_chain_lookup(hammer2_chain_t **parentp,
1550 hammer2_key_t key_beg, hammer2_key_t key_end,
1553 hammer2_mount_t *hmp;
1554 hammer2_chain_t *parent;
1555 hammer2_chain_t *chain;
1556 hammer2_chain_t *tmp;
1557 hammer2_blockref_t *base;
1558 hammer2_blockref_t *bref;
1559 hammer2_key_t scan_beg;
1560 hammer2_key_t scan_end;
1563 int how_always = HAMMER2_RESOLVE_ALWAYS;
1564 int how_maybe = HAMMER2_RESOLVE_MAYBE;
/* LOOKUP_ALWAYS upgrades all data resolution to ALWAYS */
1566 if (flags & HAMMER2_LOOKUP_ALWAYS)
1567 how_maybe = how_always;
1569 if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) {
1570 how_maybe |= HAMMER2_RESOLVE_SHARED;
1571 how_always |= HAMMER2_RESOLVE_SHARED;
1575 * Recurse (*parentp) upward if necessary until the parent completely
1576 * encloses the key range or we hit the inode.
1581 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
1582 parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1583 scan_beg = parent->bref.key;
1584 scan_end = scan_beg +
1585 ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1586 if (key_beg >= scan_beg && key_end <= scan_end)
1588 parent = hammer2_chain_getparent(parentp, how_maybe);
1593 * Locate the blockref array. Currently we do a fully associative
1594 * search through the array.
1596 switch(parent->bref.type) {
1597 case HAMMER2_BREF_TYPE_INODE:
1599 * Special shortcut for embedded data returns the inode
1600 * itself. Callers must detect this condition and access
1601 * the embedded data (the strategy code does this for us).
1603 * This is only applicable to regular files and softlinks.
1605 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1606 if (flags & HAMMER2_LOOKUP_NOLOCK)
1607 hammer2_chain_ref(parent);
1609 hammer2_chain_lock(parent, how_always);
1612 base = &parent->data->ipdata.u.blockset.blockref[0];
1613 count = HAMMER2_SET_COUNT;
1615 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1616 case HAMMER2_BREF_TYPE_INDIRECT:
1618 * Handle MATCHIND on the parent
1620 if (flags & HAMMER2_LOOKUP_MATCHIND) {
1621 scan_beg = parent->bref.key;
1622 scan_end = scan_beg +
1623 ((hammer2_key_t)1 << parent->bref.keybits) - 1;
/* exact range match returns the indirect block itself */
1624 if (key_beg == scan_beg && key_end == scan_end) {
1626 hammer2_chain_lock(chain, how_maybe);
1631 * Optimize indirect blocks in the INITIAL state to avoid
1634 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1637 if (parent->data == NULL)
1638 panic("parent->data is NULL");
1639 base = &parent->data->npdata.blockref[0];
1641 count = parent->bytes / sizeof(hammer2_blockref_t);
1643 case HAMMER2_BREF_TYPE_VOLUME:
1644 base = &hmp->voldata.sroot_blockset.blockref[0];
1645 count = HAMMER2_SET_COUNT;
1647 case HAMMER2_BREF_TYPE_FREEMAP:
1648 base = &hmp->voldata.freemap_blockset.blockref[0];
1649 count = HAMMER2_SET_COUNT;
1652 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
1654 base = NULL; /* safety */
1655 count = 0; /* safety */
1659 * If the element and key overlap we use the element.
1661 * NOTE! Deleted elements are effectively invisible. Deletions
1662 * proactively clear the parent bref to the deleted child
1663 * so we do not try to shadow here to avoid parent updates
1664 * (which would be difficult since multiple deleted elements
1665 * might represent different flush synchronization points).
1668 scan_beg = 0; /* avoid compiler warning */
1669 scan_end = 0; /* avoid compiler warning */
/* linear scan of all slots; in-memory chains shadow media brefs */
1671 for (i = 0; i < count; ++i) {
1672 tmp = hammer2_chain_find(parent, i);
1674 if (tmp->flags & HAMMER2_CHAIN_DELETED) {
1675 hammer2_chain_drop(tmp);
1679 KKASSERT(bref->type != 0);
1680 } else if (base == NULL || base[i].type == 0) {
1685 scan_beg = bref->key;
1686 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1688 hammer2_chain_drop(tmp);
/* overlap test: [key_beg,key_end] intersects [scan_beg,scan_end] */
1689 if (key_beg <= scan_end && key_end >= scan_beg)
1693 if (key_beg == key_end)
1695 return (hammer2_chain_next(parentp, NULL,
1696 key_beg, key_end, flags));
1700 * Acquire the new chain element. If the chain element is an
1701 * indirect block we must search recursively.
1703 * It is possible for the tmp chain above to be removed from
1704 * the RBTREE but the parent lock ensures it would not have been
1705 * destroyed from the media, so the chain_get() code will simply
1706 * reload it from the media in that case.
1708 chain = hammer2_chain_get(parent, i, flags);
1713 * If the chain element is an indirect block it becomes the new
1714 * parent and we loop on it.
1716 * The parent always has to be locked with at least RESOLVE_MAYBE
1717 * so we can access its data. It might need a fixup if the caller
1718 * passed incompatible flags. Be careful not to cause a deadlock
1719 * as a data-load requires an exclusive lock.
1721 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
1722 * range is within the requested key range we return the indirect
1723 * block and do NOT loop. This is usually only used to acquire
1726 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
1727 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1728 hammer2_chain_unlock(parent);
1729 *parentp = parent = chain;
1730 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1731 hammer2_chain_lock(chain,
1733 HAMMER2_RESOLVE_NOREF);
1734 } else if ((flags & HAMMER2_LOOKUP_NODATA) &&
1735 chain->data == NULL) {
/* relock to force data resolution for the recursion */
1736 hammer2_chain_ref(chain);
1737 hammer2_chain_unlock(chain);
1738 hammer2_chain_lock(chain,
1740 HAMMER2_RESOLVE_NOREF);
1746 * All done, return the chain
1746 * All done, return the chain
1752 * After having issued a lookup we can iterate all matching keys.
1754 * If chain is non-NULL we continue the iteration from just after its index.
1756 * If chain is NULL we assume the parent was exhausted and continue the
1757 * iteration at the next parent.
1759 * parent must be locked on entry and remains locked throughout. chain's
1760 * lock status must match flags. Chain is always at least referenced.
1762 * WARNING! The MATCHIND flag does not apply to this function.
/*
 * Continue an iteration begun by hammer2_chain_lookup(): advance past
 * the passed-in chain (or past an exhausted parent) and return the
 * next chain overlapping [key_beg, key_end], or NULL at end of range.
 */
1765 hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
1766 hammer2_key_t key_beg, hammer2_key_t key_end,
1769 hammer2_mount_t *hmp;
1770 hammer2_chain_t *parent;
1771 hammer2_chain_t *tmp;
1772 hammer2_blockref_t *base;
1773 hammer2_blockref_t *bref;
1774 hammer2_key_t scan_beg;
1775 hammer2_key_t scan_end;
1777 int how_maybe = HAMMER2_RESOLVE_MAYBE;
1780 if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
1781 how_maybe |= HAMMER2_RESOLVE_SHARED;
1788 * Calculate the next index and recalculate the parent if necessary.
1792 * Continue iteration within current parent. If not NULL
1793 * the passed-in chain may or may not be locked, based on
1794 * the LOOKUP_NOLOCK flag (passed in as returned from lookup
1797 i = chain->index + 1;
1798 if (flags & HAMMER2_LOOKUP_NOLOCK)
1799 hammer2_chain_drop(chain);
1801 hammer2_chain_unlock(chain);
1804 * Any scan where the lookup returned degenerate data embedded
1805 * in the inode has an invalid index and must terminate.
1807 if (chain == parent)
1810 } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
1811 parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1813 * We reached the end of the iteration.
1818 * Continue iteration with next parent unless the current
1819 * parent covers the range.
1821 scan_beg = parent->bref.key;
1822 scan_end = scan_beg +
1823 ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1824 if (key_beg >= scan_beg && key_end <= scan_end)
1827 i = parent->index + 1;
1828 parent = hammer2_chain_getparent(parentp, how_maybe);
1833 * Locate the blockref array. Currently we do a fully associative
1834 * search through the array.
1836 switch(parent->bref.type) {
1837 case HAMMER2_BREF_TYPE_INODE:
1838 base = &parent->data->ipdata.u.blockset.blockref[0];
1839 count = HAMMER2_SET_COUNT;
1841 case HAMMER2_BREF_TYPE_INDIRECT:
1842 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1843 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1846 KKASSERT(parent->data != NULL);
1847 base = &parent->data->npdata.blockref[0];
1849 count = parent->bytes / sizeof(hammer2_blockref_t);
1851 case HAMMER2_BREF_TYPE_VOLUME:
1852 base = &hmp->voldata.sroot_blockset.blockref[0];
1853 count = HAMMER2_SET_COUNT;
1855 case HAMMER2_BREF_TYPE_FREEMAP:
1856 base = &hmp->voldata.freemap_blockset.blockref[0];
1857 count = HAMMER2_SET_COUNT;
1860 panic("hammer2_chain_next: unrecognized blockref type: %d",
1862 base = NULL; /* safety */
1863 count = 0; /* safety */
1866 KKASSERT(i <= count);
1869 * Look for the key. If we are unable to find a match and an exact
1870 * match was requested we return NULL. If a range was requested we
1871 * run hammer2_chain_next() to iterate.
1873 * NOTE! Deleted elements are effectively invisible. Deletions
1874 * proactively clear the parent bref to the deleted child
1875 * so we do not try to shadow here to avoid parent updates
1876 * (which would be difficult since multiple deleted elements
1877 * might represent different flush synchronization points).
1880 scan_beg = 0; /* avoid compiler warning */
1881 scan_end = 0; /* avoid compiler warning */
1884 tmp = hammer2_chain_find(parent, i);
1886 if (tmp->flags & HAMMER2_CHAIN_DELETED) {
1887 hammer2_chain_drop(tmp);
1892 } else if (base == NULL || base[i].type == 0) {
1898 scan_beg = bref->key;
1899 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1901 hammer2_chain_drop(tmp);
1902 if (key_beg <= scan_end && key_end >= scan_beg)
1908 * If we couldn't find a match recurse up a parent to continue the
1915 * Acquire the new chain element. If the chain element is an
1916 * indirect block we must search recursively.
1918 chain = hammer2_chain_get(parent, i, flags);
1923 * If the chain element is an indirect block it becomes the new
1924 * parent and we loop on it.
1926 * The parent always has to be locked with at least RESOLVE_MAYBE
1927 * so we can access its data. It might need a fixup if the caller
1928 * passed incompatible flags. Be careful not to cause a deadlock
1929 * as a data-load requires an exclusive lock.
1931 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
1932 * range is within the requested key range we return the indirect
1933 * block and do NOT loop. This is usually only used to acquire
1936 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
1937 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
/*
 * NOTE(review): the WARNING above this function says MATCHIND does
 * not apply here, yet the condition below tests it -- one of the two
 * is stale; confirm which against the upstream history.
 */
1938 if ((flags & HAMMER2_LOOKUP_MATCHIND) == 0 ||
1939 key_beg > scan_beg || key_end < scan_end) {
1940 hammer2_chain_unlock(parent);
1941 *parentp = parent = chain;
1943 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1944 hammer2_chain_lock(parent,
1946 HAMMER2_RESOLVE_NOREF);
1947 } else if ((flags & HAMMER2_LOOKUP_NODATA) &&
1948 parent->data == NULL) {
1949 hammer2_chain_ref(parent);
1950 hammer2_chain_unlock(parent);
1951 hammer2_chain_lock(parent,
1953 HAMMER2_RESOLVE_NOREF);
1961 * All done, return chain
1961 * All done, return chain
1967 * Create and return a new hammer2 system memory structure of the specified
1968 * key, type and size and insert it under (*parentp). This is a full
1969 * insertion, based on the supplied key/keybits, and may involve creating
1970 * indirect blocks and moving other chains around via delete/duplicate.
1972 * (*parentp) must be exclusive locked and may be replaced on return
1973 * depending on how much work the function had to do.
1975 * (*chainp) usually starts out NULL and returns the newly created chain,
1976 * but if the caller desires the caller may allocate a disconnected chain
1977 * and pass it in instead. (It is also possible for the caller to use
1978 * chain_duplicate() to create a disconnected chain, manipulate it, then
1979 * pass it into this function to insert it).
1981 * This function should NOT be used to insert INDIRECT blocks. It is
1982 * typically used to create/insert inodes and data blocks.
1984 * Caller must pass-in an exclusively locked parent the new chain is to
1985 * be inserted under, and optionally pass-in a disconnected, exclusively
1986 * locked chain to insert (else we create a new chain). The function will
1987 * adjust (*parentp) as necessary, create or connect the chain, and
1988 * return an exclusively locked chain in *chainp.
/*
 * Create (or connect a pre-allocated) chain under (*parentp) at the
 * given key/keybits.  See the preceding comment block for the full
 * contract.  FIX(review): the two panic() calls below used adjacent
 * string literals "…used to" "create…" which concatenate without a
 * space ("tocreate"); a trailing space has been added to the first
 * literal of each.
 */
1991 hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
1992 hammer2_chain_t **chainp,
1993 hammer2_key_t key, int keybits, int type, size_t bytes)
1995 hammer2_mount_t *hmp;
1996 hammer2_chain_t *chain;
1997 hammer2_chain_t *child;
1998 hammer2_chain_t *parent = *parentp;
1999 hammer2_chain_core_t *above;
2000 hammer2_blockref_t dummy;
2001 hammer2_blockref_t *base;
2007 above = parent->core;
2008 KKASSERT(ccms_thread_lock_owned(&above->cst));
2012 if (chain == NULL) {
2014 * First allocate media space and construct the dummy bref,
2015 * then allocate the in-memory chain structure. Set the
2016 * INITIAL flag for fresh chains.
2018 bzero(&dummy, sizeof(dummy));
2021 dummy.keybits = keybits;
2022 dummy.data_off = hammer2_getradix(bytes);
2023 dummy.methods = parent->bref.methods;
2024 chain = hammer2_chain_alloc(hmp, trans, &dummy);
2025 hammer2_chain_core_alloc(chain, NULL);
2027 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
2030 * Lock the chain manually, chain_lock will load the chain
2031 * which we do NOT want to do. (note: chain->refs is set
2032 * to 1 by chain_alloc() for us, but lockcnt is not).
2035 ccms_thread_lock(&chain->core->cst, CCMS_STATE_EXCLUSIVE);
2039 * We do NOT set INITIAL here (yet). INITIAL is only
2040 * used for indirect blocks.
2042 * Recalculate bytes to reflect the actual media block
2045 bytes = (hammer2_off_t)1 <<
2046 (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
2047 chain->bytes = bytes;
2050 case HAMMER2_BREF_TYPE_VOLUME:
2051 case HAMMER2_BREF_TYPE_FREEMAP:
2052 panic("hammer2_chain_create: called with volume type");
2054 case HAMMER2_BREF_TYPE_INODE:
2055 KKASSERT(bytes == HAMMER2_INODE_BYTES);
2056 chain->data = kmalloc(sizeof(chain->data->ipdata),
2057 hmp->minode, M_WAITOK | M_ZERO);
2059 case HAMMER2_BREF_TYPE_INDIRECT:
2060 panic("hammer2_chain_create: cannot be used to "
2061 "create indirect block");
2063 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2064 panic("hammer2_chain_create: cannot be used to "
2065 "create freemap root or node");
2067 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2068 case HAMMER2_BREF_TYPE_DATA:
2070 /* leave chain->data NULL */
2071 KKASSERT(chain->data == NULL);
2076 * Potentially update the existing chain's key/keybits.
2078 * Do NOT mess with the current state of the INITIAL flag.
2080 chain->bref.key = key;
2081 chain->bref.keybits = keybits;
2082 KKASSERT(chain->above == NULL);
2086 above = parent->core;
2089 * Locate a free blockref in the parent's array
2091 switch(parent->bref.type) {
2092 case HAMMER2_BREF_TYPE_INODE:
2093 KKASSERT((parent->data->ipdata.op_flags &
2094 HAMMER2_OPFLAG_DIRECTDATA) == 0);
2095 KKASSERT(parent->data != NULL);
2096 base = &parent->data->ipdata.u.blockset.blockref[0];
2097 count = HAMMER2_SET_COUNT;
2099 case HAMMER2_BREF_TYPE_INDIRECT:
2100 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2101 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2104 KKASSERT(parent->data != NULL);
2105 base = &parent->data->npdata.blockref[0];
2107 count = parent->bytes / sizeof(hammer2_blockref_t);
2109 case HAMMER2_BREF_TYPE_VOLUME:
2110 KKASSERT(parent->data != NULL);
2111 base = &hmp->voldata.sroot_blockset.blockref[0];
2112 count = HAMMER2_SET_COUNT;
2114 case HAMMER2_BREF_TYPE_FREEMAP:
2115 KKASSERT(parent->data != NULL);
2116 base = &hmp->voldata.freemap_blockset.blockref[0];
2117 count = HAMMER2_SET_COUNT;
2120 panic("hammer2_chain_create: unrecognized blockref type: %d",
2127 * Scan for an unallocated bref, also skipping any slots occupied
2128 * by in-memory chain elements that may not yet have been updated
2129 * in the parent's bref array.
2131 * We don't have to hold the spinlock to save an empty slot as
2132 * new slots can only transition from empty if the parent is
2133 * locked exclusively.
2135 spin_lock(&above->cst.spin);
2136 for (i = 0; i < count; ++i) {
2137 child = hammer2_chain_find_locked(parent, i);
2139 if (child->flags & HAMMER2_CHAIN_DELETED)
2145 if (base[i].type == 0)
2148 spin_unlock(&above->cst.spin);
2151 * If no free blockref could be found we must create an indirect
2152 * block and move a number of blockrefs into it. With the parent
2153 * locked we can safely lock each child in order to move it without
2154 * causing a deadlock.
2156 * This may return the new indirect block or the old parent depending
2157 * on where the key falls. NULL is returned on error.
2160 hammer2_chain_t *nparent;
2162 nparent = hammer2_chain_create_indirect(trans, parent,
2165 if (nparent == NULL) {
2167 hammer2_chain_drop(chain);
2171 if (parent != nparent) {
2172 hammer2_chain_unlock(parent);
2173 parent = *parentp = nparent;
2179 * Link the chain into its parent. Later on we will have to set
2180 * the MOVED bit in situations where we don't mark the new chain
2181 * as being modified.
2183 if (chain->above != NULL)
2184 panic("hammer2: hammer2_chain_create: chain already connected");
2185 KKASSERT(chain->above == NULL);
2186 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
2188 chain->above = above;
2190 spin_lock(&above->cst.spin);
2191 if (RB_INSERT(hammer2_chain_tree, &above->rbtree, chain))
2192 panic("hammer2_chain_create: collision");
2193 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
2194 spin_unlock(&above->cst.spin);
2198 * Mark the newly created chain modified.
2200 * Device buffers are not instantiated for DATA elements
2201 * as these are handled by logical buffers.
2203 * Indirect and freemap node indirect blocks are handled
2204 * by hammer2_chain_create_indirect() and not by this
2207 * Data for all other bref types is expected to be
2208 * instantiated (INODE, LEAF).
2210 switch(chain->bref.type) {
2211 case HAMMER2_BREF_TYPE_DATA:
2212 hammer2_chain_modify(trans, &chain,
2213 HAMMER2_MODIFY_OPTDATA |
2214 HAMMER2_MODIFY_ASSERTNOCOPY);
2216 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2217 case HAMMER2_BREF_TYPE_INODE:
2218 hammer2_chain_modify(trans, &chain,
2219 HAMMER2_MODIFY_ASSERTNOCOPY);
2223 * Remaining types are not supported by this function.
2224 * In particular, INDIRECT and LEAF_NODE types are
2225 * handled by create_indirect().
2227 panic("hammer2_chain_create: bad type: %d",
2234 * When reconnecting a chain we must set MOVED and setsubmod
2235 * so the flush recognizes that it must update the bref in
2238 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2239 hammer2_chain_ref(chain);
2240 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2242 hammer2_chain_setsubmod(trans, chain);
2252 * Replace (*chainp) with a duplicate. The original *chainp is unlocked
2253 * and the replacement will be returned locked. Both the original and the
2254 * new chain will share the same RBTREE (have the same chain->core), with
2255 * the new chain becoming the 'current' chain (meaning it is the first in
2256 * the linked list at core->chain_first).
2258 * If (parent, i) then the new duplicated chain is inserted under the parent
2259 * at the specified index (the parent must not have a ref at that index).
2261 * If (NULL, -1) then the new duplicated chain is not inserted anywhere,
2262 * similar to if it had just been chain_alloc()'d (suitable for passing into
2263 * hammer2_chain_create() after this function returns).
2265 * NOTE! Duplication is used in order to retain the original topology to
2266 * support flush synchronization points. Both the original and the
2267 * new chain will have the same transaction id and thus the operation
2268 * appears atomic w/regards to media flushes.
/*
 * Forward declaration: inode-specific fixup applied when duplicating a
 * chain (invoked by hammer2_chain_duplicate() below).
 */
2270 static void hammer2_chain_dup_inodefixup(hammer2_chain_t *ochain,
2271 hammer2_chain_t *nchain);
2274 hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t *parent, int i,
2275 hammer2_chain_t **chainp, hammer2_blockref_t *bref)
2277 hammer2_mount_t *hmp = trans->hmp;
2278 hammer2_blockref_t *base;
2279 hammer2_chain_t *ochain;
2280 hammer2_chain_t *nchain;
2281 hammer2_chain_t *scan;
2282 hammer2_chain_core_t *above;
2289 * First create a duplicate of the chain structure, associating
2290 * it with the same core, making it the same size, pointing it
2291 * to the same bref (the same media block).
2295 bref = &ochain->bref;
2296 nchain = hammer2_chain_alloc(hmp, trans, bref);
2297 hammer2_chain_core_alloc(nchain, ochain->core);
2298 bytes = (hammer2_off_t)1 <<
2299 (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
2300 nchain->bytes = bytes;
2301 nchain->modify_tid = ochain->modify_tid;
2303 hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER);
2304 hammer2_chain_dup_inodefixup(ochain, nchain);
2307 * If parent is not NULL, insert into the parent at the requested
2308 * index. The newly duplicated chain must be marked MOVED and
2309 * SUBMODIFIED set in its parent(s).
2311 * Having both chains locked is extremely important for atomicity.
2315 * Locate a free blockref in the parent's array
2317 above = parent->core;
2318 KKASSERT(ccms_thread_lock_owned(&above->cst));
2320 switch(parent->bref.type) {
2321 case HAMMER2_BREF_TYPE_INODE:
2322 KKASSERT((parent->data->ipdata.op_flags &
2323 HAMMER2_OPFLAG_DIRECTDATA) == 0);
2324 KKASSERT(parent->data != NULL);
2325 base = &parent->data->ipdata.u.blockset.blockref[0];
2326 count = HAMMER2_SET_COUNT;
2328 case HAMMER2_BREF_TYPE_INDIRECT:
2329 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2330 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2333 KKASSERT(parent->data != NULL);
2334 base = &parent->data->npdata.blockref[0];
2336 count = parent->bytes / sizeof(hammer2_blockref_t);
2338 case HAMMER2_BREF_TYPE_VOLUME:
2339 KKASSERT(parent->data != NULL);
2340 base = &hmp->voldata.sroot_blockset.blockref[0];
2341 count = HAMMER2_SET_COUNT;
2343 case HAMMER2_BREF_TYPE_FREEMAP:
2344 KKASSERT(parent->data != NULL);
2345 base = &hmp->voldata.freemap_blockset.blockref[0];
2346 count = HAMMER2_SET_COUNT;
2349 panic("hammer2_chain_create: unrecognized "
2350 "blockref type: %d",
2355 KKASSERT(i >= 0 && i < count);
2357 KKASSERT((nchain->flags & HAMMER2_CHAIN_DELETED) == 0);
2358 KKASSERT(parent->refs > 0);
2360 spin_lock(&above->cst.spin);
2361 nchain->above = above;
2363 scan = hammer2_chain_find_locked(parent, i);
2364 KKASSERT(base == NULL || base[i].type == 0 ||
2366 (scan->flags & HAMMER2_CHAIN_DELETED));
2367 if (RB_INSERT(hammer2_chain_tree, &above->rbtree,
2369 panic("hammer2_chain_duplicate: collision");
2371 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_ONRBTREE);
2372 spin_unlock(&above->cst.spin);
2374 if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2375 hammer2_chain_ref(nchain);
2376 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED);
2378 hammer2_chain_setsubmod(trans, nchain);
2382 * We have to unlock ochain to flush any dirty data, asserting the
2383 * case (data == NULL) to catch any extra locks that might have been
2384 * present, then transfer state to nchain.
2386 oflags = ochain->flags;
2387 odata = ochain->data;
2388 hammer2_chain_unlock(ochain);
2389 KKASSERT(ochain->bref.type == HAMMER2_BREF_TYPE_INODE ||
2390 ochain->data == NULL);
2392 if (oflags & HAMMER2_CHAIN_INITIAL)
2393 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL);
2396 * WARNING! We should never resolve DATA to device buffers
2397 * (XXX allow it if the caller did?), and since
2398 * we currently do not have the logical buffer cache
2399 * buffer in-hand to fix its cached physical offset
2400 * we also force the modify code to not COW it. XXX
2402 if (oflags & HAMMER2_CHAIN_MODIFIED) {
2403 if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
2404 hammer2_chain_modify(trans, &nchain,
2405 HAMMER2_MODIFY_OPTDATA |
2406 HAMMER2_MODIFY_NOREALLOC |
2407 HAMMER2_MODIFY_ASSERTNOCOPY);
2408 } else if (oflags & HAMMER2_CHAIN_INITIAL) {
2409 hammer2_chain_modify(trans, &nchain,
2410 HAMMER2_MODIFY_OPTDATA |
2411 HAMMER2_MODIFY_ASSERTNOCOPY);
2413 hammer2_chain_modify(trans, &nchain,
2414 HAMMER2_MODIFY_ASSERTNOCOPY);
2416 hammer2_chain_drop(nchain);
2418 if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
2419 hammer2_chain_drop(nchain);
2420 } else if (oflags & HAMMER2_CHAIN_INITIAL) {
2421 hammer2_chain_drop(nchain);
2423 hammer2_chain_lock(nchain, HAMMER2_RESOLVE_ALWAYS |
2424 HAMMER2_RESOLVE_NOREF);
2425 hammer2_chain_unlock(nchain);
2428 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2434 * When the chain is in the INITIAL state we must still
2435 * ensure that a block has been assigned so MOVED processing
2436 * works as expected.
2438 KKASSERT (nchain->bref.type != HAMMER2_BREF_TYPE_DATA);
2439 hammer2_chain_modify(trans, &nchain,
2440 HAMMER2_MODIFY_OPTDATA |
2441 HAMMER2_MODIFY_ASSERTNOCOPY);
2444 hammer2_chain_lock(nchain, HAMMER2_RESOLVE_MAYBE |
2445 HAMMER2_RESOLVE_NOREF); /* eat excess ref */
2446 hammer2_chain_unlock(nchain);
2450 * Special in-place delete-duplicate sequence which does not require a
2451 * locked parent. (*chainp) is marked DELETED and atomically replaced
2452 * with a duplicate. Atomicity is at the very-fine spin-lock level in
2453 * order to ensure that lookups do not race us.
2456 hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
2459 hammer2_mount_t *hmp = trans->hmp;
2460 hammer2_chain_t *ochain;
2461 hammer2_chain_t *nchain;
2462 hammer2_chain_core_t *above;
/*
 * Allocate the replacement chain.  DELDUP_RECORE gives nchain a fresh
 * core; otherwise nchain shares ochain's existing core.
 */
2468 * First create a duplicate of the chain structure
2471 nchain = hammer2_chain_alloc(hmp, trans, &ochain->bref); /* 1 ref */
2472 if (flags & HAMMER2_DELDUP_RECORE)
2473 hammer2_chain_core_alloc(nchain, NULL);
2475 hammer2_chain_core_alloc(nchain, ochain->core);
2476 above = ochain->above;
/* physical size is encoded as a radix in the low bits of data_off */
2478 bytes = (hammer2_off_t)1 <<
2479 (int)(ochain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
2480 nchain->bytes = bytes;
2481 nchain->modify_tid = ochain->modify_tid;
2484 * Lock nchain and insert into ochain's core hierarchy, marking
2485 * ochain DELETED at the same time. Having both chains locked
2486 * is extremely important for atomicity.
2488 hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER);
2489 hammer2_chain_dup_inodefixup(ochain, nchain);
2490 /* extra ref still present from original allocation */
2492 nchain->index = ochain->index;
/*
 * Single spin-locked section: mark ochain DELETED (with MOVED + a ref
 * for the flusher) and insert nchain into the core's rbtree, so that
 * lookups switch atomically from the old chain to the new one.
 */
2494 spin_lock(&above->cst.spin);
2495 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_ONRBTREE);
2496 ochain->delete_tid = trans->sync_tid;
2497 nchain->above = above;
2498 atomic_set_int(&ochain->flags, HAMMER2_CHAIN_DELETED);
2499 if ((ochain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2500 hammer2_chain_ref(ochain);
2501 atomic_set_int(&ochain->flags, HAMMER2_CHAIN_MOVED);
2503 if (RB_INSERT(hammer2_chain_tree, &above->rbtree, nchain)) {
2504 panic("hammer2_chain_delete_duplicate: collision");
2506 spin_unlock(&above->cst.spin);
2509 * We have to unlock ochain to flush any dirty data, asserting the
2510 * case (data == NULL) to catch any extra locks that might have been
2511 * present, then transfer state to nchain.
2513 oflags = ochain->flags;
2514 odata = ochain->data;
2515 hammer2_chain_unlock(ochain); /* replacing ochain */
2516 KKASSERT(ochain->bref.type == HAMMER2_BREF_TYPE_INODE ||
2517 ochain->data == NULL);
2519 if (oflags & HAMMER2_CHAIN_INITIAL)
2520 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL);
2523 * WARNING! We should never resolve DATA to device buffers
2524 * (XXX allow it if the caller did?), and since
2525 * we currently do not have the logical buffer cache
2526 * buffer in-hand to fix its cached physical offset
2527 * we also force the modify code to not COW it. XXX
/*
 * Each path below consumes the extra ref left from the allocation,
 * either via hammer2_chain_drop() or via RESOLVE_NOREF — presumably
 * exactly once per path; TODO confirm against full branch structure.
 */
2529 if (oflags & HAMMER2_CHAIN_MODIFIED) {
2530 if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
2531 hammer2_chain_modify(trans, &nchain,
2532 HAMMER2_MODIFY_OPTDATA |
2533 HAMMER2_MODIFY_NOREALLOC |
2534 HAMMER2_MODIFY_ASSERTNOCOPY);
2535 } else if (oflags & HAMMER2_CHAIN_INITIAL) {
2536 hammer2_chain_modify(trans, &nchain,
2537 HAMMER2_MODIFY_OPTDATA |
2538 HAMMER2_MODIFY_ASSERTNOCOPY);
2540 hammer2_chain_modify(trans, &nchain,
2541 HAMMER2_MODIFY_ASSERTNOCOPY);
2543 hammer2_chain_drop(nchain);
2545 if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
2546 hammer2_chain_drop(nchain);
2547 } else if (oflags & HAMMER2_CHAIN_INITIAL) {
2548 hammer2_chain_drop(nchain);
2550 hammer2_chain_lock(nchain, HAMMER2_RESOLVE_ALWAYS |
2551 HAMMER2_RESOLVE_NOREF);
2552 hammer2_chain_unlock(nchain);
2557 * Unconditionally set the MOVED and SUBMODIFIED bit to force
2558 * update of parent bref and indirect blockrefs during flush.
2560 if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2561 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED);
2562 hammer2_chain_ref(nchain);
2564 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2565 hammer2_chain_setsubmod(trans, nchain);
2570 * Helper function to fixup inodes. The caller procedure stack may hold
2571 * multiple locks on ochain if it represents an inode, preventing our
2572 * unlock from retiring its state to the buffer cache.
2574 * In this situation any attempt to access the buffer cache could result
2575 * either in stale data or a deadlock. Work around the problem by copying
2576 * the embedded data directly.
2580 hammer2_chain_dup_inodefixup(hammer2_chain_t *ochain, hammer2_chain_t *nchain)
/* only inode chains with resolved (non-NULL) data need the fixup */
2582 if (ochain->bref.type != HAMMER2_BREF_TYPE_INODE)
2584 if (ochain->data == NULL)
2586 KKASSERT(nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
2587 KKASSERT(nchain->data == NULL);
/*
 * Copy the embedded inode data into a private kmalloc'd buffer rather
 * than resolving nchain through the buffer cache (see deadlock note
 * above).  M_ZERO keeps any tail padding deterministic.
 */
2588 nchain->data = kmalloc(sizeof(nchain->data->ipdata),
2589 ochain->hmp->minode, M_WAITOK | M_ZERO);
2590 nchain->data->ipdata = ochain->data->ipdata;
2594 * Create a snapshot of the specified {parent, chain} with the specified
2597 * (a) We create a duplicate connected to the super-root as the specified
2600 * (b) We issue a restricted flush using the current transaction on the
2603 * (c) We disconnect and reallocate the duplicate's core.
2606 hammer2_chain_snapshot(hammer2_trans_t *trans, hammer2_inode_t *ip,
2607 hammer2_ioc_pfs_t *pfs)
2609 hammer2_mount_t *hmp = trans->hmp;
2610 hammer2_chain_t *chain;
2611 hammer2_chain_t *nchain;
2612 hammer2_chain_t *parent;
2613 hammer2_inode_data_t *ipdata;
2614 size_t name_len = strlen(pfs->name);
2615 hammer2_key_t lhc = hammer2_dirhash(pfs->name, name_len);
2619 * Create disconnected duplicate
2621 KKASSERT((trans->flags & HAMMER2_TRANS_RESTRICTED) == 0);
2623 hammer2_chain_lock(nchain, HAMMER2_RESOLVE_MAYBE);
2624 hammer2_chain_duplicate(trans, NULL, -1, &nchain, NULL);
2625 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_RECYCLE |
2626 HAMMER2_CHAIN_SNAPSHOT);
2629 * Create named entry in the super-root.
2631 parent = hammer2_chain_lookup_init(hmp->schain, 0);
/*
 * Probe for a free directory-hash key under the super-root.  On a
 * collision the low hash bits are presumably iterated until the
 * LOMASK range is exhausted — TODO confirm (iteration lines elided).
 */
2633 while (error == 0) {
2634 chain = hammer2_chain_lookup(&parent, lhc, lhc, 0);
2637 if ((lhc & HAMMER2_DIRHASH_LOMASK) == HAMMER2_DIRHASH_LOMASK)
2639 hammer2_chain_unlock(chain);
2643 hammer2_chain_create(trans, &parent, &nchain, lhc, 0,
2644 HAMMER2_BREF_TYPE_INODE,
2645 HAMMER2_INODE_BYTES);
2646 hammer2_chain_modify(trans, &nchain, HAMMER2_MODIFY_ASSERTNOCOPY);
2647 hammer2_chain_lookup_done(parent);
2648 parent = NULL; /* safety */
/* fill in the snapshot inode's name and hash key */
2653 ipdata = &nchain->data->ipdata;
2654 ipdata->name_key = lhc;
2655 ipdata->name_len = name_len;
2656 ksnprintf(ipdata->filename, sizeof(ipdata->filename), "%s", pfs->name);
2659 * Set PFS type, generate a unique filesystem id, and generate
2660 * a cluster id. Use the same clid when snapshotting a PFS root,
2661 * which theoretically allows the snapshot to be used as part of
2662 * the same cluster (perhaps as a cache).
2664 ipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
2665 kern_uuidgen(&ipdata->pfs_fsid, 1);
2666 if (ip->chain == ip->pmp->rchain)
2667 ipdata->pfs_clid = ip->chain->data->ipdata.pfs_clid;
2669 kern_uuidgen(&ipdata->pfs_clid, 1);
2672 * Issue a restricted flush of the snapshot. This is a synchronous
/*
 * NOTE(review): the SNAPSHOTA/SNAPSHOTB kprintfs and the 4-second
 * tsleep look like debugging scaffolding — confirm before removal.
 */
2675 trans->flags |= HAMMER2_TRANS_RESTRICTED;
2676 kprintf("SNAPSHOTA\n");
2677 tsleep(trans, 0, "snapslp", hz*4);
2678 kprintf("SNAPSHOTB\n");
2679 hammer2_chain_flush(trans, nchain);
2680 trans->flags &= ~HAMMER2_TRANS_RESTRICTED;
2684 * Remove the link b/c nchain is a snapshot and snapshots don't
2685 * follow CHAIN_DELETED semantics ?
2690 KKASSERT(chain->duplink == nchain);
2691 KKASSERT(chain->core == nchain->core);
2692 KKASSERT(nchain->refs >= 2);
2693 chain->duplink = nchain->duplink;
2694 atomic_clear_int(&nchain->flags, HAMMER2_CHAIN_DUPTARGET);
2695 hammer2_chain_drop(nchain);
2698 kprintf("snapshot %s nchain->refs %d nchain->flags %08x\n",
2699 pfs->name, nchain->refs, nchain->flags);
2700 hammer2_chain_unlock(nchain);
2706 * Create an indirect block that covers one or more of the elements in the
2707 * current parent. Either returns the existing parent with no locking or
2708 * ref changes or returns the new indirect block locked and referenced
2709 * and leaving the original parent lock/ref intact as well.
2711 * If an error occurs, NULL is returned and *errorp is set to the error.
2713 * The returned chain depends on where the specified key falls.
2715 * The key/keybits for the indirect mode only needs to follow three rules:
2717 * (1) That all elements underneath it fit within its key space and
2719 * (2) That all elements outside it are outside its key space.
2721 * (3) When creating the new indirect block any elements in the current
2722 * parent that fit within the new indirect block's keyspace must be
2723 * moved into the new indirect block.
2725 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
2726 * keyspace than the current parent, but lookup/iteration rules will
2727 * ensure (and must ensure) that rule (2) for all parents leading up
2728 * to the nearest inode or the root volume header is adhered to. This
2729 * is accomplished by always recursing through matching keyspaces in
2730 * the hammer2_chain_lookup() and hammer2_chain_next() API.
2732 * The current implementation calculates the current worst-case keyspace by
2733 * iterating the current parent and then divides it into two halves, choosing
2734 * whichever half has the most elements (not necessarily the half containing
2735 * the requested key).
2737 * We can also opt to use the half with the least number of elements. This
2738 * causes lower-numbered keys (aka logical file offsets) to recurse through
2739 * fewer indirect blocks and higher-numbered keys to recurse through more.
2740 * This also has the risk of not moving enough elements to the new indirect
2741 * block and being forced to create several indirect blocks before the element
2744 * Must be called with an exclusively locked parent.
2746 static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
2747 hammer2_key_t *keyp, int keybits,
2748 hammer2_blockref_t *base, int count);
2749 static int hammer2_chain_indkey_normal(hammer2_chain_t *parent,
2750 hammer2_key_t *keyp, int keybits,
2751 hammer2_blockref_t *base, int count);
2754 hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
2755 hammer2_key_t create_key, int create_bits,
2756 int for_type, int *errorp)
2758 hammer2_mount_t *hmp = trans->hmp;
2759 hammer2_chain_core_t *above;
2760 hammer2_chain_core_t *icore;
2761 hammer2_blockref_t *base;
2762 hammer2_blockref_t *bref;
2763 hammer2_chain_t *chain;
2764 hammer2_chain_t *child;
2765 hammer2_chain_t *ichain;
2766 hammer2_chain_t dummy;
2767 hammer2_key_t key = create_key;
2768 int keybits = create_bits;
2774 * Calculate the base blockref pointer or NULL if the chain
2775 * is known to be empty. We need to calculate the array count
2776 * for RB lookups either way.
2778 KKASSERT(ccms_thread_lock_owned(&parent->core->cst));
2780 above = parent->core;
2782 /*hammer2_chain_modify(trans, &parent, HAMMER2_MODIFY_OPTDATA);*/
/*
 * An INITIAL parent has no media blockref array yet: only the count
 * is computed here and base stays NULL for the scan below.
 */
2783 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2786 switch(parent->bref.type) {
2787 case HAMMER2_BREF_TYPE_INODE:
2788 count = HAMMER2_SET_COUNT;
2790 case HAMMER2_BREF_TYPE_INDIRECT:
2791 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2792 count = parent->bytes / sizeof(hammer2_blockref_t);
2794 case HAMMER2_BREF_TYPE_VOLUME:
2795 count = HAMMER2_SET_COUNT;
2797 case HAMMER2_BREF_TYPE_FREEMAP:
2798 count = HAMMER2_SET_COUNT;
2801 panic("hammer2_chain_create_indirect: "
2802 "unrecognized blockref type: %d",
2808 switch(parent->bref.type) {
2809 case HAMMER2_BREF_TYPE_INODE:
2810 base = &parent->data->ipdata.u.blockset.blockref[0];
2811 count = HAMMER2_SET_COUNT;
2813 case HAMMER2_BREF_TYPE_INDIRECT:
2814 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2815 base = &parent->data->npdata.blockref[0];
2816 count = parent->bytes / sizeof(hammer2_blockref_t);
2818 case HAMMER2_BREF_TYPE_VOLUME:
2819 base = &hmp->voldata.sroot_blockset.blockref[0];
2820 count = HAMMER2_SET_COUNT;
2822 case HAMMER2_BREF_TYPE_FREEMAP:
2823 base = &hmp->voldata.freemap_blockset.blockref[0];
2824 count = HAMMER2_SET_COUNT;
2827 panic("hammer2_chain_create_indirect: "
2828 "unrecognized blockref type: %d",
2836 * dummy used in later chain allocation (no longer used for lookups).
2838 bzero(&dummy, sizeof(dummy));
2839 dummy.delete_tid = HAMMER2_MAX_TID;
2842 * When creating an indirect block for a freemap node or leaf
2843 * the key/keybits must be fitted to static radix levels because
2844 * particular radix levels use particular reserved blocks in the
2847 * This routine calculates the key/radix of the indirect block
2848 * we need to create, and whether it is on the high-side or the
2851 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
2852 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
2853 keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
2856 keybits = hammer2_chain_indkey_normal(parent, &key, keybits,
2861 * Normalize the key for the radix being represented, keeping the
2862 * high bits and throwing away the low bits.
2864 key &= ~(((hammer2_key_t)1 << keybits) - 1);
2867 * How big should our new indirect block be? It has to be at least
2868 * as large as its parent.
2870 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
2871 nbytes = HAMMER2_IND_BYTES_MIN;
2873 nbytes = HAMMER2_IND_BYTES_MAX;
2874 if (nbytes < count * sizeof(hammer2_blockref_t))
2875 nbytes = count * sizeof(hammer2_blockref_t);
2878 * Ok, create our new indirect block
2880 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
2881 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
2882 dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
2884 dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
2886 dummy.bref.key = key;
2887 dummy.bref.keybits = keybits;
2888 dummy.bref.data_off = hammer2_getradix(nbytes);
2889 dummy.bref.methods = parent->bref.methods;
2891 ichain = hammer2_chain_alloc(hmp, trans, &dummy.bref);
2892 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
2893 hammer2_chain_core_alloc(ichain, NULL);
2894 icore = ichain->core;
2895 hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
2896 hammer2_chain_drop(ichain); /* excess ref from alloc */
2899 * We have to mark it modified to allocate its block, but use
2900 * OPTDATA to allow it to remain in the INITIAL state. Otherwise
2901 * it won't be acted upon by the flush code.
2903 * XXX leave the node unmodified, depend on the SUBMODIFIED
2904 * flush to assign and modify parent blocks.
2906 hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA);
2909 * Iterate the original parent and move the matching brefs into
2910 * the new indirect block.
2912 * At the same time locate an empty slot (or what will become an
2913 * empty slot) and assign the new indirect block to that slot.
2915 * XXX handle flushes.
2917 spin_lock(&above->cst.spin);
2918 for (i = 0; i < count; ++i) {
2920 * For keying purposes access the bref from the media or
2921 * from our in-memory cache. In cases where the in-memory
2922 * cache overrides the media the keyrefs will be the same
2923 * anyway so we can avoid checking the cache when the media
2926 child = hammer2_chain_find_locked(parent, i);
2928 if (child->flags & HAMMER2_CHAIN_DELETED) {
2929 if (ichain->index < 0)
2933 bref = &child->bref;
2934 } else if (base && base[i].type) {
2937 if (ichain->index < 0)
2943 * Skip keys that are not within the key/radix of the new
2944 * indirect block. They stay in the parent.
2946 if ((~(((hammer2_key_t)1 << keybits) - 1) &
2947 (key ^ bref->key)) != 0) {
2952 * This element is being moved from the parent, its slot
2953 * is available for our new indirect block.
2955 if (ichain->index < 0)
2959 * Load the new indirect block by acquiring or allocating
2960 * the related chain entries, then move them to the new
2961 * parent (ichain) by deleting them from their old location
2962 * and inserting a duplicate of the chain and any modified
2963 * sub-chain in the new location.
2965 * We must set MOVED in the chain being duplicated and
2966 * SUBMODIFIED in the parent(s) so the flush code knows
2967 * what is going on. The latter is done after the loop.
2969 * WARNING! above->cst.spin must be held when parent is
2970 * modified, even though we own the full blown lock,
2971 * to deal with setsubmod and rename races.
2972 * (XXX remove this req).
/* drop the spinlock around the blocking get/delete/duplicate calls */
2974 spin_unlock(&above->cst.spin);
2975 chain = hammer2_chain_get(parent, i, HAMMER2_LOOKUP_NODATA);
2976 hammer2_chain_delete(trans, chain);
2977 hammer2_chain_duplicate(trans, ichain, i, &chain, NULL);
2978 hammer2_chain_unlock(chain);
2979 KKASSERT(parent->refs > 0);
2981 spin_lock(&above->cst.spin);
2983 spin_unlock(&above->cst.spin);
2986 * Insert the new indirect block into the parent now that we've
2987 * cleared out some entries in the parent. We calculated a good
2988 * insertion index in the loop above (ichain->index).
2990 * We don't have to set MOVED here because we mark ichain modified
2991 * down below (so the normal modified -> flush -> set-moved sequence
2994 * The insertion shouldn't race as this is a completely new block
2995 * and the parent is locked.
2997 if (ichain->index < 0)
2998 kprintf("indirect parent %p count %d key %016jx/%d\n",
2999 parent, count, (intmax_t)key, keybits);
3000 KKASSERT(ichain->index >= 0);
3001 KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
3002 spin_lock(&above->cst.spin);
3003 if (RB_INSERT(hammer2_chain_tree, &above->rbtree, ichain))
3004 panic("hammer2_chain_create_indirect: ichain insertion");
3005 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_ONRBTREE);
3006 ichain->above = above;
3007 spin_unlock(&above->cst.spin);
3010 * Mark the new indirect block modified after insertion, which
3011 * will propagate up through parent all the way to the root and
3012 * also allocate the physical block in ichain for our caller,
3013 * and assign ichain->data to a pre-zero'd space (because there
3014 * is not prior data to copy into it).
3016 * We have to set SUBMODIFIED in ichain's flags manually so the
3017 * flusher knows it has to recurse through it to get to all of
3018 * our moved blocks, then call setsubmod() to set the bit
3021 /*hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA);*/
3022 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
3023 hammer2_chain_setsubmod(trans, ichain);
3026 * Figure out what to return.
3028 if (~(((hammer2_key_t)1 << keybits) - 1) &
3029 (create_key ^ key)) {
3031 * Key being created is outside the key range,
3032 * return the original parent.
3034 hammer2_chain_unlock(ichain);
3037 * Otherwise it's in the range, return the new parent.
3038 * (leave both the new and old parent locked).
3047 * Calculate the keybits and highside/lowside of the freemap node the
3048 * caller is creating.
3050 * This routine will specify the next higher-level freemap key/radix
3051 * representing the lowest-ordered set. By doing so, eventually all
3052 * low-ordered sets will be moved one level down.
3054 * We have to be careful here because the freemap reserves a limited
3055 * number of blocks for a limited number of levels. So we can't just
3056 * push indiscriminately.
3059 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
3060 int keybits, hammer2_blockref_t *base, int count)
3062 hammer2_chain_core_t *above;
3063 hammer2_chain_t *child;
3064 hammer2_blockref_t *bref;
3071 above = parent->core;
3077 * Calculate the range of keys in the array being careful to skip
3078 * slots which are overridden with a deletion.
3080 spin_lock(&above->cst.spin);
3081 for (i = 0; i < count; ++i) {
3082 child = hammer2_chain_find_locked(parent, i);
3084 if (child->flags & HAMMER2_CHAIN_DELETED)
3086 bref = &child->bref;
3087 } else if (base && base[i].type) {
/*
 * Track the lowest-ordered set: the smallest radix wins; ties
 * presumably take the smaller key (assignment elided from this
 * view) — TODO confirm.
 */
3093 if (keybits > bref->keybits) {
3095 keybits = bref->keybits;
3096 } else if (keybits == bref->keybits && bref->key < key) {
3100 spin_unlock(&above->cst.spin);
3103 * Return the keybits for a higher-level FREEMAP_NODE covering
/* step up exactly one fixed freemap radix level; LEVEL4 cannot grow */
3107 case HAMMER2_FREEMAP_LEVEL0_RADIX:
3108 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
3110 case HAMMER2_FREEMAP_LEVEL1_RADIX:
3111 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
3113 case HAMMER2_FREEMAP_LEVEL2_RADIX:
3114 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
3116 case HAMMER2_FREEMAP_LEVEL3_RADIX:
3117 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
3119 case HAMMER2_FREEMAP_LEVEL4_RADIX:
3120 panic("hammer2_chain_indkey_freemap: level too high")
3123 panic("hammer2_chain_indkey_freemap: bad radix");
3132 * Calculate the keybits and highside/lowside of the indirect block the
3133 * caller is creating.
3136 hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp,
3137 int keybits, hammer2_blockref_t *base, int count)
3139 hammer2_chain_core_t *above;
3140 hammer2_chain_t *child;
3141 hammer2_blockref_t *bref;
3149 above = parent->core;
3154 * Calculate the range of keys in the array being careful to skip
3155 * slots which are overridden with a deletion. Once the scan
3156 * completes we will cut the key range in half and shift half the
3157 * range into the new indirect block.
3159 spin_lock(&above->cst.spin);
3160 for (i = 0; i < count; ++i) {
3161 child = hammer2_chain_find_locked(parent, i);
3163 if (child->flags & HAMMER2_CHAIN_DELETED)
3165 bref = &child->bref;
3166 } else if (base && base[i].type) {
3173 * Expand our calculated key range (key, keybits) to fit
3174 * the scanned key. nkeybits represents the full range
3175 * that we will later cut in half (two halves @ nkeybits - 1).
3178 if (nkeybits < bref->keybits) {
3179 if (bref->keybits > 64) {
3180 kprintf("bad bref index %d chain %p bref %p\n",
3184 nkeybits = bref->keybits;
/* widen nkeybits until (key, bref->key) share all bits above it */
3186 while (nkeybits < 64 &&
3187 (~(((hammer2_key_t)1 << nkeybits) - 1) &
3188 (key ^ bref->key)) != 0) {
3193 * If the new key range is larger we have to determine
3194 * which side of the new key range the existing keys fall
3195 * under by checking the high bit, then collapsing the
3196 * locount into the hicount or vice-versa.
3198 if (keybits != nkeybits) {
3199 if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
3210 * The newly scanned key will be in the lower half or the
3211 * higher half of the (new) key range.
3213 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
3218 spin_unlock(&above->cst.spin);
3219 bref = NULL; /* now invalid (safety) */
3222 * Adjust keybits to represent half of the full range calculated
3223 * above (radix 63 max)
3228 * Select whichever half contains the most elements. Theoretically
3229 * we can select either side as long as it contains at least one
3230 * element (in order to ensure that a free slot is present to hold
3231 * the indirect block).
3233 if (hammer2_indirect_optimize) {
3235 * Insert node for least number of keys, this will arrange
3236 * the first few blocks of a large file or the first few
3237 * inodes in a directory with fewer indirect blocks when
3240 if (hicount < locount && hicount != 0)
3241 key |= (hammer2_key_t)1 << keybits;
3243 key &= ~(hammer2_key_t)1 << keybits;
3246 * Insert node for most number of keys, best for heavily
3249 if (hicount > locount)
3250 key |= (hammer2_key_t)1 << keybits;
3252 key &= ~(hammer2_key_t)1 << keybits;
3260 * Sets CHAIN_DELETED and CHAIN_MOVED in the chain being deleted and
3261 * set chain->delete_tid.
3263 * This function does NOT generate a modification to the parent. It
3264 * would be nearly impossible to figure out which parent to modify anyway.
3265 * Such modifications are handled by the flush code and are properly merged
3266 * using the flush synchronization point.
3268 * The find/get code will properly overload the RBTREE check on top of
3269 * the bref check to detect deleted entries.
3271 * This function is NOT recursive. Any entity already pushed into the
3272 * chain (such as an inode) may still need visibility into its contents,
3273 * as well as the ability to read and modify the contents. For example,
3274 * for an unlinked file which is still open.
3276 * NOTE: This function does NOT set chain->modify_tid, allowing future
3277 * code to distinguish between live and deleted chains by testing
3280 * NOTE: Deletions normally do not occur in the middle of a duplication
3281 * chain but we use a trick for hardlink migration that refactors
3282 * the originating inode without deleting it, so we make no assumptions
3286 hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *chain)
/* caller must hold the full-blown core lock */
3288 KKASSERT(ccms_thread_lock_owned(&chain->core->cst));
3291 * Nothing to do if already marked.
3293 if (chain->flags & HAMMER2_CHAIN_DELETED)
3297 * We must set MOVED along with DELETED for the flush code to
3298 * recognize the operation and properly disconnect the chain
3301 * The setting of DELETED causes finds, lookups, and _next iterations
3302 * to no longer recognize the chain. RB_SCAN()s will still have
3303 * visibility (needed for flush serialization points).
3305 * We need the spinlock on the core whose RBTREE contains chain
3306 * to protect against races.
3308 spin_lock(&chain->above->cst.spin);
3309 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
/* MOVED carries a ref for the flusher; only add it once */
3310 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
3311 hammer2_chain_ref(chain);
3312 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
3314 chain->delete_tid = trans->sync_tid;
3315 spin_unlock(&chain->above->cst.spin);
3316 hammer2_chain_setsubmod(trans, chain);
/*
 * Brief (1-tick) sleep keyed on the chain address, used as a polling
 * backoff by callers waiting on chain state.  Any wakeup source is
 * external to this function — presumably a wakeup() on the chain
 * address elsewhere; TODO confirm.
 */
3320 hammer2_chain_wait(hammer2_chain_t *chain)
3322 tsleep(chain, 0, "chnflw", 1);