hammer2 - Start working on the freemap (note: freemap not yet operational)
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem handles direct and indirect block searches, recursions,
37  * creation, and deletion.  Chains of blockrefs are tracked and modifications
38  * are flagged for propagation... eventually all the way back to the volume
39  * header.  Any chain except the volume header can be flushed to disk at
40  * any time... none of it matters until the volume header is dealt with
41  * (which is not here, see hammer2_vfsops.c for the volume header disk
42  * sequencing).
43  *
44  * Serialized flushes are not handled here, see hammer2_flush.c.  This module
45  * can essentially work on the current version of data, which can be in memory
46  * as well as on-disk due to the above.  However, we are responsible for
47  * making a copy of the state when a modified chain is part of a flush
48  * and we attempt to modify it again before the flush gets to it.  In that
49  * situation we create an allocated copy of the state that the flush can
50  * deal with.  If a chain undergoing deletion is part of a flush it is
51  * marked DELETED and its bref index is kept intact for the flush, but the
52  * chain is thereafter ignored by this module's because it is no longer
53  * current.
54  */
55 #include <sys/cdefs.h>
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/types.h>
59 #include <sys/lock.h>
60 #include <sys/uuid.h>
61
62 #include "hammer2.h"
63
64 static int hammer2_indirect_optimize;   /* XXX SYSCTL */
65
66 static hammer2_chain_t *hammer2_chain_create_indirect(
67                         hammer2_mount_t *hmp, hammer2_chain_t *parent,
68                         hammer2_key_t key, int keybits,
69                         int *errorp);
70
71 /*
72  * We use a red-black tree to guarantee safe lookups under shared locks.
73  */
74 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
75
76 int
77 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
78 {
79         return(chain2->index - chain1->index);
80 }
81
82 /*
83  * Recursively mark the parent chain elements so flushes can find
84  * modified elements.  Stop when we hit a chain already flagged
85  * SUBMODIFIED, but ignore the SUBMODIFIED bit that might be set
86  * in chain itself.
87  *
88  * SUBMODIFIED is not set on the chain passed in.
89  *
90  * The chain->cst.spin lock can be held to stabilize the chain->parent
91  * pointer.  The first parent is stabilized by virtue of chain being
92  * fully locked.
93  */
94 void
95 hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
96 {
97         hammer2_chain_t *parent;
98
99         parent = chain->parent;
100         if (parent && (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
101                 spin_lock(&parent->cst.spin);
102                 for (;;) {
103                         atomic_set_int(&parent->flags,
104                                        HAMMER2_CHAIN_SUBMODIFIED);
105                         if ((chain = parent->parent) == NULL)
106                                 break;
107                         spin_lock(&chain->cst.spin);    /* upward interlock */
108                         spin_unlock(&parent->cst.spin);
109                         parent = chain;
110                 }
111                 spin_unlock(&parent->cst.spin);
112         }
113 }
114
115 /*
116  * Allocate a new disconnected chain element representing the specified
117  * bref.  The chain element is locked exclusively and refs is set to 1.
118  *
119  * This essentially allocates a system memory structure representing one
120  * of the media structure types, including inodes.
121  */
122 hammer2_chain_t *
123 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
124 {
125         hammer2_chain_t *chain;
126         hammer2_inode_t *ip;
127         hammer2_indblock_t *np;
128         hammer2_data_t *dp;
129         u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
130
131         /*
132          * Construct the appropriate system structure.
133          */
134         switch(bref->type) {
135         case HAMMER2_BREF_TYPE_INODE:
136                 ip = kmalloc(sizeof(*ip), hmp->minode, M_WAITOK | M_ZERO);
137                 chain = &ip->chain;
138                 chain->u.ip = ip;
139                 ip->hmp = hmp;
140                 break;
141         case HAMMER2_BREF_TYPE_INDIRECT:
142         case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
143         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
144                 np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
145                 chain = &np->chain;
146                 chain->u.np = np;
147                 break;
148         case HAMMER2_BREF_TYPE_DATA:
149         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
150                 dp = kmalloc(sizeof(*dp), hmp->mchain, M_WAITOK | M_ZERO);
151                 chain = &dp->chain;
152                 chain->u.dp = dp;
153                 break;
154         case HAMMER2_BREF_TYPE_VOLUME:
155                 chain = NULL;
156                 panic("hammer2_chain_alloc volume type illegal for op");
157         default:
158                 chain = NULL;
159                 panic("hammer2_chain_alloc: unrecognized blockref type: %d",
160                       bref->type);
161         }
162
163         /*
164          * Only set bref_flush if the bref has a real media offset, otherwise
165          * the caller has to wait for the chain to be modified/block-allocated
166          * before a blockref can be synchronized with its (future) parent.
167          */
168         chain->bref = *bref;
169         if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
170                 chain->bref_flush = *bref;
171         chain->index = -1;              /* not yet assigned */
172         chain->refs = 1;
173         chain->bytes = bytes;
174         ccms_cst_init(&chain->cst, chain);
175         ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
176
177         return (chain);
178 }
179
180 /*
181  * Deallocate a chain (the step before freeing it).  Remove the chain from
182  * its parent's tree.
183  *
184  * Caller must hold the parent and the chain exclusively locked, and
185  * chain->refs must be 0.
186  *
187  * This function unlocks, removes, and destroys chain, and will recursively
188  * destroy any sub-chains under chain (whos refs must also be 0 at this
189  * point).
190  *
191  * parent can be NULL.
192  */
193 static void
194 hammer2_chain_dealloc(hammer2_mount_t *hmp, hammer2_chain_t *chain)
195 {
196         hammer2_inode_t *ip;
197         hammer2_chain_t *parent;
198         hammer2_chain_t *child;
199
200         KKASSERT(chain->refs == 0);
201         KKASSERT(chain->flushing == 0);
202         KKASSERT((chain->flags &
203                   (HAMMER2_CHAIN_MOVED | HAMMER2_CHAIN_MODIFIED)) == 0);
204
205         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
206                 ip = chain->u.ip;
207         else
208                 ip = NULL;
209
210         /*
211          * If the sub-tree is not empty all the elements on it must have
212          * 0 refs and be deallocatable.
213          */
214         while ((child = RB_ROOT(&chain->rbhead)) != NULL) {
215                 ccms_thread_lock(&child->cst, CCMS_STATE_EXCLUSIVE);
216                 hammer2_chain_dealloc(hmp, child);
217         }
218
219         /*
220          * If the DELETED flag is not set the chain must be removed from
221          * its parent's tree.
222          *
223          * WARNING! chain->cst.spin must be held when chain->parent is
224          *          modified, even though we own the full blown lock,
225          *          to deal with setsubmod and rename races.
226          */
227         if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
228                 spin_lock(&chain->cst.spin);    /* shouldn't be needed */
229                 parent = chain->parent;
230                 RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
231                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
232                 if (ip)
233                         ip->pip = NULL;
234                 chain->parent = NULL;
235                 spin_unlock(&chain->cst.spin);
236         }
237
238         /*
239          * When cleaning out a hammer2_inode we must
240          * also clean out the related ccms_inode.
241          */
242         if (ip)
243                 ccms_cst_uninit(&ip->topo_cst);
244         hammer2_chain_free(hmp, chain);
245 }
246
247 /*
248  * Free a disconnected chain element
249  */
250 void
251 hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
252 {
253         void *mem;
254
255         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
256             chain->bref.type == HAMMER2_BREF_TYPE_VOLUME) {
257                 chain->data = NULL;
258         }
259
260         KKASSERT(chain->bp == NULL);
261         KKASSERT(chain->data == NULL);
262         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
263                  chain->u.ip->vp == NULL);
264         ccms_thread_unlock(&chain->cst);
265         KKASSERT(chain->cst.count == 0);
266         KKASSERT(chain->cst.upgrade == 0);
267
268         if ((mem = chain->u.mem) != NULL) {
269                 chain->u.mem = NULL;
270                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
271                         kfree(mem, hmp->minode);
272                 else
273                         kfree(mem, hmp->mchain);
274         }
275 }
276
277 /*
278  * Add a reference to a chain element, preventing its destruction.
279  *
280  * The parent chain must be locked shared or exclusive or otherwise be
281  * stable and already have a reference.
282  */
283 void
284 hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
285 {
286         u_int refs;
287
288         while (chain) {
289                 refs = chain->refs;
290                 KKASSERT(chain->refs >= 0);
291                 cpu_ccfence();
292                 if (refs == 0) {
293                         /*
294                          * 0 -> 1 transition must bump the refs on the parent
295                          * too.  The caller has stabilized the parent.
296                          */
297                         if (atomic_cmpset_int(&chain->refs, 0, 1)) {
298                                 chain = chain->parent;
299                                 KKASSERT(chain == NULL || chain->refs > 0);
300                         }
301                         /* retry or continue along the parent chain */
302                 } else {
303                         /*
304                          * N -> N+1
305                          */
306                         if (atomic_cmpset_int(&chain->refs, refs, refs + 1))
307                                 break;
308                         /* retry */
309                 }
310         }
311 }
312
313 /*
314  * Drop the callers reference to the chain element.  If the ref count
315  * reaches zero we attempt to recursively drop the parent.
316  *
317  * MOVED and MODIFIED elements hold additional references so it should not
318  * be possible for the count on a modified element to drop to 0.
319  *
320  * The chain element must NOT be locked by the caller on the 1->0 transition.
321  *
322  * The parent might or might not be locked by the caller.  If we are unable
323  * to lock the parent on the 1->0 transition the destruction of the chain
324  * will be deferred but we still recurse upward and drop the ref on the
325  * parent (see the lastdrop() function)
326  */
327 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_mount_t *hmp,
328                                                 hammer2_chain_t *chain);
329
330 void
331 hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
332 {
333         u_int refs;
334
335         while (chain) {
336                 refs = chain->refs;
337                 cpu_ccfence();
338                 KKASSERT(refs > 0);
339                 if (refs == 1) {
340                         /*
341                          * (1) lastdrop successfully drops the chain to 0
342                          *     refs and may may not have destroyed it.
343                          *     lastdrop will return the parent so we can
344                          *     recursively drop the implied ref from the
345                          *     1->0 transition.
346                          *
347                          * (2) lastdrop fails to transition refs from 1 to 0
348                          *     and returns the same chain, we retry.
349                          */
350                         chain = hammer2_chain_lastdrop(hmp, chain);
351                 } else {
352                         if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
353                                 /*
354                                  * Succeeded, count did not reach zero so
355                                  * cut out of the loop.
356                                  */
357                                 break;
358                         }
359                         /* retry the same chain */
360                 }
361         }
362 }
363
364 /*
365  * Handle SMP races during the last drop.  We must obtain a lock on
366  * chain->parent to stabilize the last pointer reference to chain
367  * (if applicable).  This reference does not have a parallel ref count,
368  * that is idle chains in the topology can have a ref count of 0.
369  *
370  * The 1->0 transition implies a ref on the parent.
371  */
372 static
373 hammer2_chain_t *
374 hammer2_chain_lastdrop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
375 {
376         hammer2_chain_t *parent;
377
378         /*
379          * Stablize chain->parent with the chain cst's spinlock.
380          * (parent can be NULL here).
381          *
382          * cst.spin locks are allowed to be nested bottom-up (the reverse
383          * of the normal top-down for full-blown cst locks), so this also
384          * allows us to attempt to obtain the parent's cst lock non-blocking
385          * (which must acquire the parent's spinlock unconditionally) while
386          * we are still holding the chain's spinlock.
387          */
388         spin_lock(&chain->cst.spin);
389         parent = chain->parent;
390
391         /*
392          * If chain->flushing is non-zero we cannot deallocate the chain
393          * here.  The flushing field will be serialized for the inline
394          * unlock made by the flusher itself and we don't care about races
395          * in any other situation because the thread lock on the parent
396          * will fail in other situations.
397          *
398          * If we have a non-NULL parent but cannot acquire its thread
399          * lock, we also cannot deallocate the chain.
400          */
401         if (chain->flushing ||
402             (parent && ccms_thread_lock_nonblock(&parent->cst,
403                                                  CCMS_STATE_EXCLUSIVE))) {
404                 if (atomic_cmpset_int(&chain->refs, 1, 0)) {
405                         spin_unlock(&chain->cst.spin);  /* success */
406                         return(parent);
407                 } else {
408                         spin_unlock(&chain->cst.spin);  /* failure */
409                         return(chain);
410                 }
411         }
412         spin_unlock(&chain->cst.spin);
413
414         /*
415          * With the parent now held we control the last pointer reference
416          * to chain ONLY IF this is the 1->0 drop.  If we fail to transition
417          * from 1->0 we raced a refs change and must retry at chain.
418          */
419         if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
420                 /* failure */
421                 if (parent)
422                         ccms_thread_unlock(&parent->cst);
423                 return(chain);
424         }
425
426         /*
427          * Ok, we succeeded.  We now own the implied ref on the parent
428          * associated with the 1->0 transition of the child.  It should not
429          * be possible for ANYTHING to access the child now, as we own the
430          * lock on the parent, so we should be able to safely lock the
431          * child and destroy it.
432          */
433         ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
434         hammer2_chain_dealloc(hmp, chain);
435
436         /*
437          * We want to return parent with its implied ref to the caller
438          * to recurse and drop the parent.
439          */
440         if (parent)
441                 ccms_thread_unlock(&parent->cst);
442         return (parent);
443 }
444
445 /*
446  * Ref and lock a chain element, acquiring its data with I/O if necessary,
447  * and specify how you would like the data to be resolved.
448  *
449  * Returns 0 on success or an error code if the data could not be acquired.
450  * The chain element is locked either way.
451  *
452  * The lock is allowed to recurse, multiple locking ops will aggregate
453  * the requested resolve types.  Once data is assigned it will not be
454  * removed until the last unlock.
455  *
456  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
457  *                         (typically used to avoid device/logical buffer
458  *                          aliasing for data)
459  *
460  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
461  *                         the INITIAL-create state (indirect blocks only).
462  *
463  *                         Do not resolve data elements for DATA chains.
464  *                         (typically used to avoid device/logical buffer
465  *                          aliasing for data)
466  *
467  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
468  *
469  * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
470  *                         it will be locked exclusive.
471  *
472  * NOTE: Embedded elements (volume header, inodes) are always resolved
473  *       regardless.
474  *
475  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
476  *       element will instantiate and zero its buffer, and flush it on
477  *       release.
478  *
479  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
480  *       so as not to instantiate a device buffer, which could alias against
481  *       a logical file buffer.  However, if ALWAYS is specified the
482  *       device buffer will be instantiated anyway.
483  */
484 int
485 hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
486 {
487         hammer2_blockref_t *bref;
488         hammer2_off_t pbase;
489         hammer2_off_t peof;
490         ccms_state_t ostate;
491         size_t boff;
492         size_t bbytes;
493         int error;
494         char *bdata;
495
496         /*
497          * Ref and lock the element.  Recursive locks are allowed.
498          */
499         hammer2_chain_ref(hmp, chain);
500         if (how & HAMMER2_RESOLVE_SHARED)
501                 ccms_thread_lock(&chain->cst, CCMS_STATE_SHARED);
502         else
503                 ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
504
505         /*
506          * If we already have a valid data pointer no further action is
507          * necessary.
508          */
509         if (chain->data)
510                 return (0);
511
512         /*
513          * Do we have to resolve the data?
514          */
515         switch(how & HAMMER2_RESOLVE_MASK) {
516         case HAMMER2_RESOLVE_NEVER:
517                 return(0);
518         case HAMMER2_RESOLVE_MAYBE:
519                 if (chain->flags & HAMMER2_CHAIN_INITIAL)
520                         return(0);
521                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
522                         return(0);
523                 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
524                         return(0);
525                 /* fall through */
526         case HAMMER2_RESOLVE_ALWAYS:
527                 break;
528         }
529
530         /*
531          * Upgrade to an exclusive lock so we can safely manipulate the
532          * buffer cache.  If another thread got to it before us we
533          * can just return.
534          */
535         ostate = ccms_thread_lock_upgrade(&chain->cst);
536         if (chain->data) {
537                 ccms_thread_lock_restore(&chain->cst, ostate);
538                 return (0);
539         }
540
541         /*
542          * We must resolve to a device buffer, either by issuing I/O or
543          * by creating a zero-fill element.  We do not mark the buffer
544          * dirty when creating a zero-fill element (the hammer2_chain_modify()
545          * API must still be used to do that).
546          *
547          * The device buffer is variable-sized in powers of 2 down
548          * to HAMMER2_MINALLOCSIZE (typically 1K).  A 64K physical storage
549          * chunk always contains buffers of the same size. (XXX)
550          *
551          * The minimum physical IO size may be larger than the variable
552          * block size.
553          */
554         bref = &chain->bref;
555
556         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
557                 bbytes = HAMMER2_MINIOSIZE;
558         pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
559         peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
560         boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
561         KKASSERT(pbase != 0);
562
563         /*
564          * The getblk() optimization can only be used on newly created
565          * elements if the physical block size matches the request.
566          */
567         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
568             chain->bytes == bbytes) {
569                 chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
570                 error = 0;
571         } else if (hammer2_cluster_enable) {
572                 error = cluster_read(hmp->devvp, peof, pbase, bbytes,
573                                      HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
574                                      &chain->bp);
575         } else {
576                 error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
577         }
578
579         if (error) {
580                 kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
581                         (intmax_t)pbase, error);
582                 bqrelse(chain->bp);
583                 chain->bp = NULL;
584                 ccms_thread_lock_restore(&chain->cst, ostate);
585                 return (error);
586         }
587
588         /*
589          * Zero the data area if the chain is in the INITIAL-create state.
590          * Mark the buffer for bdwrite().
591          */
592         bdata = (char *)chain->bp->b_data + boff;
593         if (chain->flags & HAMMER2_CHAIN_INITIAL) {
594                 bzero(bdata, chain->bytes);
595                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
596         }
597
598         /*
599          * Setup the data pointer, either pointing it to an embedded data
600          * structure and copying the data from the buffer, or pointing it
601          * into the buffer.
602          *
603          * The buffer is not retained when copying to an embedded data
604          * structure in order to avoid potential deadlocks or recursions
605          * on the same physical buffer.
606          */
607         switch (bref->type) {
608         case HAMMER2_BREF_TYPE_VOLUME:
609                 /*
610                  * Copy data from bp to embedded buffer
611                  */
612                 panic("hammer2_chain_lock: called on unresolved volume header");
613 #if 0
614                 /* NOT YET */
615                 KKASSERT(pbase == 0);
616                 KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
617                 bcopy(bdata, &hmp->voldata, chain->bytes);
618                 chain->data = (void *)&hmp->voldata;
619                 bqrelse(chain->bp);
620                 chain->bp = NULL;
621 #endif
622                 break;
623         case HAMMER2_BREF_TYPE_INODE:
624                 /*
625                  * Copy data from bp to embedded buffer, do not retain the
626                  * device buffer.
627                  */
628                 bcopy(bdata, &chain->u.ip->ip_data, chain->bytes);
629                 chain->data = (void *)&chain->u.ip->ip_data;
630                 bqrelse(chain->bp);
631                 chain->bp = NULL;
632                 break;
633         case HAMMER2_BREF_TYPE_INDIRECT:
634         case HAMMER2_BREF_TYPE_DATA:
635         case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
636         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
637         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
638         default:
639                 /*
640                  * Point data at the device buffer and leave bp intact.
641                  */
642                 chain->data = (void *)bdata;
643                 break;
644         }
645
646         /*
647          * Make sure the bp is not specifically owned by this thread before
648          * restoring to a possibly shared lock, so another hammer2 thread
649          * can release it.
650          */
651         if (chain->bp)
652                 BUF_KERNPROC(chain->bp);
653         ccms_thread_lock_restore(&chain->cst, ostate);
654         return (0);
655 }
656
657 /*
658  * Unlock and deref a chain element.
659  *
660  * On the last lock release any non-embedded data (chain->bp) will be
661  * retired.
662  */
663 void
664 hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
665 {
666         long *counterp;
667
668         /*
669          * Release the CST lock but with a special 1->0 transition case.
670          *
671          * Returns non-zero if lock references remain.  When zero is
672          * returned the last lock reference is retained and any shared
673          * lock is upgraded to an exclusive lock for final disposition.
674          */
675         if (ccms_thread_unlock_zero(&chain->cst)) {
676                 KKASSERT(chain->refs > 1);
677                 atomic_add_int(&chain->refs, -1);
678                 return;
679         }
680
681         /*
682          * Shortcut the case if the data is embedded or not resolved.
683          *
684          * Do NOT null-out pointers to embedded data (e.g. inode).
685          *
686          * The DIRTYBP flag is non-applicable in this situation and can
687          * be cleared to keep the flags state clean.
688          */
689         if (chain->bp == NULL) {
690                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
691                 ccms_thread_unlock(&chain->cst);
692                 hammer2_chain_drop(hmp, chain);
693                 return;
694         }
695
696         /*
697          * Statistics
698          */
699         if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
700                 ;
701         } else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
702                 switch(chain->bref.type) {
703                 case HAMMER2_BREF_TYPE_DATA:
704                         counterp = &hammer2_ioa_file_write;
705                         break;
706                 case HAMMER2_BREF_TYPE_INODE:
707                         counterp = &hammer2_ioa_meta_write;
708                         break;
709                 case HAMMER2_BREF_TYPE_INDIRECT:
710                         counterp = &hammer2_ioa_indr_write;
711                         break;
712                 case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
713                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
714                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
715                         counterp = &hammer2_ioa_fmap_write;
716                         break;
717                 default:
718                         counterp = &hammer2_ioa_volu_write;
719                         break;
720                 }
721                 ++*counterp;
722         } else {
723                 switch(chain->bref.type) {
724                 case HAMMER2_BREF_TYPE_DATA:
725                         counterp = &hammer2_iod_file_write;
726                         break;
727                 case HAMMER2_BREF_TYPE_INODE:
728                         counterp = &hammer2_iod_meta_write;
729                         break;
730                 case HAMMER2_BREF_TYPE_INDIRECT:
731                         counterp = &hammer2_iod_indr_write;
732                         break;
733                 case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
734                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
735                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
736                         counterp = &hammer2_iod_fmap_write;
737                         break;
738                 default:
739                         counterp = &hammer2_iod_volu_write;
740                         break;
741                 }
742                 ++*counterp;
743         }
744
745         /*
746          * Clean out the bp.
747          *
748          * If a device buffer was used for data be sure to destroy the
749          * buffer when we are done to avoid aliases (XXX what about the
750          * underlying VM pages?).
751          *
752          * NOTE: Freemap leaf's use reserved blocks and thus no aliasing
753          *       is possible.
754          */
755         if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
756                 chain->bp->b_flags |= B_RELBUF;
757
758         /*
759          * The DIRTYBP flag tracks whether we have to bdwrite() the buffer
760          * or not.  The flag will get re-set when chain_modify() is called,
761          * even if MODIFIED is already set, allowing the OS to retire the
762          * buffer independent of a hammer2 flus.
763          */
764         chain->data = NULL;
765         if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
766                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
767                 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
768                         atomic_clear_int(&chain->flags,
769                                          HAMMER2_CHAIN_IOFLUSH);
770                         chain->bp->b_flags |= B_RELBUF;
771                         cluster_awrite(chain->bp);
772                 } else {
773                         chain->bp->b_flags |= B_CLUSTEROK;
774                         bdwrite(chain->bp);
775                 }
776         } else {
777                 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
778                         atomic_clear_int(&chain->flags,
779                                          HAMMER2_CHAIN_IOFLUSH);
780                         chain->bp->b_flags |= B_RELBUF;
781                         brelse(chain->bp);
782                 } else {
783                         /* bp might still be dirty */
784                         bqrelse(chain->bp);
785                 }
786         }
787         chain->bp = NULL;
788         ccms_thread_unlock(&chain->cst);
789         hammer2_chain_drop(hmp, chain);
790 }
791
792 /*
793  * Resize the chain's physical storage allocation.  Chains can be resized
794  * smaller without reallocating the storage.  Resizing larger will reallocate
795  * the storage.
796  *
797  * Must be passed a locked chain.
798  *
799  * If you want the resize code to copy the data to the new block then the
800  * caller should lock the chain RESOLVE_MAYBE or RESOLVE_ALWAYS.
801  *
802  * If the caller already holds a logical buffer containing the data and
803  * intends to bdwrite() that buffer resolve with RESOLVE_NEVER.  The resize
804  * operation will then not copy the data.
805  *
806  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
807  * to avoid instantiating a device buffer that conflicts with the vnode
808  * data buffer.
809  *
810  * XXX flags currently ignored, uses chain->bp to detect data/no-data.
811  */
812 void
813 hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
814                      int nradix, int flags)
815 {
816         hammer2_mount_t *hmp = ip->hmp;
817         struct buf *nbp;
818         hammer2_off_t pbase;
819         size_t obytes;
820         size_t nbytes;
821         size_t bbytes;
822         int boff;
823         char *bdata;
824         int error;
825
826         /*
827          * Only data and indirect blocks can be resized for now.
828          * (The volu root, inodes, and freemap elements use a fixed size).
829          */
830         KKASSERT(chain != &hmp->vchain);
831         KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
832                  chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
833
834         /*
835          * Nothing to do if the element is already the proper size
836          */
837         obytes = chain->bytes;
838         nbytes = 1U << nradix;
839         if (obytes == nbytes)
840                 return;
841
842         /*
843          * Set MODIFIED and add a chain ref to prevent destruction.  Both
844          * modified flags share the same ref.
845          *
846          * If the chain is already marked MODIFIED then we can safely
847          * return the previous allocation to the pool without having to
848          * worry about snapshots.
849          */
850         if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
851                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
852                                               HAMMER2_CHAIN_MODIFY_TID);
853                 hammer2_chain_ref(hmp, chain);
854         } else {
855                 hammer2_freemap_free(hmp, chain->bref.data_off,
856                                      chain->bref.type);
857         }
858
859         /*
860          * Relocate the block, even if making it smaller (because different
861          * block sizes may be in different regions).
862          */
863         chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bref.type,
864                                                      nbytes);
865         chain->bytes = nbytes;
866         ip->delta_dcount += (ssize_t)(nbytes - obytes); /* XXX atomic */
867
868         /*
869          * The device buffer may be larger than the allocation size.
870          */
871         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
872                 bbytes = HAMMER2_MINIOSIZE;
873         pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
874         boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
875
876         /*
877          * Only copy the data if resolved, otherwise the caller is
878          * responsible.
879          */
880         if (chain->bp) {
881                 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
882                          chain->bref.type == HAMMER2_BREF_TYPE_DATA);
883                 KKASSERT(chain != &hmp->vchain);        /* safety */
884
885                 /*
886                  * The getblk() optimization can only be used if the
887                  * physical block size matches the request.
888                  */
889                 if (nbytes == bbytes) {
890                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
891                         error = 0;
892                 } else {
893                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
894                         KKASSERT(error == 0);
895                 }
896                 bdata = (char *)nbp->b_data + boff;
897
898                 if (nbytes < obytes) {
899                         bcopy(chain->data, bdata, nbytes);
900                 } else {
901                         bcopy(chain->data, bdata, obytes);
902                         bzero(bdata + obytes, nbytes - obytes);
903                 }
904
905                 /*
906                  * NOTE: The INITIAL state of the chain is left intact.
907                  *       We depend on hammer2_chain_modify() to do the
908                  *       right thing.
909                  *
910                  * NOTE: We set B_NOCACHE to throw away the previous bp and
911                  *       any VM backing store, even if it was dirty.
912                  *       Otherwise we run the risk of a logical/device
913                  *       conflict on reallocation.
914                  */
915                 chain->bp->b_flags |= B_RELBUF | B_NOCACHE;
916                 brelse(chain->bp);
917                 chain->bp = nbp;
918                 chain->data = (void *)bdata;
919                 hammer2_chain_modify(hmp, chain, 0);
920         }
921
922         /*
923          * Make sure the chain is marked MOVED and SUBMOD is set in the
924          * parent(s) so the adjustments are picked up by flush.
925          */
926         if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
927                 hammer2_chain_ref(hmp, chain);
928                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
929         }
930         hammer2_chain_parent_setsubmod(hmp, chain);
931 }
932
933 /*
934  * Convert a locked chain that was retrieved read-only to read-write.
935  *
936  * If not already marked modified a new physical block will be allocated
937  * and assigned to the bref.
938  *
939  * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
940  *                   level or the COW operation will not work.
941  *
942  * Data blocks     - The chain is usually locked RESOLVE_NEVER so as not to
943  *                   run the data through the device buffers.
944  */
945 void
946 hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
947 {
948         struct buf *nbp;
949         int error;
950         hammer2_off_t pbase;
951         size_t bbytes;
952         size_t boff;
953         void *bdata;
954
955         /*
956          * Tells flush that modify_tid must be updated, otherwise only
957          * mirror_tid is updated.  This is the default.
958          */
959         if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
960                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFY_TID);
961
962         /*
963          * If the chain is already marked MODIFIED we can just return.
964          *
965          * However, it is possible that a prior lock/modify sequence
966          * retired the buffer.  During this lock/modify sequence MODIFIED
967          * may still be set but the buffer could wind up clean.  Since
968          * the caller is going to modify the buffer further we have to
969          * be sure that DIRTYBP is set again.
970          */
971         if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
972                 if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
973                     chain->bp == NULL) {
974                         goto skip1;
975                 }
976                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
977                 return;
978         }
979
980         /*
981          * Set MODIFIED and add a chain ref to prevent destruction.  Both
982          * modified flags share the same ref.
983          */
984         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
985         hammer2_chain_ref(hmp, chain);
986
987         /*
988          * We must allocate the copy-on-write block.
989          *
990          * If the data is embedded no other action is required.
991          *
992          * If the data is not embedded we acquire and clear the
993          * new block.  If chain->data is not NULL we then do the
994          * copy-on-write.  chain->data will then be repointed to the new
995          * buffer and the old buffer will be released.
996          *
997          * For newly created elements with no prior allocation we go
998          * through the copy-on-write steps except without the copying part.
999          */
1000         if (chain != &hmp->vchain) {
1001                 if ((hammer2_debug & 0x0001) &&
1002                     (chain->bref.data_off & HAMMER2_OFF_MASK)) {
1003                         kprintf("Replace %d\n", chain->bytes);
1004                 }
1005                 chain->bref.data_off =
1006                         hammer2_freemap_alloc(hmp, chain->bref.type,
1007                                               chain->bytes);
1008                 /* XXX failed allocation */
1009         }
1010
1011         /*
1012          * If data instantiation is optional and the chain has no current
1013          * data association (typical for DATA and newly-created INDIRECT
1014          * elements), don't instantiate the buffer now.
1015          */
1016         if ((flags & HAMMER2_MODIFY_OPTDATA) && chain->bp == NULL)
1017                 goto skip2;
1018
1019 skip1:
1020         /*
1021          * Setting the DIRTYBP flag will cause the buffer to be dirtied or
1022          * written-out on unlock.  This bit is independent of the MODIFIED
1023          * bit because the chain may still need meta-data adjustments done
1024          * by virtue of MODIFIED for its parent, and the buffer can be
1025          * flushed out (possibly multiple times) by the OS before that.
1026          *
1027          * Clearing the INITIAL flag (for indirect blocks) indicates that
1028          * a zero-fill buffer has been instantiated.
1029          */
1030         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
1031         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1032
1033         /*
1034          * We currently should never instantiate a device buffer for a
1035          * file data chain.  (We definitely can for a freemap chain).
1036          */
1037         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
1038
1039         /*
1040          * Execute COW operation
1041          */
1042         switch(chain->bref.type) {
1043         case HAMMER2_BREF_TYPE_VOLUME:
1044         case HAMMER2_BREF_TYPE_INODE:
1045                 /*
1046                  * The data is embedded, no copy-on-write operation is
1047                  * needed.
1048                  */
1049                 KKASSERT(chain->bp == NULL);
1050                 break;
1051         case HAMMER2_BREF_TYPE_DATA:
1052         case HAMMER2_BREF_TYPE_INDIRECT:
1053         case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
1054         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1055         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1056                 /*
1057                  * Perform the copy-on-write operation
1058                  */
1059                 KKASSERT(chain != &hmp->vchain);        /* safety */
1060                 /*
1061                  * The device buffer may be larger than the allocation size.
1062                  */
1063                 if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
1064                         bbytes = HAMMER2_MINIOSIZE;
1065                 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
1066                 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
1067
1068                 /*
1069                  * The getblk() optimization can only be used if the
1070                  * physical block size matches the request.
1071                  */
1072                 if (chain->bytes == bbytes) {
1073                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
1074                         error = 0;
1075                 } else {
1076                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
1077                         KKASSERT(error == 0);
1078                 }
1079                 bdata = (char *)nbp->b_data + boff;
1080
1081                 /*
1082                  * Copy or zero-fill on write depending on whether
1083                  * chain->data exists or not.
1084                  */
1085                 if (chain->data) {
1086                         bcopy(chain->data, bdata, chain->bytes);
1087                         KKASSERT(chain->bp != NULL);
1088                 } else {
1089                         bzero(bdata, chain->bytes);
1090                 }
1091                 if (chain->bp) {
1092                         chain->bp->b_flags |= B_RELBUF;
1093                         brelse(chain->bp);
1094                 }
1095                 chain->bp = nbp;
1096                 chain->data = bdata;
1097                 break;
1098         default:
1099                 panic("hammer2_chain_modify: illegal non-embedded type %d",
1100                       chain->bref.type);
1101                 break;
1102
1103         }
1104 skip2:
1105         if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
1106                 hammer2_chain_parent_setsubmod(hmp, chain);
1107 }
1108
1109 /*
1110  * Mark the volume as having been modified.  This short-cut version
1111  * does not have to lock the volume's chain, which allows the ioctl
1112  * code to make adjustments to connections without deadlocking.
1113  */
1114 void
1115 hammer2_modify_volume(hammer2_mount_t *hmp)
1116 {
1117         hammer2_voldata_lock(hmp);
1118         atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
1119         hammer2_voldata_unlock(hmp);
1120 }
1121
1122 /*
1123  * Locate an in-memory chain.  The parent must be locked.  The in-memory
1124  * chain is returned or NULL if no in-memory chain is present.
1125  *
1126  * NOTE: A chain on-media might exist for this index when NULL is returned.
1127  */
1128 hammer2_chain_t *
1129 hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
1130 {
1131         hammer2_chain_t dummy;
1132         hammer2_chain_t *chain;
1133
1134         dummy.index = index;
1135         chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
1136         return (chain);
1137 }
1138
1139 /*
1140  * Return a locked chain structure with all associated data acquired.
1141  *
1142  * Caller must lock the parent on call, the returned child will be locked.
1143  */
1144 hammer2_chain_t *
1145 hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1146                   int index, int flags)
1147 {
1148         hammer2_blockref_t *bref;
1149         hammer2_inode_t *ip;
1150         hammer2_chain_t *chain;
1151         hammer2_chain_t dummy;
1152         int how;
1153         ccms_state_t ostate;
1154
1155         /*
1156          * Figure out how to lock.  MAYBE can be used to optimized
1157          * the initial-create state for indirect blocks.
1158          */
1159         if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
1160                 how = HAMMER2_RESOLVE_NEVER;
1161         else
1162                 how = HAMMER2_RESOLVE_MAYBE;
1163         if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
1164                 how |= HAMMER2_RESOLVE_SHARED;
1165
1166         /*
1167          * First see if we have a (possibly modified) chain element cached
1168          * for this (parent, index).  Acquire the data if necessary.
1169          *
1170          * If chain->data is non-NULL the chain should already be marked
1171          * modified.
1172          */
1173         dummy.index = index;
1174         chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
1175         if (chain) {
1176                 if (flags & HAMMER2_LOOKUP_NOLOCK)
1177                         hammer2_chain_ref(hmp, chain);
1178                 else
1179                         hammer2_chain_lock(hmp, chain, how);
1180                 return(chain);
1181         }
1182
1183         /*
1184          * Upgrade our thread lock and handle any race that may have
1185          * occurred.  Leave the lock upgraded for the rest of the get.
1186          * We have to do this because we will be modifying the chain
1187          * structure.
1188          */
1189         ostate = ccms_thread_lock_upgrade(&parent->cst);
1190         chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
1191         if (chain) {
1192                 if (flags & HAMMER2_LOOKUP_NOLOCK)
1193                         hammer2_chain_ref(hmp, chain);
1194                 else
1195                         hammer2_chain_lock(hmp, chain, how);
1196                 ccms_thread_lock_restore(&parent->cst, ostate);
1197                 return(chain);
1198         }
1199
1200         /*
1201          * The get function must always succeed, panic if there's no
1202          * data to index.
1203          */
1204         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1205                 ccms_thread_lock_restore(&parent->cst, ostate);
1206                 panic("hammer2_chain_get: Missing bref(1)");
1207                 /* NOT REACHED */
1208         }
1209
1210         /*
1211          * Otherwise lookup the bref and issue I/O (switch on the parent)
1212          */
1213         switch(parent->bref.type) {
1214         case HAMMER2_BREF_TYPE_INODE:
1215                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1216                 bref = &parent->data->ipdata.u.blockset.blockref[index];
1217                 break;
1218         case HAMMER2_BREF_TYPE_INDIRECT:
1219         case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
1220         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1221                 KKASSERT(parent->data != NULL);
1222                 KKASSERT(index >= 0 &&
1223                          index < parent->bytes / sizeof(hammer2_blockref_t));
1224                 bref = &parent->data->npdata.blockref[index];
1225                 break;
1226         case HAMMER2_BREF_TYPE_VOLUME:
1227                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1228                 bref = &hmp->voldata.sroot_blockset.blockref[index];
1229                 break;
1230         default:
1231                 bref = NULL;
1232                 panic("hammer2_chain_get: unrecognized blockref type: %d",
1233                       parent->bref.type);
1234         }
1235         if (bref->type == 0) {
1236                 panic("hammer2_chain_get: Missing bref(2)");
1237                 /* NOT REACHED */
1238         }
1239
1240         /*
1241          * Allocate a chain structure representing the existing media
1242          * entry.
1243          *
1244          * The locking operation we do later will issue I/O to read it.
1245          */
1246         chain = hammer2_chain_alloc(hmp, bref);
1247
1248         /*
1249          * Link the chain into its parent.  Caller is expected to hold an
1250          * exclusive lock on the parent.
1251          */
1252         chain->parent = parent;
1253         chain->index = index;
1254         if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
1255                 panic("hammer2_chain_link: collision");
1256         atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
1257         KKASSERT(parent->refs > 0);
1258         atomic_add_int(&parent->refs, 1);       /* for red-black entry */
1259         ccms_thread_lock_restore(&parent->cst, ostate);
1260
1261         /*
1262          * Additional linkage for inodes.  Reuse the parent pointer to
1263          * find the parent directory.
1264          *
1265          * The ccms_inode is initialized from its parent directory.  The
1266          * chain of ccms_inode's is seeded by the mount code.
1267          */
1268         if (bref->type == HAMMER2_BREF_TYPE_INODE) {
1269                 ip = chain->u.ip;
1270                 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1271                         parent = parent->parent;
1272                 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
1273                         ip->pip = parent->u.ip;
1274                         ip->pmp = parent->u.ip->pmp;
1275                         ccms_cst_init(&ip->topo_cst, &ip->chain);
1276                 }
1277         }
1278
1279         /*
1280          * Our new chain structure has already been referenced and locked
1281          * but the lock code handles the I/O so call it to resolve the data.
1282          * Then release one of our two exclusive locks.
1283          *
1284          * If NOLOCK is set the release will release the one-and-only lock.
1285          */
1286         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1287                 hammer2_chain_lock(hmp, chain, how);    /* recusive lock */
1288                 hammer2_chain_drop(hmp, chain);         /* excess ref */
1289         }
1290         ccms_thread_unlock(&chain->cst);                        /* from alloc */
1291
1292         return (chain);
1293 }
1294
1295 /*
1296  * Locate any key between key_beg and key_end inclusive.  (*parentp)
1297  * typically points to an inode but can also point to a related indirect
1298  * block and this function will recurse upwards and find the inode again.
1299  *
1300  * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
1301  *           WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
1302  *           WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
1303  *
1304  * (*parentp) must be exclusively locked and referenced and can be an inode
1305  * or an existing indirect block within the inode.
1306  *
1307  * On return (*parentp) will be modified to point at the deepest parent chain
1308  * element encountered during the search, as a helper for an insertion or
1309  * deletion.   The new (*parentp) will be locked and referenced and the old
1310  * will be unlocked and dereferenced (no change if they are both the same).
1311  *
1312  * The matching chain will be returned exclusively locked and referenced.
1313  *
1314  * NULL is returned if no match was found, but (*parentp) will still
1315  * potentially be adjusted.
1316  *
1317  * This function will also recurse up the chain if the key is not within the
1318  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
1319  * can simply allow (*parentp) to float inside the loop.
1320  */
1321 hammer2_chain_t *
1322 hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
1323                      hammer2_key_t key_beg, hammer2_key_t key_end,
1324                      int flags)
1325 {
1326         hammer2_chain_t *parent;
1327         hammer2_chain_t *chain;
1328         hammer2_chain_t *tmp;
1329         hammer2_blockref_t *base;
1330         hammer2_blockref_t *bref;
1331         hammer2_key_t scan_beg;
1332         hammer2_key_t scan_end;
1333         int count = 0;
1334         int i;
1335         int how_always = HAMMER2_RESOLVE_ALWAYS;
1336         int how_maybe = HAMMER2_RESOLVE_MAYBE;
1337
1338         if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) {
1339                 how_maybe |= HAMMER2_RESOLVE_SHARED;
1340                 how_always |= HAMMER2_RESOLVE_SHARED;
1341         }
1342
1343         /*
1344          * Recurse (*parentp) upward if necessary until the parent completely
1345          * encloses the key range or we hit the inode.
1346          */
1347         parent = *parentp;
1348         while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
1349                parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1350                 scan_beg = parent->bref.key;
1351                 scan_end = scan_beg +
1352                            ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1353                 if (key_beg >= scan_beg && key_end <= scan_end)
1354                         break;
1355                 hammer2_chain_ref(hmp, parent);         /* ref old parent */
1356                 hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
1357                 parent = parent->parent;
1358                                                         /* lock new parent */
1359                 hammer2_chain_lock(hmp, parent, how_maybe);
1360                 hammer2_chain_drop(hmp, *parentp);      /* drop old parent */
1361                 *parentp = parent;                      /* new parent */
1362         }
1363
1364 again:
1365         /*
1366          * Locate the blockref array.  Currently we do a fully associative
1367          * search through the array.
1368          */
1369         switch(parent->bref.type) {
1370         case HAMMER2_BREF_TYPE_INODE:
1371                 /*
1372                  * Special shortcut for embedded data returns the inode
1373                  * itself.  Callers must detect this condition and access
1374                  * the embedded data (the strategy code does this for us).
1375                  *
1376                  * This is only applicable to regular files and softlinks.
1377                  */
1378                 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1379                         if (flags & HAMMER2_LOOKUP_NOLOCK)
1380                                 hammer2_chain_ref(hmp, parent);
1381                         else
1382                                 hammer2_chain_lock(hmp, parent, how_always);
1383                         return (parent);
1384                 }
1385                 base = &parent->data->ipdata.u.blockset.blockref[0];
1386                 count = HAMMER2_SET_COUNT;
1387                 break;
1388         case HAMMER2_BREF_TYPE_INDIRECT:
1389         case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
1390         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1391                 /*
1392                  * Optimize indirect blocks in the INITIAL state to avoid
1393                  * I/O.
1394                  */
1395                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1396                         base = NULL;
1397                 } else {
1398                         if (parent->data == NULL)
1399                                 panic("parent->data is NULL");
1400                         base = &parent->data->npdata.blockref[0];
1401                 }
1402                 count = parent->bytes / sizeof(hammer2_blockref_t);
1403                 break;
1404         case HAMMER2_BREF_TYPE_VOLUME:
1405                 base = &hmp->voldata.sroot_blockset.blockref[0];
1406                 count = HAMMER2_SET_COUNT;
1407                 break;
1408         default:
1409                 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
1410                       parent->bref.type);
1411                 base = NULL;    /* safety */
1412                 count = 0;      /* safety */
1413         }
1414
1415         /*
1416          * If the element and key overlap we use the element.
1417          *
1418          * NOTE!  Deleted elements are effectively invisible.  A Deleted
1419          *        elements covers (makes invisible) any original media
1420          *        data.
1421          */
1422         bref = NULL;
1423         for (i = 0; i < count; ++i) {
1424                 tmp = hammer2_chain_find(hmp, parent, i);
1425                 if (tmp) {
1426                         if (tmp->flags & HAMMER2_CHAIN_DELETED)
1427                                 continue;
1428                         bref = &tmp->bref;
1429                         KKASSERT(bref->type != 0);
1430                 } else if (base == NULL || base[i].type == 0) {
1431                         continue;
1432                 } else {
1433                         bref = &base[i];
1434                 }
1435                 scan_beg = bref->key;
1436                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1437                 if (key_beg <= scan_end && key_end >= scan_beg)
1438                         break;
1439         }
1440         if (i == count) {
1441                 if (key_beg == key_end)
1442                         return (NULL);
1443                 return (hammer2_chain_next(hmp, parentp, NULL,
1444                                            key_beg, key_end, flags));
1445         }
1446
1447         /*
1448          * Acquire the new chain element.  If the chain element is an
1449          * indirect block we must search recursively.
1450          */
1451         chain = hammer2_chain_get(hmp, parent, i, flags);
1452         if (chain == NULL)
1453                 return (NULL);
1454
1455         /*
1456          * If the chain element is an indirect block it becomes the new
1457          * parent and we loop on it.
1458          *
1459          * The parent always has to be locked with at least RESOLVE_MAYBE,
1460          * so it might need a fixup if the caller passed incompatible flags.
1461          */
1462         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
1463             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1464                 hammer2_chain_unlock(hmp, parent);
1465                 *parentp = parent = chain;
1466                 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1467                         hammer2_chain_lock(hmp, chain, how_maybe);
1468                         hammer2_chain_drop(hmp, chain); /* excess ref */
1469                 } else if (flags & HAMMER2_LOOKUP_NODATA) {
1470                         hammer2_chain_lock(hmp, chain, how_maybe);
1471                         hammer2_chain_unlock(hmp, chain);
1472                 }
1473                 goto again;
1474         }
1475
1476         /*
1477          * All done, return chain
1478          */
1479         return (chain);
1480 }
1481
1482 /*
1483  * After having issued a lookup we can iterate all matching keys.
1484  *
1485  * If chain is non-NULL we continue the iteration from just after it's index.
1486  *
1487  * If chain is NULL we assume the parent was exhausted and continue the
1488  * iteration at the next parent.
1489  *
1490  * parent must be locked on entry and remains locked throughout.  chain's
1491  * lock status must match flags.
1492  */
1493 hammer2_chain_t *
1494 hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
1495                    hammer2_chain_t *chain,
1496                    hammer2_key_t key_beg, hammer2_key_t key_end,
1497                    int flags)
1498 {
1499         hammer2_chain_t *parent;
1500         hammer2_chain_t *tmp;
1501         hammer2_blockref_t *base;
1502         hammer2_blockref_t *bref;
1503         hammer2_key_t scan_beg;
1504         hammer2_key_t scan_end;
1505         int i;
1506         int how_maybe = HAMMER2_RESOLVE_MAYBE;
1507         int count;
1508
1509         if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
1510                 how_maybe |= HAMMER2_RESOLVE_SHARED;
1511
1512         parent = *parentp;
1513
1514 again:
1515         /*
1516          * Calculate the next index and recalculate the parent if necessary.
1517          */
1518         if (chain) {
1519                 /*
1520                  * Continue iteration within current parent.  If not NULL
1521                  * the passed-in chain may or may not be locked, based on
1522                  * the LOOKUP_NOLOCK flag (passed in as returned from lookup
1523                  * or a prior next).
1524                  */
1525                 i = chain->index + 1;
1526                 if (flags & HAMMER2_LOOKUP_NOLOCK)
1527                         hammer2_chain_drop(hmp, chain);
1528                 else
1529                         hammer2_chain_unlock(hmp, chain);
1530
1531                 /*
1532                  * Any scan where the lookup returned degenerate data embedded
1533                  * in the inode has an invalid index and must terminate.
1534                  */
1535                 if (chain == parent)
1536                         return(NULL);
1537                 chain = NULL;
1538         } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
1539                    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1540                 /*
1541                  * We reached the end of the iteration.
1542                  */
1543                 return (NULL);
1544         } else {
1545                 /*
1546                  * Continue iteration with next parent unless the current
1547                  * parent covers the range.
1548                  */
1549                 hammer2_chain_t *nparent;
1550
1551                 scan_beg = parent->bref.key;
1552                 scan_end = scan_beg +
1553                             ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1554                 if (key_beg >= scan_beg && key_end <= scan_end)
1555                         return (NULL);
1556
1557                 i = parent->index + 1;
1558                 nparent = parent->parent;
1559                 hammer2_chain_ref(hmp, nparent);        /* ref new parent */
1560                 hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
1561                                                         /* lock new parent */
1562                 hammer2_chain_lock(hmp, nparent, how_maybe);
1563                 hammer2_chain_drop(hmp, nparent);       /* drop excess ref */
1564                 *parentp = parent = nparent;
1565         }
1566
1567 again2:
1568         /*
1569          * Locate the blockref array.  Currently we do a fully associative
1570          * search through the array.
1571          */
1572         switch(parent->bref.type) {
1573         case HAMMER2_BREF_TYPE_INODE:
1574                 base = &parent->data->ipdata.u.blockset.blockref[0];
1575                 count = HAMMER2_SET_COUNT;
1576                 break;
1577         case HAMMER2_BREF_TYPE_INDIRECT:
1578         case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
1579         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1580                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1581                         base = NULL;
1582                 } else {
1583                         KKASSERT(parent->data != NULL);
1584                         base = &parent->data->npdata.blockref[0];
1585                 }
1586                 count = parent->bytes / sizeof(hammer2_blockref_t);
1587                 break;
1588         case HAMMER2_BREF_TYPE_VOLUME:
1589                 base = &hmp->voldata.sroot_blockset.blockref[0];
1590                 count = HAMMER2_SET_COUNT;
1591                 break;
1592         default:
1593                 panic("hammer2_chain_next: unrecognized blockref type: %d",
1594                       parent->bref.type);
1595                 base = NULL;    /* safety */
1596                 count = 0;      /* safety */
1597                 break;
1598         }
1599         KKASSERT(i <= count);
1600
1601         /*
1602          * Look for the key.  If we are unable to find a match and an exact
1603          * match was requested we return NULL.  If a range was requested we
1604          * run hammer2_chain_next() to iterate.
1605          *
1606          * NOTE!  Deleted elements are effectively invisible.  A Deleted
1607          *        elements covers (makes invisible) any original media
1608          *        data.
1609          */
1610         bref = NULL;
1611         while (i < count) {
1612                 tmp = hammer2_chain_find(hmp, parent, i);
1613                 if (tmp) {
1614                         if (tmp->flags & HAMMER2_CHAIN_DELETED) {
1615                                 ++i;
1616                                 continue;
1617                         }
1618                         bref = &tmp->bref;
1619                 } else if (base == NULL || base[i].type == 0) {
1620                         ++i;
1621                         continue;
1622                 } else {
1623                         bref = &base[i];
1624                 }
1625                 scan_beg = bref->key;
1626                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1627                 if (key_beg <= scan_end && key_end >= scan_beg)
1628                         break;
1629                 ++i;
1630         }
1631
1632         /*
1633          * If we couldn't find a match recurse up a parent to continue the
1634          * search.
1635          */
1636         if (i == count)
1637                 goto again;
1638
1639         /*
1640          * Acquire the new chain element.  If the chain element is an
1641          * indirect block we must search recursively.
1642          */
1643         chain = hammer2_chain_get(hmp, parent, i, flags);
1644         if (chain == NULL)
1645                 return (NULL);
1646
1647         /*
1648          * If the chain element is an indirect block it becomes the new
1649          * parent and we loop on it.
1650          *
1651          * The parent always has to be locked with at least RESOLVE_MAYBE,
1652          * so it might need a fixup if the caller passed incompatible flags.
1653          */
1654         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
1655             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1656                 hammer2_chain_unlock(hmp, parent);
1657                 *parentp = parent = chain;
1658                 chain = NULL;
1659                 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1660                         hammer2_chain_lock(hmp, parent, how_maybe);
1661                         hammer2_chain_drop(hmp, parent);        /* excess ref */
1662                 } else if (flags & HAMMER2_LOOKUP_NODATA) {
1663                         hammer2_chain_lock(hmp, parent, how_maybe);
1664                         hammer2_chain_unlock(hmp, parent);
1665                 }
1666                 i = 0;
1667                 goto again2;
1668         }
1669
1670         /*
1671          * All done, return chain
1672          */
1673         return (chain);
1674 }
1675
1676 /*
1677  * Create and return a new hammer2 system memory structure of the specified
1678  * key, type and size and insert it RELATIVE TO (PARENT).
1679  *
1680  * (parent) is typically either an inode or an indirect block, acquired
1681  * acquired as a side effect of issuing a prior failed lookup.  parent
1682  * must be locked and held.  Do not pass the inode chain to this function
1683  * unless that is the chain returned by the failed lookup.
1684  *
1685  * Non-indirect types will automatically allocate indirect blocks as required
1686  * if the new item does not fit in the current (parent).
1687  *
1688  * Indirect types will move a portion of the existing blockref array in
1689  * (parent) into the new indirect type and then use one of the free slots
1690  * to emplace the new indirect type.
1691  *
1692  * A new locked, referenced chain element is returned of the specified type.
1693  * The element may or may not have a data area associated with it:
1694  *
1695  *      VOLUME          not allowed here
1696  *      INODE           embedded data are will be set-up
1697  *      INDIRECT        not allowed here
1698  *      DATA            no data area will be set-up (caller is expected
1699  *                      to have logical buffers, we don't want to alias
1700  *                      the data onto device buffers!).
1701  *
1702  * Requires an exclusively locked parent.
1703  */
1704 hammer2_chain_t *
1705 hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1706                      hammer2_chain_t *chain,
1707                      hammer2_key_t key, int keybits, int type, size_t bytes,
1708                      int *errorp)
1709 {
1710         hammer2_blockref_t dummy;
1711         hammer2_blockref_t *base;
1712         hammer2_chain_t dummy_chain;
1713         int unlock_parent = 0;
1714         int allocated = 0;
1715         int count;
1716         int i;
1717
1718         KKASSERT(ccms_thread_lock_owned(&parent->cst));
1719         *errorp = 0;
1720
1721         if (chain == NULL) {
1722                 /*
1723                  * First allocate media space and construct the dummy bref,
1724                  * then allocate the in-memory chain structure.
1725                  */
1726                 bzero(&dummy, sizeof(dummy));
1727                 dummy.type = type;
1728                 dummy.key = key;
1729                 dummy.keybits = keybits;
1730                 dummy.data_off = hammer2_allocsize(bytes);
1731                 dummy.methods = parent->bref.methods;
1732                 chain = hammer2_chain_alloc(hmp, &dummy);
1733                 allocated = 1;
1734
1735                 /*
1736                  * We do NOT set INITIAL here (yet).  INITIAL is only
1737                  * used for indirect blocks.
1738                  *
1739                  * Recalculate bytes to reflect the actual media block
1740                  * allocation.
1741                  */
1742                 bytes = (hammer2_off_t)1 <<
1743                         (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
1744                 chain->bytes = bytes;
1745
1746                 switch(type) {
1747                 case HAMMER2_BREF_TYPE_VOLUME:
1748                         panic("hammer2_chain_create: called with volume type");
1749                         break;
1750                 case HAMMER2_BREF_TYPE_INODE:
1751                         KKASSERT(bytes == HAMMER2_INODE_BYTES);
1752                         chain->data = (void *)&chain->u.ip->ip_data;
1753                         break;
1754                 case HAMMER2_BREF_TYPE_INDIRECT:
1755                         panic("hammer2_chain_create: cannot be used to"
1756                               "create indirect block");
1757                         break;
1758                 case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
1759                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1760                         panic("hammer2_chain_create: cannot be used to"
1761                               "create freemap root or node");
1762                         break;
1763                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1764                 case HAMMER2_BREF_TYPE_DATA:
1765                 default:
1766                         /* leave chain->data NULL */
1767                         KKASSERT(chain->data == NULL);
1768                         break;
1769                 }
1770         } else {
1771                 /*
1772                  * Potentially update the chain's key/keybits.
1773                  */
1774                 chain->bref.key = key;
1775                 chain->bref.keybits = keybits;
1776         }
1777
1778 again:
1779         /*
1780          * Locate a free blockref in the parent's array
1781          */
1782         switch(parent->bref.type) {
1783         case HAMMER2_BREF_TYPE_INODE:
1784                 KKASSERT((parent->u.ip->ip_data.op_flags &
1785                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
1786                 KKASSERT(parent->data != NULL);
1787                 base = &parent->data->ipdata.u.blockset.blockref[0];
1788                 count = HAMMER2_SET_COUNT;
1789                 break;
1790         case HAMMER2_BREF_TYPE_INDIRECT:
1791         case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
1792         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1793                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1794                         base = NULL;
1795                 } else {
1796                         KKASSERT(parent->data != NULL);
1797                         base = &parent->data->npdata.blockref[0];
1798                 }
1799                 count = parent->bytes / sizeof(hammer2_blockref_t);
1800                 break;
1801         case HAMMER2_BREF_TYPE_VOLUME:
1802                 KKASSERT(parent->data != NULL);
1803                 base = &hmp->voldata.sroot_blockset.blockref[0];
1804                 count = HAMMER2_SET_COUNT;
1805                 break;
1806         default:
1807                 panic("hammer2_chain_create: unrecognized blockref type: %d",
1808                       parent->bref.type);
1809                 count = 0;
1810                 break;
1811         }
1812
1813         /*
1814          * Scan for an unallocated bref, also skipping any slots occupied
1815          * by in-memory chain elements that may not yet have been updated
1816          * in the parent's bref array.
1817          */
1818         bzero(&dummy_chain, sizeof(dummy_chain));
1819         for (i = 0; i < count; ++i) {
1820                 if (base == NULL) {
1821                         dummy_chain.index = i;
1822                         if (RB_FIND(hammer2_chain_tree,
1823                                     &parent->rbhead, &dummy_chain) == NULL) {
1824                                 break;
1825                         }
1826                 } else if (base[i].type == 0) {
1827                         dummy_chain.index = i;
1828                         if (RB_FIND(hammer2_chain_tree,
1829                                     &parent->rbhead, &dummy_chain) == NULL) {
1830                                 break;
1831                         }
1832                 }
1833         }
1834
1835         /*
1836          * If no free blockref could be found we must create an indirect
1837          * block and move a number of blockrefs into it.  With the parent
1838          * locked we can safely lock each child in order to move it without
1839          * causing a deadlock.
1840          *
1841          * This may return the new indirect block or the old parent depending
1842          * on where the key falls.  NULL is returned on error.  The most
1843          * typical error is EAGAIN (flush conflict during chain move).
1844          */
1845         if (i == count) {
1846                 hammer2_chain_t *nparent;
1847
1848                 nparent = hammer2_chain_create_indirect(hmp, parent,
1849                                                         key, keybits,
1850                                                         errorp);
1851                 if (nparent == NULL) {
1852                         if (allocated)
1853                                 hammer2_chain_free(hmp, chain);
1854                         chain = NULL;
1855                         goto done;
1856                 }
1857                 if (parent != nparent) {
1858                         if (unlock_parent)
1859                                 hammer2_chain_unlock(hmp, parent);
1860                         parent = nparent;
1861                         unlock_parent = 1;
1862                 }
1863                 goto again;
1864         }
1865
1866         /*
1867          * Link the chain into its parent.  Later on we will have to set
1868          * the MOVED bit in situations where we don't mark the new chain
1869          * as being modified.
1870          */
1871         if (chain->parent != NULL)
1872                 panic("hammer2: hammer2_chain_create: chain already connected");
1873         KKASSERT(chain->parent == NULL);
1874         chain->parent = parent;
1875         chain->index = i;
1876         if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
1877                 panic("hammer2_chain_link: collision");
1878         atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
1879         KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
1880         KKASSERT(parent->refs > 0);
1881         atomic_add_int(&parent->refs, 1);
1882
1883         /*
1884          * Additional linkage for inodes.  Reuse the parent pointer to
1885          * find the parent directory.
1886          *
1887          * Cumulative adjustments are inherited on [re]attach and will
1888          * propagate up the tree on the next flush.
1889          *
1890          * The ccms_inode is initialized from its parent directory.  The
1891          * chain of ccms_inode's is seeded by the mount code.
1892          */
1893         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1894                 hammer2_chain_t *scan = parent;
1895                 hammer2_inode_t *ip = chain->u.ip;
1896
1897                 while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1898                         scan = scan->parent;
1899                 if (scan->bref.type == HAMMER2_BREF_TYPE_INODE) {
1900                         ip->pip = scan->u.ip;
1901                         ip->pmp = scan->u.ip->pmp;
1902                         ip->pip->delta_icount += ip->ip_data.inode_count;
1903                         ip->pip->delta_dcount += ip->ip_data.data_count;
1904                         ++ip->pip->delta_icount;
1905                         ccms_cst_init(&ip->topo_cst, &ip->chain);
1906                 }
1907         }
1908
1909         /*
1910          * (allocated) indicates that this is a newly-created chain element
1911          * rather than a renamed chain element.  In this situation we want
1912          * to place the chain element in the MODIFIED state.
1913          *
1914          * The data area will be set up as follows:
1915          *
1916          *      VOLUME          not allowed here.
1917          *
1918          *      INODE           embedded data are will be set-up.
1919          *
1920          *      INDIRECT        not allowed here.
1921          *
1922          *      DATA            no data area will be set-up (caller is expected
1923          *                      to have logical buffers, we don't want to alias
1924          *                      the data onto device buffers!).
1925          */
1926         if (allocated) {
1927                 switch(chain->bref.type) {
1928                 case HAMMER2_BREF_TYPE_DATA:
1929                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1930                         hammer2_chain_modify(hmp, chain,
1931                                              HAMMER2_MODIFY_OPTDATA);
1932                         break;
1933                 case HAMMER2_BREF_TYPE_INDIRECT:
1934                 case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
1935                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1936                         /* not supported in this function */
1937                         panic("hammer2_chain_create: bad type");
1938                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1939                         hammer2_chain_modify(hmp, chain,
1940                                              HAMMER2_MODIFY_OPTDATA);
1941                         break;
1942                 default:
1943                         hammer2_chain_modify(hmp, chain, 0);
1944                         break;
1945                 }
1946         } else {
1947                 /*
1948                  * When reconnecting inodes we have to call setsubmod()
1949                  * to ensure that its state propagates up the newly
1950                  * connected parent.
1951                  *
1952                  * Make sure MOVED is set but do not update bref_flush.  If
1953                  * the chain is undergoing modification bref_flush will be
1954                  * updated when it gets flushed.  If it is not then the
1955                  * bref may not have been flushed yet and we do not want to
1956                  * set MODIFIED here as this could result in unnecessary
1957                  * reallocations.
1958                  */
1959                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1960                         hammer2_chain_ref(hmp, chain);
1961                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1962                 }
1963                 hammer2_chain_parent_setsubmod(hmp, chain);
1964         }
1965
1966 done:
1967         if (unlock_parent)
1968                 hammer2_chain_unlock(hmp, parent);
1969         return (chain);
1970 }
1971
1972 /*
1973  * Create an indirect block that covers one or more of the elements in the
1974  * current parent.  Either returns the existing parent with no locking or
1975  * ref changes or returns the new indirect block locked and referenced
1976  * and leaving the original parent lock/ref intact as well.
1977  *
1978  * If an error occurs, NULL is returned and *errorp is set to the error.
1979  * EAGAIN can be returned to indicate a flush collision which requires the
1980  * caller to retry.
1981  *
1982  * The returned chain depends on where the specified key falls.
1983  *
1984  * The key/keybits for the indirect mode only needs to follow three rules:
1985  *
1986  * (1) That all elements underneath it fit within its key space and
1987  *
1988  * (2) That all elements outside it are outside its key space.
1989  *
1990  * (3) When creating the new indirect block any elements in the current
1991  *     parent that fit within the new indirect block's keyspace must be
1992  *     moved into the new indirect block.
1993  *
1994  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
1995  *     keyspace the the current parent, but lookup/iteration rules will
1996  *     ensure (and must ensure) that rule (2) for all parents leading up
1997  *     to the nearest inode or the root volume header is adhered to.  This
1998  *     is accomplished by always recursing through matching keyspaces in
1999  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
2000  *
2001  * The current implementation calculates the current worst-case keyspace by
2002  * iterating the current parent and then divides it into two halves, choosing
2003  * whichever half has the most elements (not necessarily the half containing
2004  * the requested key).
2005  *
2006  * We can also opt to use the half with the least number of elements.  This
2007  * causes lower-numbered keys (aka logical file offsets) to recurse through
2008  * fewer indirect blocks and higher-numbered keys to recurse through more.
2009  * This also has the risk of not moving enough elements to the new indirect
2010  * block and being forced to create several indirect blocks before the element
2011  * can be inserted.
2012  *
2013  * Must be called with an exclusively locked parent.
2014  */
2015 static
2016 hammer2_chain_t *
2017 hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
2018                               hammer2_key_t create_key, int create_bits,
2019                               int *errorp)
2020 {
2021         hammer2_blockref_t *base;
2022         hammer2_blockref_t *bref;
2023         hammer2_chain_t *chain;
2024         hammer2_chain_t *ichain;
2025         hammer2_chain_t dummy;
2026         hammer2_key_t key = create_key;
2027         int keybits = create_bits;
2028         int locount = 0;
2029         int hicount = 0;
2030         int count;
2031         int nbytes;
2032         int i;
2033
2034         /*
2035          * Calculate the base blockref pointer or NULL if the chain
2036          * is known to be empty.  We need to calculate the array count
2037          * for RB lookups either way.
2038          */
2039         KKASSERT(ccms_thread_lock_owned(&parent->cst));
2040         *errorp = 0;
2041
2042         hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
2043         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2044                 base = NULL;
2045
2046                 switch(parent->bref.type) {
2047                 case HAMMER2_BREF_TYPE_INODE:
2048                         count = HAMMER2_SET_COUNT;
2049                         break;
2050                 case HAMMER2_BREF_TYPE_INDIRECT:
2051                 case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
2052                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2053                         count = parent->bytes / sizeof(hammer2_blockref_t);
2054                         break;
2055                 case HAMMER2_BREF_TYPE_VOLUME:
2056                         count = HAMMER2_SET_COUNT;
2057                         break;
2058                 default:
2059                         panic("hammer2_chain_create_indirect: "
2060                               "unrecognized blockref type: %d",
2061                               parent->bref.type);
2062                         count = 0;
2063                         break;
2064                 }
2065         } else {
2066                 switch(parent->bref.type) {
2067                 case HAMMER2_BREF_TYPE_INODE:
2068                         base = &parent->data->ipdata.u.blockset.blockref[0];
2069                         count = HAMMER2_SET_COUNT;
2070                         break;
2071                 case HAMMER2_BREF_TYPE_INDIRECT:
2072                 case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
2073                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2074                         base = &parent->data->npdata.blockref[0];
2075                         count = parent->bytes / sizeof(hammer2_blockref_t);
2076                         break;
2077                 case HAMMER2_BREF_TYPE_VOLUME:
2078                         base = &hmp->voldata.sroot_blockset.blockref[0];
2079                         count = HAMMER2_SET_COUNT;
2080                         break;
2081                 default:
2082                         panic("hammer2_chain_create_indirect: "
2083                               "unrecognized blockref type: %d",
2084                               parent->bref.type);
2085                         count = 0;
2086                         break;
2087                 }
2088         }
2089
2090         /*
2091          * Scan for an unallocated bref, also skipping any slots occupied
2092          * by in-memory chain elements which may not yet have been updated
2093          * in the parent's bref array.
2094          */
2095         bzero(&dummy, sizeof(dummy));
2096         for (i = 0; i < count; ++i) {
2097                 int nkeybits;
2098
2099                 dummy.index = i;
2100                 chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
2101                 if (chain) {
2102                         /*
2103                          * NOTE! CHAIN_DELETED elements have to be adjusted
2104                          *       too, they cannot be ignored.
2105                          */
2106                         bref = &chain->bref;
2107                 } else if (base && base[i].type) {
2108                         bref = &base[i];
2109                 } else {
2110                         continue;
2111                 }
2112
2113                 /*
2114                  * Expand our calculated key range (key, keybits) to fit
2115                  * the scanned key.  nkeybits represents the full range
2116                  * that we will later cut in half (two halves @ nkeybits - 1).
2117                  */
2118                 nkeybits = keybits;
2119                 if (nkeybits < bref->keybits)
2120                         nkeybits = bref->keybits;
2121                 while (nkeybits < 64 &&
2122                        (~(((hammer2_key_t)1 << nkeybits) - 1) &
2123                         (key ^ bref->key)) != 0) {
2124                         ++nkeybits;
2125                 }
2126
2127                 /*
2128                  * If the new key range is larger we have to determine
2129                  * which side of the new key range the existing keys fall
2130                  * under by checking the high bit, then collapsing the
2131                  * locount into the hicount or vise-versa.
2132                  */
2133                 if (keybits != nkeybits) {
2134                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
2135                                 hicount += locount;
2136                                 locount = 0;
2137                         } else {
2138                                 locount += hicount;
2139                                 hicount = 0;
2140                         }
2141                         keybits = nkeybits;
2142                 }
2143
2144                 /*
2145                  * The newly scanned key will be in the lower half or the
2146                  * higher half of the (new) key range.
2147                  */
2148                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
2149                         ++hicount;
2150                 else
2151                         ++locount;
2152         }
2153
2154         /*
2155          * Adjust keybits to represent half of the full range calculated
2156          * above (radix 63 max)
2157          */
2158         --keybits;
2159
2160         /*
2161          * Select whichever half contains the most elements.  Theoretically
2162          * we can select either side as long as it contains at least one
2163          * element (in order to ensure that a free slot is present to hold
2164          * the indirect block).
2165          */
2166         key &= ~(((hammer2_key_t)1 << keybits) - 1);
2167         if (hammer2_indirect_optimize) {
2168                 /*
2169                  * Insert node for least number of keys, this will arrange
2170                  * the first few blocks of a large file or the first few
2171                  * inodes in a directory with fewer indirect blocks when
2172                  * created linearly.
2173                  */
2174                 if (hicount < locount && hicount != 0)
2175                         key |= (hammer2_key_t)1 << keybits;
2176                 else
2177                         key &= ~(hammer2_key_t)1 << keybits;
2178         } else {
2179                 /*
2180                  * Insert node for most number of keys, best for heavily
2181                  * fragmented files.
2182                  */
2183                 if (hicount > locount)
2184                         key |= (hammer2_key_t)1 << keybits;
2185                 else
2186                         key &= ~(hammer2_key_t)1 << keybits;
2187         }
2188
2189         /*
2190          * How big should our new indirect block be?  It has to be at least
2191          * as large as its parent.
2192          */
2193         if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
2194                 nbytes = HAMMER2_IND_BYTES_MIN;
2195         else
2196                 nbytes = HAMMER2_IND_BYTES_MAX;
2197         if (nbytes < count * sizeof(hammer2_blockref_t))
2198                 nbytes = count * sizeof(hammer2_blockref_t);
2199
2200         /*
2201          * Ok, create our new indirect block
2202          */
2203         switch(parent->bref.type) {
2204         case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
2205         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2206                 dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
2207                 break;
2208         default:
2209                 dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
2210                 break;
2211         }
2212         dummy.bref.key = key;
2213         dummy.bref.keybits = keybits;
2214         dummy.bref.data_off = hammer2_allocsize(nbytes);
2215         dummy.bref.methods = parent->bref.methods;
2216         ichain = hammer2_chain_alloc(hmp, &dummy.bref);
2217         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
2218
2219         /*
2220          * Iterate the original parent and move the matching brefs into
2221          * the new indirect block.
2222          */
2223         for (i = 0; i < count; ++i) {
2224                 /*
2225                  * For keying purposes access the bref from the media or
2226                  * from our in-memory cache.  In cases where the in-memory
2227                  * cache overrides the media the keyrefs will be the same
2228                  * anyway so we can avoid checking the cache when the media
2229                  * has a key.
2230                  */
2231                 dummy.index = i;
2232                 chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
2233                 if (chain) {
2234                         /*
2235                          * NOTE! CHAIN_DELETED elements have to be adjusted
2236                          *       too, they cannot be ignored.
2237                          */
2238                         bref = &chain->bref;
2239                 } else if (base && base[i].type) {
2240                         bref = &base[i];
2241                 } else {
2242                         if (ichain->index < 0)
2243                                 ichain->index = i;
2244                         continue;
2245                 }
2246
2247                 /*
2248                  * Skip keys not in the chosen half (low or high), only bit
2249                  * (keybits - 1) needs to be compared but for safety we
2250                  * will compare all msb bits plus that bit again.
2251                  */
2252                 if ((~(((hammer2_key_t)1 << keybits) - 1) &
2253                     (key ^ bref->key)) != 0) {
2254                         continue;
2255                 }
2256
2257                 /*
2258                  * This element is being moved from the parent, its slot
2259                  * is available for our new indirect block.
2260                  */
2261                 if (ichain->index < 0)
2262                         ichain->index = i;
2263
2264                 /*
2265                  * Load the new indirect block by acquiring or allocating
2266                  * the related chain entries, then simply move them to the
2267                  * new parent (ichain).  We cannot move chains which are
2268                  * undergoing flushing and will break out of the loop in
2269                  * that case.
2270                  *
2271                  * When adjusting the parent/child relationship we must
2272                  * set the MOVED bit but we do NOT update bref_flush
2273                  * because otherwise we might synchronize a bref that has
2274                  * not yet been flushed.  We depend on chain's bref_flush
2275                  * either being correct or the chain being in a MODIFIED
2276                  * state.
2277                  *
2278                  * We do not want to set MODIFIED here as this would result
2279                  * in unnecessary reallocations.
2280                  *
2281                  * We must still set SUBMODIFIED in the parent but we do
2282                  * that after the loop.
2283                  *
2284                  * WARNING! chain->cst.spin must be held when chain->parent is
2285                  *          modified, even though we own the full blown lock,
2286                  *          to deal with setsubmod and rename races.
2287                  */
2288                 chain = hammer2_chain_get(hmp, parent, i,
2289                                           HAMMER2_LOOKUP_NODATA);
2290                 if (chain->flushing) {
2291                         hammer2_chain_unlock(hmp, chain);
2292                         break;
2293                 }
2294
2295                 spin_lock(&chain->cst.spin);
2296                 RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
2297                 if (RB_INSERT(hammer2_chain_tree, &ichain->rbhead, chain))
2298                         panic("hammer2_chain_create_indirect: collision");
2299                 chain->parent = ichain;
2300                 spin_unlock(&chain->cst.spin);
2301
2302                 if (base)
2303                         bzero(&base[i], sizeof(base[i]));
2304                 atomic_add_int(&parent->refs, -1);
2305                 atomic_add_int(&ichain->refs, 1);
2306                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2307                         hammer2_chain_ref(hmp, chain);
2308                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2309                 }
2310                 hammer2_chain_unlock(hmp, chain);
2311                 KKASSERT(parent->refs > 0);
2312                 chain = NULL;
2313         }
2314
2315         /*
2316          * If we hit a chain that is undergoing flushing we're screwed and
2317          * we have to duno the whole mess.  Since ichain has not been linked
2318          * in yet, the moved chains are not reachable and will not have been
2319          * disposed of.
2320          *
2321          * WARNING! This code is pretty hairy because the flusher is sitting
2322          *          on the parent processing one of the children that we
2323          *          haven't yet moved, and will do a RB_NEXT loop on that
2324          *          child.  So the children we're moving back have to be
2325          *          returned to the same place in the iteration that they
2326          *          were removed from.
2327          */
2328         if (i != count) {
2329                 kprintf("hammer2_chain_create_indirect: EAGAIN\n");
2330                 *errorp = EAGAIN;
2331                 while ((chain = RB_ROOT(&ichain->rbhead)) != NULL) {
2332                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_NEVER);
2333                         KKASSERT(chain->flushing == 0);
2334                         RB_REMOVE(hammer2_chain_tree, &ichain->rbhead, chain);
2335                         if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
2336                                 panic("hammer2_chain_create_indirect: collision");
2337                         chain->parent = parent;
2338                         atomic_add_int(&parent->refs, 1);
2339                         atomic_add_int(&ichain->refs, -1);
2340                         /* MOVED bit might have been inherited, cannot undo */
2341                         hammer2_chain_unlock(hmp, chain);
2342                 }
2343                 hammer2_chain_free(hmp, ichain);
2344                 return(NULL);
2345         }
2346
2347         /*
2348          * Insert the new indirect block into the parent now that we've
2349          * cleared out some entries in the parent.  We calculated a good
2350          * insertion index in the loop above (ichain->index).
2351          *
2352          * We don't have to set MOVED here because we mark ichain modified
2353          * down below (so the normal modified -> flush -> set-moved sequence
2354          * applies).
2355          */
2356         KKASSERT(ichain->index >= 0);
2357         if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, ichain))
2358                 panic("hammer2_chain_create_indirect: ichain insertion");
2359         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_ONRBTREE);
2360         ichain->parent = parent;
2361         atomic_add_int(&parent->refs, 1);
2362
2363         /*
2364          * Mark the new indirect block modified after insertion, which
2365          * will propagate up through parent all the way to the root and
2366          * also allocate the physical block in ichain for our caller,
2367          * and assign ichain->data to a pre-zero'd space (because there
2368          * is not prior data to copy into it).
2369          *
2370          * We have to set SUBMODIFIED in ichain's flags manually so the
2371          * flusher knows it has to recurse through it to get to all of
2372          * our moved blocks, then call setsubmod() to set the bit
2373          * recursively.
2374          */
2375         hammer2_chain_modify(hmp, ichain, HAMMER2_MODIFY_OPTDATA);
2376         hammer2_chain_parent_setsubmod(hmp, ichain);
2377         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2378
2379         /*
2380          * Figure out what to return.
2381          */
2382         if (create_bits > keybits) {
2383                 /*
2384                  * Key being created is way outside the key range,
2385                  * return the original parent.
2386                  */
2387                 hammer2_chain_unlock(hmp, ichain);
2388         } else if (~(((hammer2_key_t)1 << keybits) - 1) &
2389                    (create_key ^ key)) {
2390                 /*
2391                  * Key being created is outside the key range,
2392                  * return the original parent.
2393                  */
2394                 hammer2_chain_unlock(hmp, ichain);
2395         } else {
2396                 /*
2397                  * Otherwise its in the range, return the new parent.
2398                  * (leave both the new and old parent locked).
2399                  */
2400                 parent = ichain;
2401         }
2402
2403         return(parent);
2404 }
2405
2406 /*
2407  * Physically delete the specified chain element.  Note that inodes with
2408  * open descriptors should not be deleted (as with other filesystems) until
2409  * the last open descriptor is closed.
2410  *
2411  * This routine will remove the chain element from its parent and potentially
2412  * also recurse upward and delete indirect blocks which become empty as a
2413  * side effect.
2414  *
2415  * The caller must pass a pointer to the chain's parent, also locked and
2416  * referenced.  (*parentp) will be modified in a manner similar to a lookup
2417  * or iteration when indirect blocks are also deleted as a side effect.
2418  *
2419  * XXX This currently does not adhere to the MOVED flag protocol in that
2420  *     the removal is immediately indicated in the parent's blockref[]
2421  *     array.
2422  *
2423  * Must be called with an exclusively locked parent and chain.
2424  */
2425 void
2426 hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
2427                      hammer2_chain_t *chain, int retain)
2428 {
2429         hammer2_blockref_t *base;
2430         hammer2_inode_t *ip;
2431         int count;
2432
2433         if (chain->parent != parent)
2434                 panic("hammer2_chain_delete: parent mismatch");
2435         KKASSERT(ccms_thread_lock_owned(&parent->cst));
2436
2437         /*
2438          * Mark the parent modified so our base[] pointer remains valid
2439          * while we move entries.  For the optimized indirect block
2440          * case mark the parent moved instead.
2441          *
2442          * Calculate the blockref reference in the parent
2443          */
2444         switch(parent->bref.type) {
2445         case HAMMER2_BREF_TYPE_INODE:
2446                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2447                 base = &parent->data->ipdata.u.blockset.blockref[0];
2448                 count = HAMMER2_SET_COUNT;
2449                 break;
2450         case HAMMER2_BREF_TYPE_INDIRECT:
2451         case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
2452         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2453                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA |
2454                                                   HAMMER2_MODIFY_NO_MODIFY_TID);
2455                 if (parent->flags & HAMMER2_CHAIN_INITIAL)
2456                         base = NULL;
2457                 else
2458                         base = &parent->data->npdata.blockref[0];
2459                 count = parent->bytes / sizeof(hammer2_blockref_t);
2460                 break;
2461         case HAMMER2_BREF_TYPE_VOLUME:
2462                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2463                 base = &hmp->voldata.sroot_blockset.blockref[0];
2464                 count = HAMMER2_SET_COUNT;
2465                 break;
2466         default:
2467                 panic("hammer2_chain_delete: unrecognized blockref type: %d",
2468                       parent->bref.type);
2469                 count = 0;
2470                 break;
2471         }
2472         KKASSERT(chain->index >= 0 && chain->index < count);
2473
2474         /*
2475          * We may not be able to immediately disconnect the chain if a
2476          * flush is in progress.  If retain is non-zero we MUST disconnect
2477          * the chain now and callers are responsible for making sure that
2478          * flushing is zero.
2479          */
2480         spin_lock(&chain->cst.spin);
2481         if ((retain || chain->flushing == 0) &&
2482             (chain->flags & HAMMER2_CHAIN_ONRBTREE)) {
2483                 if (base)
2484                         bzero(&base[chain->index], sizeof(*base));
2485                 KKASSERT(chain->flushing == 0);
2486                 RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
2487                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
2488                 atomic_add_int(&parent->refs, -1);   /* for red-black entry */
2489                 chain->index = -1;
2490                 chain->parent = NULL;
2491         }
2492         spin_unlock(&chain->cst.spin);
2493
2494         /*
2495          * Cumulative adjustments must be propagated to the parent inode
2496          * when deleting and synchronized to ip.  This occurs even if we
2497          * cannot detach the chain from its parent.
2498          *
2499          * NOTE:  We do not propagate ip->delta_*count to the parent because
2500          *        these represent adjustments that have not yet been
2501          *        propagated upward, so we don't need to remove them from
2502          *        the parent.
2503          *
2504          * Clear the pointer to the parent inode.
2505          */
2506         if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
2507             chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2508                 ip = chain->u.ip;
2509                 if (ip->pip) {
2510                         /* XXX SMP, pip chain not necessarily parent chain */
2511                         ip->pip->delta_icount -= ip->ip_data.inode_count;
2512                         ip->pip->delta_dcount -= ip->ip_data.data_count;
2513                         ip->ip_data.inode_count += ip->delta_icount;
2514                         ip->ip_data.data_count += ip->delta_dcount;
2515                         ip->delta_icount = 0;
2516                         ip->delta_dcount = 0;
2517                         --ip->pip->delta_icount;
2518                         spin_lock(&chain->cst.spin); /* XXX */
2519                         ip->pip = NULL;
2520                         spin_unlock(&chain->cst.spin);
2521                 }
2522         }
2523
2524         /*
2525          * If retain is 0 the deletion is permanent.  Because the chain is
2526          * no longer connected to the topology a flush will have no
2527          * visibility into it.  We must dispose of the references related
2528          * to the MODIFIED and MOVED flags, otherwise the ref count will
2529          * never transition to 0.
2530          *
2531          * If retain is non-zero the deleted element is likely an inode
2532          * which the vnops frontend will mark DESTROYED and flush.  In that
2533          * situation we must retain the flags for any open file descriptors
2534          * on the (removed) inode.  The final close will destroy the
2535          * disconnected chain.
2536          */
2537         if (retain == 0) {
2538                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
2539                 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
2540                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
2541                         hammer2_chain_drop(hmp, chain);
2542                 }
2543                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2544                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2545                         hammer2_chain_drop(hmp, chain);
2546                 }
2547         }
2548
2549         /*
2550          * The chain is still likely referenced, possibly even by a vnode
2551          * (if an inode), so defer further action until the chain gets
2552          * dropped.
2553          */
2554 }
2555
2556 void
2557 hammer2_chain_wait(hammer2_mount_t *hmp, hammer2_chain_t *chain)
2558 {
2559         tsleep(chain, 0, "chnflw", 1);
2560 }