Merge branch 'master' of ssh://crater.dragonflybsd.org/repository/git/dragonfly
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem handles direct and indirect block searches, recursions,
37  * creation, and deletion.  Chains of blockrefs are tracked and modifications
38  * are flagged for propagation... eventually all the way back to the volume
39  * header.  Any chain except the volume header can be flushed to disk at
40  * any time... none of it matters until the volume header is dealt with
41  * (which is not here, see hammer2_vfsops.c for the volume header disk
42  * sequencing).
43  *
44  * Serialized flushes are not handled here, see hammer2_flush.c.  This module
45  * can essentially work on the current version of data, which can be in memory
46  * as well as on-disk due to the above.  However, we are responsible for
47  * making a copy of the state when a modified chain is part of a flush
48  * and we attempt to modify it again before the flush gets to it.  In that
49  * situation we create an allocated copy of the state that the flush can
50  * deal with.  If a chain undergoing deletion is part of a flush it is
51  * marked DELETED and its bref index is kept intact for the flush, but the
52  * chain is thereafter ignored by this module's because it is no longer
53  * current.
54  */
55 #include <sys/cdefs.h>
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/types.h>
59 #include <sys/lock.h>
60 #include <sys/uuid.h>
61
62 #include "hammer2.h"
63
64 static int hammer2_indirect_optimize;   /* XXX SYSCTL */
65
66 static hammer2_chain_t *hammer2_chain_create_indirect(
67                         hammer2_mount_t *hmp, hammer2_chain_t *parent,
68                         hammer2_key_t key, int keybits,
69                         int *errorp);
70
71 /*
72  * We use a red-black tree to guarantee safe lookups under shared locks.
73  */
74 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
75
76 int
77 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
78 {
79         return(chain2->index - chain1->index);
80 }
81
82 /*
83  * Recursively mark the parent chain elements so flushes can find
84  * modified elements.  Stop when we hit a chain already flagged
85  * SUBMODIFIED, but ignore the SUBMODIFIED bit that might be set
86  * in chain itself.
87  *
88  * SUBMODIFIED is not set on the chain passed in.
89  *
90  * The chain->cst.spin lock can be held to stabilize the chain->parent
91  * pointer.  The first parent is stabilized by virtue of chain being
92  * fully locked.
93  */
94 void
95 hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
96 {
97         hammer2_chain_t *parent;
98
99         parent = chain->parent;
100         if (parent && (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
101                 spin_lock(&parent->cst.spin);
102                 for (;;) {
103                         atomic_set_int(&parent->flags,
104                                        HAMMER2_CHAIN_SUBMODIFIED);
105                         if ((chain = parent->parent) == NULL)
106                                 break;
107                         spin_lock(&chain->cst.spin);    /* upward interlock */
108                         spin_unlock(&parent->cst.spin);
109                         parent = chain;
110                 }
111                 spin_unlock(&parent->cst.spin);
112         }
113 }
114
115 /*
116  * Allocate a new disconnected chain element representing the specified
117  * bref.  The chain element is locked exclusively and refs is set to 1.
118  * Media data (data) and meta-structure (u) pointers are left NULL.
119  *
120  * This essentially allocates a system memory structure representing one
121  * of the media structure types, including inodes.
122  */
123 hammer2_chain_t *
124 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
125 {
126         hammer2_chain_t *chain;
127         u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
128
129         /*
130          * Construct the appropriate system structure.
131          */
132         switch(bref->type) {
133         case HAMMER2_BREF_TYPE_INODE:
134         case HAMMER2_BREF_TYPE_INDIRECT:
135         case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
136         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
137         case HAMMER2_BREF_TYPE_DATA:
138         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
139                 chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
140                 break;
141         case HAMMER2_BREF_TYPE_VOLUME:
142                 chain = NULL;
143                 panic("hammer2_chain_alloc volume type illegal for op");
144         default:
145                 chain = NULL;
146                 panic("hammer2_chain_alloc: unrecognized blockref type: %d",
147                       bref->type);
148         }
149
150         /*
151          * Only set bref_flush if the bref has a real media offset, otherwise
152          * the caller has to wait for the chain to be modified/block-allocated
153          * before a blockref can be synchronized with its (future) parent.
154          */
155         chain->bref = *bref;
156         if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
157                 chain->bref_flush = *bref;
158         chain->index = -1;              /* not yet assigned */
159         chain->refs = 1;
160         chain->bytes = bytes;
161         ccms_cst_init(&chain->cst, chain);
162         ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
163
164         return (chain);
165 }
166
167 /*
168  * Deallocate a chain (the step before freeing it).  Remove the chain from
169  * its parent's tree.
170  *
171  * Caller must hold the parent and the chain exclusively locked, and
172  * chain->refs must be 0.
173  *
174  * This function unlocks, removes, and destroys chain, and will recursively
175  * destroy any sub-chains under chain (whos refs must also be 0 at this
176  * point).
177  *
178  * parent can be NULL.
179  */
180 static void
181 hammer2_chain_dealloc(hammer2_mount_t *hmp, hammer2_chain_t *chain)
182 {
183         hammer2_chain_t *parent;
184         hammer2_chain_t *child;
185
186         KKASSERT(chain->refs == 0);
187         KKASSERT(chain->flushing == 0);
188         KKASSERT(chain->u.mem == NULL);
189         KKASSERT((chain->flags &
190                   (HAMMER2_CHAIN_MOVED | HAMMER2_CHAIN_MODIFIED)) == 0);
191
192         /*
193          * If the sub-tree is not empty all the elements on it must have
194          * 0 refs and be deallocatable.
195          */
196         while ((child = RB_ROOT(&chain->rbhead)) != NULL) {
197                 ccms_thread_lock(&child->cst, CCMS_STATE_EXCLUSIVE);
198                 hammer2_chain_dealloc(hmp, child);
199         }
200
201         /*
202          * If the DELETED flag is not set the chain must be removed from
203          * its parent's tree.
204          *
205          * WARNING! chain->cst.spin must be held when chain->parent is
206          *          modified, even though we own the full blown lock,
207          *          to deal with setsubmod and rename races.
208          */
209         if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
210                 spin_lock(&chain->cst.spin);    /* shouldn't be needed */
211                 parent = chain->parent;
212                 RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
213                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
214                 chain->parent = NULL;
215                 spin_unlock(&chain->cst.spin);
216         }
217         hammer2_chain_free(hmp, chain);
218 }
219
220 /*
221  * Free a disconnected chain element
222  */
223 void
224 hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
225 {
226         switch(chain->bref.type) {
227         case HAMMER2_BREF_TYPE_VOLUME:
228                 chain->data = NULL;
229                 break;
230         case HAMMER2_BREF_TYPE_INODE:
231                 if (chain->data) {
232                         kfree(chain->data, hmp->minode);
233                         chain->data = NULL;
234                 }
235                 break;
236         default:
237                 KKASSERT(chain->data == NULL);
238                 break;
239         }
240
241         KKASSERT(chain->bp == NULL);
242         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
243                  chain->u.ip == NULL);
244
245         ccms_thread_unlock(&chain->cst);
246         KKASSERT(chain->cst.count == 0);
247         KKASSERT(chain->cst.upgrade == 0);
248         KKASSERT(chain->u.mem == NULL);
249
250         kfree(chain, hmp->mchain);
251 }
252
253 /*
254  * Add a reference to a chain element, preventing its destruction.
255  *
256  * The parent chain must be locked shared or exclusive or otherwise be
257  * stable and already have a reference.
258  */
259 void
260 hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
261 {
262         u_int refs;
263
264         while (chain) {
265                 refs = chain->refs;
266                 KKASSERT(chain->refs >= 0);
267                 cpu_ccfence();
268                 if (refs == 0) {
269                         /*
270                          * 0 -> 1 transition must bump the refs on the parent
271                          * too.  The caller has stabilized the parent.
272                          */
273                         if (atomic_cmpset_int(&chain->refs, 0, 1)) {
274                                 chain = chain->parent;
275                                 KKASSERT(chain == NULL || chain->refs > 0);
276                         }
277                         /* retry or continue along the parent chain */
278                 } else {
279                         /*
280                          * N -> N+1
281                          */
282                         if (atomic_cmpset_int(&chain->refs, refs, refs + 1))
283                                 break;
284                         /* retry */
285                 }
286         }
287 }
288
289 /*
290  * Drop the callers reference to the chain element.  If the ref count
291  * reaches zero we attempt to recursively drop the parent.
292  *
293  * MOVED and MODIFIED elements hold additional references so it should not
294  * be possible for the count on a modified element to drop to 0.
295  *
296  * The chain element must NOT be locked by the caller on the 1->0 transition.
297  *
298  * The parent might or might not be locked by the caller.  If we are unable
299  * to lock the parent on the 1->0 transition the destruction of the chain
300  * will be deferred but we still recurse upward and drop the ref on the
301  * parent (see the lastdrop() function)
302  */
303 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_mount_t *hmp,
304                                                 hammer2_chain_t *chain);
305
306 void
307 hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
308 {
309         u_int refs;
310
311         while (chain) {
312                 refs = chain->refs;
313                 cpu_ccfence();
314                 KKASSERT(refs > 0);
315                 if (refs == 1) {
316                         /*
317                          * (1) lastdrop successfully drops the chain to 0
318                          *     refs and may may not have destroyed it.
319                          *     lastdrop will return the parent so we can
320                          *     recursively drop the implied ref from the
321                          *     1->0 transition.
322                          *
323                          * (2) lastdrop fails to transition refs from 1 to 0
324                          *     and returns the same chain, we retry.
325                          */
326                         chain = hammer2_chain_lastdrop(hmp, chain);
327                 } else {
328                         if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
329                                 /*
330                                  * Succeeded, count did not reach zero so
331                                  * cut out of the loop.
332                                  */
333                                 break;
334                         }
335                         /* retry the same chain */
336                 }
337         }
338 }
339
340 /*
341  * Handle SMP races during the last drop.  We must obtain a lock on
342  * chain->parent to stabilize the last pointer reference to chain
343  * (if applicable).  This reference does not have a parallel ref count,
344  * that is idle chains in the topology can have a ref count of 0.
345  *
346  * The 1->0 transition implies a ref on the parent.
347  */
348 static
349 hammer2_chain_t *
350 hammer2_chain_lastdrop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
351 {
352         hammer2_chain_t *parent;
353
354         /*
355          * Stablize chain->parent with the chain cst's spinlock.
356          * (parent can be NULL here).
357          *
358          * cst.spin locks are allowed to be nested bottom-up (the reverse
359          * of the normal top-down for full-blown cst locks), so this also
360          * allows us to attempt to obtain the parent's cst lock non-blocking
361          * (which must acquire the parent's spinlock unconditionally) while
362          * we are still holding the chain's spinlock.
363          */
364         spin_lock(&chain->cst.spin);
365         parent = chain->parent;
366
367         /*
368          * If chain->flushing is non-zero we cannot deallocate the chain
369          * here.  The flushing field will be serialized for the inline
370          * unlock made by the flusher itself and we don't care about races
371          * in any other situation because the thread lock on the parent
372          * will fail in other situations.
373          *
374          * If we have a non-NULL parent but cannot acquire its thread
375          * lock, we also cannot deallocate the chain.
376          */
377         if (chain->flushing ||
378             (parent && ccms_thread_lock_nonblock(&parent->cst,
379                                                  CCMS_STATE_EXCLUSIVE))) {
380                 if (atomic_cmpset_int(&chain->refs, 1, 0)) {
381                         spin_unlock(&chain->cst.spin);  /* success */
382                         return(parent);
383                 } else {
384                         spin_unlock(&chain->cst.spin);  /* failure */
385                         return(chain);
386                 }
387         }
388         spin_unlock(&chain->cst.spin);
389
390         /*
391          * With the parent now held we control the last pointer reference
392          * to chain ONLY IF this is the 1->0 drop.  If we fail to transition
393          * from 1->0 we raced a refs change and must retry at chain.
394          */
395         if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
396                 /* failure */
397                 if (parent)
398                         ccms_thread_unlock(&parent->cst);
399                 return(chain);
400         }
401
402         /*
403          * Ok, we succeeded.  We now own the implied ref on the parent
404          * associated with the 1->0 transition of the child.  It should not
405          * be possible for ANYTHING to access the child now, as we own the
406          * lock on the parent, so we should be able to safely lock the
407          * child and destroy it.
408          */
409         ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
410         hammer2_chain_dealloc(hmp, chain);
411
412         /*
413          * We want to return parent with its implied ref to the caller
414          * to recurse and drop the parent.
415          */
416         if (parent)
417                 ccms_thread_unlock(&parent->cst);
418         return (parent);
419 }
420
421 /*
422  * Ref and lock a chain element, acquiring its data with I/O if necessary,
423  * and specify how you would like the data to be resolved.
424  *
425  * Returns 0 on success or an error code if the data could not be acquired.
426  * The chain element is locked either way.
427  *
428  * The lock is allowed to recurse, multiple locking ops will aggregate
429  * the requested resolve types.  Once data is assigned it will not be
430  * removed until the last unlock.
431  *
432  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
433  *                         (typically used to avoid device/logical buffer
434  *                          aliasing for data)
435  *
436  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
437  *                         the INITIAL-create state (indirect blocks only).
438  *
439  *                         Do not resolve data elements for DATA chains.
440  *                         (typically used to avoid device/logical buffer
441  *                          aliasing for data)
442  *
443  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
444  *
445  * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
446  *                         it will be locked exclusive.
447  *
448  * NOTE: Embedded elements (volume header, inodes) are always resolved
449  *       regardless.
450  *
451  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
452  *       element will instantiate and zero its buffer, and flush it on
453  *       release.
454  *
455  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
456  *       so as not to instantiate a device buffer, which could alias against
457  *       a logical file buffer.  However, if ALWAYS is specified the
458  *       device buffer will be instantiated anyway.
459  */
460 int
461 hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
462 {
463         hammer2_blockref_t *bref;
464         hammer2_off_t pbase;
465         hammer2_off_t peof;
466         ccms_state_t ostate;
467         size_t boff;
468         size_t bbytes;
469         int error;
470         char *bdata;
471
472         /*
473          * Ref and lock the element.  Recursive locks are allowed.
474          */
475         hammer2_chain_ref(hmp, chain);
476         if (how & HAMMER2_RESOLVE_SHARED)
477                 ccms_thread_lock(&chain->cst, CCMS_STATE_SHARED);
478         else
479                 ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
480
481         /*
482          * If we already have a valid data pointer no further action is
483          * necessary.
484          */
485         if (chain->data)
486                 return (0);
487
488         /*
489          * Do we have to resolve the data?
490          */
491         switch(how & HAMMER2_RESOLVE_MASK) {
492         case HAMMER2_RESOLVE_NEVER:
493                 return(0);
494         case HAMMER2_RESOLVE_MAYBE:
495                 if (chain->flags & HAMMER2_CHAIN_INITIAL)
496                         return(0);
497                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
498                         return(0);
499                 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
500                         return(0);
501                 /* fall through */
502         case HAMMER2_RESOLVE_ALWAYS:
503                 break;
504         }
505
506         /*
507          * Upgrade to an exclusive lock so we can safely manipulate the
508          * buffer cache.  If another thread got to it before us we
509          * can just return.
510          */
511         ostate = ccms_thread_lock_upgrade(&chain->cst);
512         if (chain->data) {
513                 ccms_thread_lock_restore(&chain->cst, ostate);
514                 return (0);
515         }
516
517         /*
518          * We must resolve to a device buffer, either by issuing I/O or
519          * by creating a zero-fill element.  We do not mark the buffer
520          * dirty when creating a zero-fill element (the hammer2_chain_modify()
521          * API must still be used to do that).
522          *
523          * The device buffer is variable-sized in powers of 2 down
524          * to HAMMER2_MINALLOCSIZE (typically 1K).  A 64K physical storage
525          * chunk always contains buffers of the same size. (XXX)
526          *
527          * The minimum physical IO size may be larger than the variable
528          * block size.
529          */
530         bref = &chain->bref;
531
532         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
533                 bbytes = HAMMER2_MINIOSIZE;
534         pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
535         peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
536         boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
537         KKASSERT(pbase != 0);
538
539         /*
540          * The getblk() optimization can only be used on newly created
541          * elements if the physical block size matches the request.
542          */
543         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
544             chain->bytes == bbytes) {
545                 chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
546                 error = 0;
547         } else if (hammer2_cluster_enable) {
548                 error = cluster_read(hmp->devvp, peof, pbase, bbytes,
549                                      HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
550                                      &chain->bp);
551         } else {
552                 error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
553         }
554
555         if (error) {
556                 kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
557                         (intmax_t)pbase, error);
558                 bqrelse(chain->bp);
559                 chain->bp = NULL;
560                 ccms_thread_lock_restore(&chain->cst, ostate);
561                 return (error);
562         }
563
564         /*
565          * Zero the data area if the chain is in the INITIAL-create state.
566          * Mark the buffer for bdwrite().
567          */
568         bdata = (char *)chain->bp->b_data + boff;
569         if (chain->flags & HAMMER2_CHAIN_INITIAL) {
570                 bzero(bdata, chain->bytes);
571                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
572         }
573
574         /*
575          * Setup the data pointer, either pointing it to an embedded data
576          * structure and copying the data from the buffer, or pointing it
577          * into the buffer.
578          *
579          * The buffer is not retained when copying to an embedded data
580          * structure in order to avoid potential deadlocks or recursions
581          * on the same physical buffer.
582          */
583         switch (bref->type) {
584         case HAMMER2_BREF_TYPE_VOLUME:
585                 /*
586                  * Copy data from bp to embedded buffer
587                  */
588                 panic("hammer2_chain_lock: called on unresolved volume header");
589 #if 0
590                 /* NOT YET */
591                 KKASSERT(pbase == 0);
592                 KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
593                 bcopy(bdata, &hmp->voldata, chain->bytes);
594                 chain->data = (void *)&hmp->voldata;
595                 bqrelse(chain->bp);
596                 chain->bp = NULL;
597 #endif
598                 break;
599         case HAMMER2_BREF_TYPE_INODE:
600                 /*
601                  * Copy data from bp to embedded buffer, do not retain the
602                  * device buffer.
603                  */
604                 KKASSERT(chain->bytes == sizeof(chain->data->ipdata));
605                 chain->data = kmalloc(sizeof(chain->data->ipdata),
606                                       hmp->minode, M_WAITOK | M_ZERO);
607                 bcopy(bdata, &chain->data->ipdata, chain->bytes);
608                 bqrelse(chain->bp);
609                 chain->bp = NULL;
610                 break;
611         case HAMMER2_BREF_TYPE_INDIRECT:
612         case HAMMER2_BREF_TYPE_DATA:
613         case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
614         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
615         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
616         default:
617                 /*
618                  * Point data at the device buffer and leave bp intact.
619                  */
620                 chain->data = (void *)bdata;
621                 break;
622         }
623
624         /*
625          * Make sure the bp is not specifically owned by this thread before
626          * restoring to a possibly shared lock, so another hammer2 thread
627          * can release it.
628          */
629         if (chain->bp)
630                 BUF_KERNPROC(chain->bp);
631         ccms_thread_lock_restore(&chain->cst, ostate);
632         return (0);
633 }
634
635 /*
636  * Unlock and deref a chain element.
637  *
638  * On the last lock release any non-embedded data (chain->bp) will be
639  * retired.
640  */
641 void
642 hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
643 {
644         long *counterp;
645
646         /*
647          * Release the CST lock but with a special 1->0 transition case.
648          *
649          * Returns non-zero if lock references remain.  When zero is
650          * returned the last lock reference is retained and any shared
651          * lock is upgraded to an exclusive lock for final disposition.
652          */
653         if (ccms_thread_unlock_zero(&chain->cst)) {
654                 KKASSERT(chain->refs > 1);
655                 atomic_add_int(&chain->refs, -1);
656                 return;
657         }
658
659         /*
660          * Shortcut the case if the data is embedded or not resolved.
661          *
662          * Do NOT NULL out chain->data (e.g. inode data), it might be
663          * dirty.
664          *
665          * The DIRTYBP flag is non-applicable in this situation and can
666          * be cleared to keep the flags state clean.
667          */
668         if (chain->bp == NULL) {
669                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
670                 ccms_thread_unlock(&chain->cst);
671                 hammer2_chain_drop(hmp, chain);
672                 return;
673         }
674
675         /*
676          * Statistics
677          */
678         if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
679                 ;
680         } else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
681                 switch(chain->bref.type) {
682                 case HAMMER2_BREF_TYPE_DATA:
683                         counterp = &hammer2_ioa_file_write;
684                         break;
685                 case HAMMER2_BREF_TYPE_INODE:
686                         counterp = &hammer2_ioa_meta_write;
687                         break;
688                 case HAMMER2_BREF_TYPE_INDIRECT:
689                         counterp = &hammer2_ioa_indr_write;
690                         break;
691                 case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
692                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
693                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
694                         counterp = &hammer2_ioa_fmap_write;
695                         break;
696                 default:
697                         counterp = &hammer2_ioa_volu_write;
698                         break;
699                 }
700                 ++*counterp;
701         } else {
702                 switch(chain->bref.type) {
703                 case HAMMER2_BREF_TYPE_DATA:
704                         counterp = &hammer2_iod_file_write;
705                         break;
706                 case HAMMER2_BREF_TYPE_INODE:
707                         counterp = &hammer2_iod_meta_write;
708                         break;
709                 case HAMMER2_BREF_TYPE_INDIRECT:
710                         counterp = &hammer2_iod_indr_write;
711                         break;
712                 case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
713                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
714                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
715                         counterp = &hammer2_iod_fmap_write;
716                         break;
717                 default:
718                         counterp = &hammer2_iod_volu_write;
719                         break;
720                 }
721                 ++*counterp;
722         }
723
724         /*
725          * Clean out the bp.
726          *
727          * If a device buffer was used for data be sure to destroy the
728          * buffer when we are done to avoid aliases (XXX what about the
729          * underlying VM pages?).
730          *
731          * NOTE: Freemap leaf's use reserved blocks and thus no aliasing
732          *       is possible.
733          */
734         if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
735                 chain->bp->b_flags |= B_RELBUF;
736
737         /*
738          * The DIRTYBP flag tracks whether we have to bdwrite() the buffer
739          * or not.  The flag will get re-set when chain_modify() is called,
740          * even if MODIFIED is already set, allowing the OS to retire the
741          * buffer independent of a hammer2 flus.
742          */
743         chain->data = NULL;
744         if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
745                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
746                 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
747                         atomic_clear_int(&chain->flags,
748                                          HAMMER2_CHAIN_IOFLUSH);
749                         chain->bp->b_flags |= B_RELBUF;
750                         cluster_awrite(chain->bp);
751                 } else {
752                         chain->bp->b_flags |= B_CLUSTEROK;
753                         bdwrite(chain->bp);
754                 }
755         } else {
756                 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
757                         atomic_clear_int(&chain->flags,
758                                          HAMMER2_CHAIN_IOFLUSH);
759                         chain->bp->b_flags |= B_RELBUF;
760                         brelse(chain->bp);
761                 } else {
762                         /* bp might still be dirty */
763                         bqrelse(chain->bp);
764                 }
765         }
766         chain->bp = NULL;
767         ccms_thread_unlock(&chain->cst);
768         hammer2_chain_drop(hmp, chain);
769 }
770
771 /*
772  * Resize the chain's physical storage allocation.  Chains can be resized
773  * smaller without reallocating the storage.  Resizing larger will reallocate
774  * the storage.
775  *
776  * Must be passed a locked chain.
777  *
778  * If you want the resize code to copy the data to the new block then the
779  * caller should lock the chain RESOLVE_MAYBE or RESOLVE_ALWAYS.
780  *
781  * If the caller already holds a logical buffer containing the data and
782  * intends to bdwrite() that buffer resolve with RESOLVE_NEVER.  The resize
783  * operation will then not copy the data.
784  *
785  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
786  * to avoid instantiating a device buffer that conflicts with the vnode
787  * data buffer.
788  *
789  * XXX flags currently ignored, uses chain->bp to detect data/no-data.
790  */
791 void
792 hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
793                      int nradix, int flags)
794 {
795         hammer2_mount_t *hmp = ip->hmp;
796         struct buf *nbp;
797         hammer2_off_t pbase;
798         size_t obytes;
799         size_t nbytes;
800         size_t bbytes;
801         int boff;
802         char *bdata;
803         int error;
804
805         /*
806          * Only data and indirect blocks can be resized for now.
807          * (The volu root, inodes, and freemap elements use a fixed size).
808          */
809         KKASSERT(chain != &hmp->vchain);
810         KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
811                  chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
812
813         /*
814          * Nothing to do if the element is already the proper size
815          */
816         obytes = chain->bytes;
817         nbytes = 1U << nradix;
818         if (obytes == nbytes)
819                 return;
820
821         /*
822          * Set MODIFIED and add a chain ref to prevent destruction.  Both
823          * modified flags share the same ref.
824          *
825          * If the chain is already marked MODIFIED then we can safely
826          * return the previous allocation to the pool without having to
827          * worry about snapshots.
828          */
829         if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
830                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
831                                               HAMMER2_CHAIN_MODIFY_TID);
832                 hammer2_chain_ref(hmp, chain);
833         } else {
834                 hammer2_freemap_free(hmp, chain->bref.data_off,
835                                      chain->bref.type);
836         }
837
838         /*
839          * Relocate the block, even if making it smaller (because different
840          * block sizes may be in different regions).
841          */
842         chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bref.type,
843                                                      nbytes);
844         chain->bytes = nbytes;
845         /*ip->delta_dcount += (ssize_t)(nbytes - obytes);*/ /* XXX atomic */
846
847         /*
848          * The device buffer may be larger than the allocation size.
849          */
850         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
851                 bbytes = HAMMER2_MINIOSIZE;
852         pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
853         boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
854
855         /*
856          * Only copy the data if resolved, otherwise the caller is
857          * responsible.
858          */
859         if (chain->bp) {
860                 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
861                          chain->bref.type == HAMMER2_BREF_TYPE_DATA);
862                 KKASSERT(chain != &hmp->vchain);        /* safety */
863
864                 /*
865                  * The getblk() optimization can only be used if the
866                  * physical block size matches the request.
867                  */
868                 if (nbytes == bbytes) {
869                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
870                         error = 0;
871                 } else {
872                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
873                         KKASSERT(error == 0);
874                 }
875                 bdata = (char *)nbp->b_data + boff;
876
877                 if (nbytes < obytes) {
878                         bcopy(chain->data, bdata, nbytes);
879                 } else {
880                         bcopy(chain->data, bdata, obytes);
881                         bzero(bdata + obytes, nbytes - obytes);
882                 }
883
884                 /*
885                  * NOTE: The INITIAL state of the chain is left intact.
886                  *       We depend on hammer2_chain_modify() to do the
887                  *       right thing.
888                  *
889                  * NOTE: We set B_NOCACHE to throw away the previous bp and
890                  *       any VM backing store, even if it was dirty.
891                  *       Otherwise we run the risk of a logical/device
892                  *       conflict on reallocation.
893                  */
894                 chain->bp->b_flags |= B_RELBUF | B_NOCACHE;
895                 brelse(chain->bp);
896                 chain->bp = nbp;
897                 chain->data = (void *)bdata;
898                 hammer2_chain_modify(hmp, chain, 0);
899         }
900
901         /*
902          * Make sure the chain is marked MOVED and SUBMOD is set in the
903          * parent(s) so the adjustments are picked up by flush.
904          */
905         if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
906                 hammer2_chain_ref(hmp, chain);
907                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
908         }
909         hammer2_chain_parent_setsubmod(hmp, chain);
910 }
911
912 /*
913  * Convert a locked chain that was retrieved read-only to read-write.
914  *
915  * If not already marked modified a new physical block will be allocated
916  * and assigned to the bref.
917  *
918  * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
919  *                   level or the COW operation will not work.
920  *
921  * Data blocks     - The chain is usually locked RESOLVE_NEVER so as not to
922  *                   run the data through the device buffers.
923  */
924 void
925 hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
926 {
927         struct buf *nbp;
928         int error;
929         hammer2_off_t pbase;
930         size_t bbytes;
931         size_t boff;
932         void *bdata;
933
934         /*
935          * Tells flush that modify_tid must be updated, otherwise only
936          * mirror_tid is updated.  This is the default.
937          */
938         if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
939                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFY_TID);
940
941         /*
942          * If the chain is already marked MODIFIED we can just return.
943          *
944          * However, it is possible that a prior lock/modify sequence
945          * retired the buffer.  During this lock/modify sequence MODIFIED
946          * may still be set but the buffer could wind up clean.  Since
947          * the caller is going to modify the buffer further we have to
948          * be sure that DIRTYBP is set again.
949          */
950         if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
951                 if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
952                     chain->bp == NULL) {
953                         goto skip1;
954                 }
955                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
956                 return;
957         }
958
959         /*
960          * Set MODIFIED and add a chain ref to prevent destruction.  Both
961          * modified flags share the same ref.
962          */
963         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
964         hammer2_chain_ref(hmp, chain);
965
966         /*
967          * We must allocate the copy-on-write block.
968          *
969          * If the data is embedded no other action is required.
970          *
971          * If the data is not embedded we acquire and clear the
972          * new block.  If chain->data is not NULL we then do the
973          * copy-on-write.  chain->data will then be repointed to the new
974          * buffer and the old buffer will be released.
975          *
976          * For newly created elements with no prior allocation we go
977          * through the copy-on-write steps except without the copying part.
978          */
979         if (chain != &hmp->vchain) {
980                 if ((hammer2_debug & 0x0001) &&
981                     (chain->bref.data_off & HAMMER2_OFF_MASK)) {
982                         kprintf("Replace %d\n", chain->bytes);
983                 }
984                 chain->bref.data_off =
985                         hammer2_freemap_alloc(hmp, chain->bref.type,
986                                               chain->bytes);
987                 /* XXX failed allocation */
988         }
989
990         /*
991          * If data instantiation is optional and the chain has no current
992          * data association (typical for DATA and newly-created INDIRECT
993          * elements), don't instantiate the buffer now.
994          */
995         if ((flags & HAMMER2_MODIFY_OPTDATA) && chain->bp == NULL)
996                 goto skip2;
997
998 skip1:
999         /*
1000          * Setting the DIRTYBP flag will cause the buffer to be dirtied or
1001          * written-out on unlock.  This bit is independent of the MODIFIED
1002          * bit because the chain may still need meta-data adjustments done
1003          * by virtue of MODIFIED for its parent, and the buffer can be
1004          * flushed out (possibly multiple times) by the OS before that.
1005          *
1006          * Clearing the INITIAL flag (for indirect blocks) indicates that
1007          * a zero-fill buffer has been instantiated.
1008          */
1009         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
1010         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1011
1012         /*
1013          * We currently should never instantiate a device buffer for a
1014          * file data chain.  (We definitely can for a freemap chain).
1015          */
1016         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
1017
1018         /*
1019          * Execute COW operation
1020          */
1021         switch(chain->bref.type) {
1022         case HAMMER2_BREF_TYPE_VOLUME:
1023         case HAMMER2_BREF_TYPE_INODE:
1024                 /*
1025                  * The data is embedded, no copy-on-write operation is
1026                  * needed.
1027                  */
1028                 KKASSERT(chain->bp == NULL);
1029                 break;
1030         case HAMMER2_BREF_TYPE_DATA:
1031         case HAMMER2_BREF_TYPE_INDIRECT:
1032         case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
1033         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1034         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1035                 /*
1036                  * Perform the copy-on-write operation
1037                  */
1038                 KKASSERT(chain != &hmp->vchain);        /* safety */
1039                 /*
1040                  * The device buffer may be larger than the allocation size.
1041                  */
1042                 if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
1043                         bbytes = HAMMER2_MINIOSIZE;
1044                 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
1045                 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
1046
1047                 /*
1048                  * The getblk() optimization can only be used if the
1049                  * physical block size matches the request.
1050                  */
1051                 if (chain->bytes == bbytes) {
1052                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
1053                         error = 0;
1054                 } else {
1055                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
1056                         KKASSERT(error == 0);
1057                 }
1058                 bdata = (char *)nbp->b_data + boff;
1059
1060                 /*
1061                  * Copy or zero-fill on write depending on whether
1062                  * chain->data exists or not.
1063                  */
1064                 if (chain->data) {
1065                         bcopy(chain->data, bdata, chain->bytes);
1066                         KKASSERT(chain->bp != NULL);
1067                 } else {
1068                         bzero(bdata, chain->bytes);
1069                 }
1070                 if (chain->bp) {
1071                         chain->bp->b_flags |= B_RELBUF;
1072                         brelse(chain->bp);
1073                 }
1074                 chain->bp = nbp;
1075                 chain->data = bdata;
1076                 break;
1077         default:
1078                 panic("hammer2_chain_modify: illegal non-embedded type %d",
1079                       chain->bref.type);
1080                 break;
1081
1082         }
1083 skip2:
1084         if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
1085                 hammer2_chain_parent_setsubmod(hmp, chain);
1086 }
1087
1088 /*
1089  * Mark the volume as having been modified.  This short-cut version
1090  * does not have to lock the volume's chain, which allows the ioctl
1091  * code to make adjustments to connections without deadlocking.
1092  */
1093 void
1094 hammer2_modify_volume(hammer2_mount_t *hmp)
1095 {
1096         hammer2_voldata_lock(hmp);
1097         atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
1098         hammer2_voldata_unlock(hmp);
1099 }
1100
1101 /*
1102  * Locate an in-memory chain.  The parent must be locked.  The in-memory
1103  * chain is returned or NULL if no in-memory chain is present.
1104  *
1105  * NOTE: A chain on-media might exist for this index when NULL is returned.
1106  */
1107 hammer2_chain_t *
1108 hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
1109 {
1110         hammer2_chain_t dummy;
1111         hammer2_chain_t *chain;
1112
1113         dummy.index = index;
1114         chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
1115         return (chain);
1116 }
1117
1118 /*
1119  * Return a locked chain structure with all associated data acquired.
1120  *
1121  * Caller must lock the parent on call, the returned child will be locked.
1122  */
1123 hammer2_chain_t *
1124 hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1125                   int index, int flags)
1126 {
1127         hammer2_blockref_t *bref;
1128         hammer2_chain_t *chain;
1129         hammer2_chain_t dummy;
1130         int how;
1131         ccms_state_t ostate;
1132
1133         /*
1134          * Figure out how to lock.  MAYBE can be used to optimized
1135          * the initial-create state for indirect blocks.
1136          */
1137         if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
1138                 how = HAMMER2_RESOLVE_NEVER;
1139         else
1140                 how = HAMMER2_RESOLVE_MAYBE;
1141         if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
1142                 how |= HAMMER2_RESOLVE_SHARED;
1143
1144         /*
1145          * First see if we have a (possibly modified) chain element cached
1146          * for this (parent, index).  Acquire the data if necessary.
1147          *
1148          * If chain->data is non-NULL the chain should already be marked
1149          * modified.
1150          */
1151         dummy.index = index;
1152         chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
1153         if (chain) {
1154                 if (flags & HAMMER2_LOOKUP_NOLOCK)
1155                         hammer2_chain_ref(hmp, chain);
1156                 else
1157                         hammer2_chain_lock(hmp, chain, how);
1158                 return(chain);
1159         }
1160
1161         /*
1162          * Upgrade our thread lock and handle any race that may have
1163          * occurred.  Leave the lock upgraded for the rest of the get.
1164          * We have to do this because we will be modifying the chain
1165          * structure.
1166          */
1167         ostate = ccms_thread_lock_upgrade(&parent->cst);
1168         chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
1169         if (chain) {
1170                 if (flags & HAMMER2_LOOKUP_NOLOCK)
1171                         hammer2_chain_ref(hmp, chain);
1172                 else
1173                         hammer2_chain_lock(hmp, chain, how);
1174                 ccms_thread_lock_restore(&parent->cst, ostate);
1175                 return(chain);
1176         }
1177
1178         /*
1179          * The get function must always succeed, panic if there's no
1180          * data to index.
1181          */
1182         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1183                 ccms_thread_lock_restore(&parent->cst, ostate);
1184                 panic("hammer2_chain_get: Missing bref(1)");
1185                 /* NOT REACHED */
1186         }
1187
1188         /*
1189          * Otherwise lookup the bref and issue I/O (switch on the parent)
1190          */
1191         switch(parent->bref.type) {
1192         case HAMMER2_BREF_TYPE_INODE:
1193                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1194                 bref = &parent->data->ipdata.u.blockset.blockref[index];
1195                 break;
1196         case HAMMER2_BREF_TYPE_INDIRECT:
1197         case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
1198         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1199                 KKASSERT(parent->data != NULL);
1200                 KKASSERT(index >= 0 &&
1201                          index < parent->bytes / sizeof(hammer2_blockref_t));
1202                 bref = &parent->data->npdata.blockref[index];
1203                 break;
1204         case HAMMER2_BREF_TYPE_VOLUME:
1205                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1206                 bref = &hmp->voldata.sroot_blockset.blockref[index];
1207                 break;
1208         default:
1209                 bref = NULL;
1210                 panic("hammer2_chain_get: unrecognized blockref type: %d",
1211                       parent->bref.type);
1212         }
1213         if (bref->type == 0) {
1214                 panic("hammer2_chain_get: Missing bref(2)");
1215                 /* NOT REACHED */
1216         }
1217
1218         /*
1219          * Allocate a chain structure representing the existing media
1220          * entry.
1221          *
1222          * The locking operation we do later will issue I/O to read it.
1223          */
1224         chain = hammer2_chain_alloc(hmp, bref);
1225
1226         /*
1227          * Link the chain into its parent.  Caller is expected to hold an
1228          * exclusive lock on the parent.
1229          */
1230         chain->parent = parent;
1231         chain->index = index;
1232         if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
1233                 panic("hammer2_chain_link: collision");
1234         atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
1235         KKASSERT(parent->refs > 0);
1236         atomic_add_int(&parent->refs, 1);       /* for red-black entry */
1237         ccms_thread_lock_restore(&parent->cst, ostate);
1238
1239 #if 0
1240         /*
1241          * Additional linkage for inodes.  Reuse the parent pointer to
1242          * find the parent directory.
1243          *
1244          * The ccms_inode is initialized from its parent directory.  The
1245          * chain of ccms_inode's is seeded by the mount code.
1246          */
1247         if (bref->type == HAMMER2_BREF_TYPE_INODE) {
1248                 ip = chain->u.ip;
1249                 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1250                         parent = parent->parent;
1251                 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
1252                         ip->pip = parent->u.ip;
1253                         ip->pmp = parent->u.ip->pmp;
1254                         ccms_cst_init(&ip->topo_cst, &ip->chain);
1255                 }
1256         }
1257 #endif
1258
1259         /*
1260          * Our new chain structure has already been referenced and locked
1261          * but the lock code handles the I/O so call it to resolve the data.
1262          * Then release one of our two exclusive locks.
1263          *
1264          * If NOLOCK is set the release will release the one-and-only lock.
1265          */
1266         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1267                 hammer2_chain_lock(hmp, chain, how);    /* recusive lock */
1268                 hammer2_chain_drop(hmp, chain);         /* excess ref */
1269         }
1270         ccms_thread_unlock(&chain->cst);                        /* from alloc */
1271
1272         return (chain);
1273 }
1274
1275 /*
1276  * Locate any key between key_beg and key_end inclusive.  (*parentp)
1277  * typically points to an inode but can also point to a related indirect
1278  * block and this function will recurse upwards and find the inode again.
1279  *
1280  * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
1281  *           WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
1282  *           WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
1283  *
1284  * (*parentp) must be exclusively locked and referenced and can be an inode
1285  * or an existing indirect block within the inode.
1286  *
1287  * On return (*parentp) will be modified to point at the deepest parent chain
1288  * element encountered during the search, as a helper for an insertion or
1289  * deletion.   The new (*parentp) will be locked and referenced and the old
1290  * will be unlocked and dereferenced (no change if they are both the same).
1291  *
1292  * The matching chain will be returned exclusively locked and referenced.
1293  *
1294  * NULL is returned if no match was found, but (*parentp) will still
1295  * potentially be adjusted.
1296  *
1297  * This function will also recurse up the chain if the key is not within the
1298  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
1299  * can simply allow (*parentp) to float inside the loop.
1300  */
1301 hammer2_chain_t *
1302 hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
1303                      hammer2_key_t key_beg, hammer2_key_t key_end,
1304                      int flags)
1305 {
1306         hammer2_chain_t *parent;
1307         hammer2_chain_t *chain;
1308         hammer2_chain_t *tmp;
1309         hammer2_blockref_t *base;
1310         hammer2_blockref_t *bref;
1311         hammer2_key_t scan_beg;
1312         hammer2_key_t scan_end;
1313         int count = 0;
1314         int i;
1315         int how_always = HAMMER2_RESOLVE_ALWAYS;
1316         int how_maybe = HAMMER2_RESOLVE_MAYBE;
1317
1318         if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) {
1319                 how_maybe |= HAMMER2_RESOLVE_SHARED;
1320                 how_always |= HAMMER2_RESOLVE_SHARED;
1321         }
1322
1323         /*
1324          * Recurse (*parentp) upward if necessary until the parent completely
1325          * encloses the key range or we hit the inode.
1326          */
1327         parent = *parentp;
1328         while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
1329                parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1330                 scan_beg = parent->bref.key;
1331                 scan_end = scan_beg +
1332                            ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1333                 if (key_beg >= scan_beg && key_end <= scan_end)
1334                         break;
1335                 hammer2_chain_ref(hmp, parent);         /* ref old parent */
1336                 hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
1337                 parent = parent->parent;
1338                                                         /* lock new parent */
1339                 hammer2_chain_lock(hmp, parent, how_maybe);
1340                 hammer2_chain_drop(hmp, *parentp);      /* drop old parent */
1341                 *parentp = parent;                      /* new parent */
1342         }
1343
1344 again:
1345         /*
1346          * Locate the blockref array.  Currently we do a fully associative
1347          * search through the array.
1348          */
1349         switch(parent->bref.type) {
1350         case HAMMER2_BREF_TYPE_INODE:
1351                 /*
1352                  * Special shortcut for embedded data returns the inode
1353                  * itself.  Callers must detect this condition and access
1354                  * the embedded data (the strategy code does this for us).
1355                  *
1356                  * This is only applicable to regular files and softlinks.
1357                  */
1358                 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1359                         if (flags & HAMMER2_LOOKUP_NOLOCK)
1360                                 hammer2_chain_ref(hmp, parent);
1361                         else
1362                                 hammer2_chain_lock(hmp, parent, how_always);
1363                         return (parent);
1364                 }
1365                 base = &parent->data->ipdata.u.blockset.blockref[0];
1366                 count = HAMMER2_SET_COUNT;
1367                 break;
1368         case HAMMER2_BREF_TYPE_INDIRECT:
1369         case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
1370         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1371                 /*
1372                  * Optimize indirect blocks in the INITIAL state to avoid
1373                  * I/O.
1374                  */
1375                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1376                         base = NULL;
1377                 } else {
1378                         if (parent->data == NULL)
1379                                 panic("parent->data is NULL");
1380                         base = &parent->data->npdata.blockref[0];
1381                 }
1382                 count = parent->bytes / sizeof(hammer2_blockref_t);
1383                 break;
1384         case HAMMER2_BREF_TYPE_VOLUME:
1385                 base = &hmp->voldata.sroot_blockset.blockref[0];
1386                 count = HAMMER2_SET_COUNT;
1387                 break;
1388         default:
1389                 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
1390                       parent->bref.type);
1391                 base = NULL;    /* safety */
1392                 count = 0;      /* safety */
1393         }
1394
1395         /*
1396          * If the element and key overlap we use the element.
1397          *
1398          * NOTE!  Deleted elements are effectively invisible.  A Deleted
1399          *        elements covers (makes invisible) any original media
1400          *        data.
1401          */
1402         bref = NULL;
1403         for (i = 0; i < count; ++i) {
1404                 tmp = hammer2_chain_find(hmp, parent, i);
1405                 if (tmp) {
1406                         if (tmp->flags & HAMMER2_CHAIN_DELETED)
1407                                 continue;
1408                         bref = &tmp->bref;
1409                         KKASSERT(bref->type != 0);
1410                 } else if (base == NULL || base[i].type == 0) {
1411                         continue;
1412                 } else {
1413                         bref = &base[i];
1414                 }
1415                 scan_beg = bref->key;
1416                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1417                 if (key_beg <= scan_end && key_end >= scan_beg)
1418                         break;
1419         }
1420         if (i == count) {
1421                 if (key_beg == key_end)
1422                         return (NULL);
1423                 return (hammer2_chain_next(hmp, parentp, NULL,
1424                                            key_beg, key_end, flags));
1425         }
1426
1427         /*
1428          * Acquire the new chain element.  If the chain element is an
1429          * indirect block we must search recursively.
1430          */
1431         chain = hammer2_chain_get(hmp, parent, i, flags);
1432         if (chain == NULL)
1433                 return (NULL);
1434
1435         /*
1436          * If the chain element is an indirect block it becomes the new
1437          * parent and we loop on it.
1438          *
1439          * The parent always has to be locked with at least RESOLVE_MAYBE,
1440          * so it might need a fixup if the caller passed incompatible flags.
1441          */
1442         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
1443             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1444                 hammer2_chain_unlock(hmp, parent);
1445                 *parentp = parent = chain;
1446                 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1447                         hammer2_chain_lock(hmp, chain, how_maybe);
1448                         hammer2_chain_drop(hmp, chain); /* excess ref */
1449                 } else if (flags & HAMMER2_LOOKUP_NODATA) {
1450                         hammer2_chain_lock(hmp, chain, how_maybe);
1451                         hammer2_chain_unlock(hmp, chain);
1452                 }
1453                 goto again;
1454         }
1455
1456         /*
1457          * All done, return chain
1458          */
1459         return (chain);
1460 }
1461
1462 /*
1463  * After having issued a lookup we can iterate all matching keys.
1464  *
1465  * If chain is non-NULL we continue the iteration from just after it's index.
1466  *
1467  * If chain is NULL we assume the parent was exhausted and continue the
1468  * iteration at the next parent.
1469  *
1470  * parent must be locked on entry and remains locked throughout.  chain's
1471  * lock status must match flags.
1472  */
1473 hammer2_chain_t *
1474 hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
1475                    hammer2_chain_t *chain,
1476                    hammer2_key_t key_beg, hammer2_key_t key_end,
1477                    int flags)
1478 {
1479         hammer2_chain_t *parent;
1480         hammer2_chain_t *tmp;
1481         hammer2_blockref_t *base;
1482         hammer2_blockref_t *bref;
1483         hammer2_key_t scan_beg;
1484         hammer2_key_t scan_end;
1485         int i;
1486         int how_maybe = HAMMER2_RESOLVE_MAYBE;
1487         int count;
1488
1489         if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
1490                 how_maybe |= HAMMER2_RESOLVE_SHARED;
1491
1492         parent = *parentp;
1493
1494 again:
1495         /*
1496          * Calculate the next index and recalculate the parent if necessary.
1497          */
1498         if (chain) {
1499                 /*
1500                  * Continue iteration within current parent.  If not NULL
1501                  * the passed-in chain may or may not be locked, based on
1502                  * the LOOKUP_NOLOCK flag (passed in as returned from lookup
1503                  * or a prior next).
1504                  */
1505                 i = chain->index + 1;
1506                 if (flags & HAMMER2_LOOKUP_NOLOCK)
1507                         hammer2_chain_drop(hmp, chain);
1508                 else
1509                         hammer2_chain_unlock(hmp, chain);
1510
1511                 /*
1512                  * Any scan where the lookup returned degenerate data embedded
1513                  * in the inode has an invalid index and must terminate.
1514                  */
1515                 if (chain == parent)
1516                         return(NULL);
1517                 chain = NULL;
1518         } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
1519                    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1520                 /*
1521                  * We reached the end of the iteration.
1522                  */
1523                 return (NULL);
1524         } else {
1525                 /*
1526                  * Continue iteration with next parent unless the current
1527                  * parent covers the range.
1528                  */
1529                 hammer2_chain_t *nparent;
1530
1531                 scan_beg = parent->bref.key;
1532                 scan_end = scan_beg +
1533                             ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1534                 if (key_beg >= scan_beg && key_end <= scan_end)
1535                         return (NULL);
1536
1537                 i = parent->index + 1;
1538                 nparent = parent->parent;
1539                 hammer2_chain_ref(hmp, nparent);        /* ref new parent */
1540                 hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
1541                                                         /* lock new parent */
1542                 hammer2_chain_lock(hmp, nparent, how_maybe);
1543                 hammer2_chain_drop(hmp, nparent);       /* drop excess ref */
1544                 *parentp = parent = nparent;
1545         }
1546
1547 again2:
1548         /*
1549          * Locate the blockref array.  Currently we do a fully associative
1550          * search through the array.
1551          */
1552         switch(parent->bref.type) {
1553         case HAMMER2_BREF_TYPE_INODE:
1554                 base = &parent->data->ipdata.u.blockset.blockref[0];
1555                 count = HAMMER2_SET_COUNT;
1556                 break;
1557         case HAMMER2_BREF_TYPE_INDIRECT:
1558         case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
1559         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1560                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1561                         base = NULL;
1562                 } else {
1563                         KKASSERT(parent->data != NULL);
1564                         base = &parent->data->npdata.blockref[0];
1565                 }
1566                 count = parent->bytes / sizeof(hammer2_blockref_t);
1567                 break;
1568         case HAMMER2_BREF_TYPE_VOLUME:
1569                 base = &hmp->voldata.sroot_blockset.blockref[0];
1570                 count = HAMMER2_SET_COUNT;
1571                 break;
1572         default:
1573                 panic("hammer2_chain_next: unrecognized blockref type: %d",
1574                       parent->bref.type);
1575                 base = NULL;    /* safety */
1576                 count = 0;      /* safety */
1577                 break;
1578         }
1579         KKASSERT(i <= count);
1580
1581         /*
1582          * Look for the key.  If we are unable to find a match and an exact
1583          * match was requested we return NULL.  If a range was requested we
1584          * run hammer2_chain_next() to iterate.
1585          *
1586          * NOTE!  Deleted elements are effectively invisible.  A Deleted
1587          *        elements covers (makes invisible) any original media
1588          *        data.
1589          */
1590         bref = NULL;
1591         while (i < count) {
1592                 tmp = hammer2_chain_find(hmp, parent, i);
1593                 if (tmp) {
1594                         if (tmp->flags & HAMMER2_CHAIN_DELETED) {
1595                                 ++i;
1596                                 continue;
1597                         }
1598                         bref = &tmp->bref;
1599                 } else if (base == NULL || base[i].type == 0) {
1600                         ++i;
1601                         continue;
1602                 } else {
1603                         bref = &base[i];
1604                 }
1605                 scan_beg = bref->key;
1606                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1607                 if (key_beg <= scan_end && key_end >= scan_beg)
1608                         break;
1609                 ++i;
1610         }
1611
1612         /*
1613          * If we couldn't find a match recurse up a parent to continue the
1614          * search.
1615          */
1616         if (i == count)
1617                 goto again;
1618
1619         /*
1620          * Acquire the new chain element.  If the chain element is an
1621          * indirect block we must search recursively.
1622          */
1623         chain = hammer2_chain_get(hmp, parent, i, flags);
1624         if (chain == NULL)
1625                 return (NULL);
1626
1627         /*
1628          * If the chain element is an indirect block it becomes the new
1629          * parent and we loop on it.
1630          *
1631          * The parent always has to be locked with at least RESOLVE_MAYBE,
1632          * so it might need a fixup if the caller passed incompatible flags.
1633          */
1634         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
1635             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1636                 hammer2_chain_unlock(hmp, parent);
1637                 *parentp = parent = chain;
1638                 chain = NULL;
1639                 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1640                         hammer2_chain_lock(hmp, parent, how_maybe);
1641                         hammer2_chain_drop(hmp, parent);        /* excess ref */
1642                 } else if (flags & HAMMER2_LOOKUP_NODATA) {
1643                         hammer2_chain_lock(hmp, parent, how_maybe);
1644                         hammer2_chain_unlock(hmp, parent);
1645                 }
1646                 i = 0;
1647                 goto again2;
1648         }
1649
1650         /*
1651          * All done, return chain
1652          */
1653         return (chain);
1654 }
1655
1656 /*
1657  * Create and return a new hammer2 system memory structure of the specified
1658  * key, type and size and insert it RELATIVE TO (PARENT).
1659  *
1660  * (parent) is typically either an inode or an indirect block, acquired
1661  * acquired as a side effect of issuing a prior failed lookup.  parent
1662  * must be locked and held.  Do not pass the inode chain to this function
1663  * unless that is the chain returned by the failed lookup.
1664  *
1665  * Non-indirect types will automatically allocate indirect blocks as required
1666  * if the new item does not fit in the current (parent).
1667  *
1668  * Indirect types will move a portion of the existing blockref array in
1669  * (parent) into the new indirect type and then use one of the free slots
1670  * to emplace the new indirect type.
1671  *
1672  * A new locked, referenced chain element is returned of the specified type.
1673  * The element may or may not have a data area associated with it:
1674  *
1675  *      VOLUME          not allowed here
1676  *      INODE           kmalloc()'d data area is set up
1677  *      INDIRECT        not allowed here
1678  *      DATA            no data area will be set-up (caller is expected
1679  *                      to have logical buffers, we don't want to alias
1680  *                      the data onto device buffers!).
1681  *
1682  * Requires an exclusively locked parent.
1683  */
1684 hammer2_chain_t *
1685 hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1686                      hammer2_chain_t *chain,
1687                      hammer2_key_t key, int keybits, int type, size_t bytes,
1688                      int *errorp)
1689 {
1690         hammer2_blockref_t dummy;
1691         hammer2_blockref_t *base;
1692         hammer2_chain_t dummy_chain;
1693         int unlock_parent = 0;
1694         int allocated = 0;
1695         int count;
1696         int i;
1697
1698         KKASSERT(ccms_thread_lock_owned(&parent->cst));
1699         *errorp = 0;
1700
1701         if (chain == NULL) {
1702                 /*
1703                  * First allocate media space and construct the dummy bref,
1704                  * then allocate the in-memory chain structure.
1705                  */
1706                 bzero(&dummy, sizeof(dummy));
1707                 dummy.type = type;
1708                 dummy.key = key;
1709                 dummy.keybits = keybits;
1710                 dummy.data_off = hammer2_allocsize(bytes);
1711                 dummy.methods = parent->bref.methods;
1712                 chain = hammer2_chain_alloc(hmp, &dummy);
1713                 allocated = 1;
1714
1715                 /*
1716                  * We do NOT set INITIAL here (yet).  INITIAL is only
1717                  * used for indirect blocks.
1718                  *
1719                  * Recalculate bytes to reflect the actual media block
1720                  * allocation.
1721                  */
1722                 bytes = (hammer2_off_t)1 <<
1723                         (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
1724                 chain->bytes = bytes;
1725
1726                 switch(type) {
1727                 case HAMMER2_BREF_TYPE_VOLUME:
1728                         panic("hammer2_chain_create: called with volume type");
1729                         break;
1730                 case HAMMER2_BREF_TYPE_INODE:
1731                         KKASSERT(bytes == HAMMER2_INODE_BYTES);
1732                         chain->data = kmalloc(sizeof(chain->data->ipdata),
1733                                               hmp->minode, M_WAITOK | M_ZERO);
1734                         break;
1735                 case HAMMER2_BREF_TYPE_INDIRECT:
1736                         panic("hammer2_chain_create: cannot be used to"
1737                               "create indirect block");
1738                         break;
1739                 case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
1740                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1741                         panic("hammer2_chain_create: cannot be used to"
1742                               "create freemap root or node");
1743                         break;
1744                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1745                 case HAMMER2_BREF_TYPE_DATA:
1746                 default:
1747                         /* leave chain->data NULL */
1748                         KKASSERT(chain->data == NULL);
1749                         break;
1750                 }
1751         } else {
1752                 /*
1753                  * Potentially update the chain's key/keybits.
1754                  */
1755                 chain->bref.key = key;
1756                 chain->bref.keybits = keybits;
1757         }
1758
1759 again:
1760         /*
1761          * Locate a free blockref in the parent's array
1762          */
1763         switch(parent->bref.type) {
1764         case HAMMER2_BREF_TYPE_INODE:
1765                 KKASSERT((parent->data->ipdata.op_flags &
1766                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
1767                 KKASSERT(parent->data != NULL);
1768                 base = &parent->data->ipdata.u.blockset.blockref[0];
1769                 count = HAMMER2_SET_COUNT;
1770                 break;
1771         case HAMMER2_BREF_TYPE_INDIRECT:
1772         case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
1773         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1774                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1775                         base = NULL;
1776                 } else {
1777                         KKASSERT(parent->data != NULL);
1778                         base = &parent->data->npdata.blockref[0];
1779                 }
1780                 count = parent->bytes / sizeof(hammer2_blockref_t);
1781                 break;
1782         case HAMMER2_BREF_TYPE_VOLUME:
1783                 KKASSERT(parent->data != NULL);
1784                 base = &hmp->voldata.sroot_blockset.blockref[0];
1785                 count = HAMMER2_SET_COUNT;
1786                 break;
1787         default:
1788                 panic("hammer2_chain_create: unrecognized blockref type: %d",
1789                       parent->bref.type);
1790                 count = 0;
1791                 break;
1792         }
1793
1794         /*
1795          * Scan for an unallocated bref, also skipping any slots occupied
1796          * by in-memory chain elements that may not yet have been updated
1797          * in the parent's bref array.
1798          */
1799         bzero(&dummy_chain, sizeof(dummy_chain));
1800         for (i = 0; i < count; ++i) {
1801                 if (base == NULL) {
1802                         dummy_chain.index = i;
1803                         if (RB_FIND(hammer2_chain_tree,
1804                                     &parent->rbhead, &dummy_chain) == NULL) {
1805                                 break;
1806                         }
1807                 } else if (base[i].type == 0) {
1808                         dummy_chain.index = i;
1809                         if (RB_FIND(hammer2_chain_tree,
1810                                     &parent->rbhead, &dummy_chain) == NULL) {
1811                                 break;
1812                         }
1813                 }
1814         }
1815
1816         /*
1817          * If no free blockref could be found we must create an indirect
1818          * block and move a number of blockrefs into it.  With the parent
1819          * locked we can safely lock each child in order to move it without
1820          * causing a deadlock.
1821          *
1822          * This may return the new indirect block or the old parent depending
1823          * on where the key falls.  NULL is returned on error.  The most
1824          * typical error is EAGAIN (flush conflict during chain move).
1825          */
1826         if (i == count) {
1827                 hammer2_chain_t *nparent;
1828
1829                 nparent = hammer2_chain_create_indirect(hmp, parent,
1830                                                         key, keybits,
1831                                                         errorp);
1832                 if (nparent == NULL) {
1833                         if (allocated)
1834                                 hammer2_chain_free(hmp, chain);
1835                         chain = NULL;
1836                         goto done;
1837                 }
1838                 if (parent != nparent) {
1839                         if (unlock_parent)
1840                                 hammer2_chain_unlock(hmp, parent);
1841                         parent = nparent;
1842                         unlock_parent = 1;
1843                 }
1844                 goto again;
1845         }
1846
1847         /*
1848          * Link the chain into its parent.  Later on we will have to set
1849          * the MOVED bit in situations where we don't mark the new chain
1850          * as being modified.
1851          */
1852         if (chain->parent != NULL)
1853                 panic("hammer2: hammer2_chain_create: chain already connected");
1854         KKASSERT(chain->parent == NULL);
1855         chain->parent = parent;
1856         chain->index = i;
1857         if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
1858                 panic("hammer2_chain_link: collision");
1859         atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
1860         KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
1861         KKASSERT(parent->refs > 0);
1862         atomic_add_int(&parent->refs, 1);
1863
1864 #if 0
1865         /*
1866          * Additional linkage for inodes.  Reuse the parent pointer to
1867          * find the parent directory.
1868          *
1869          * Cumulative adjustments are inherited on [re]attach and will
1870          * propagate up the tree on the next flush.
1871          *
1872          * The ccms_inode is initialized from its parent directory.  The
1873          * chain of ccms_inode's is seeded by the mount code.
1874          */
1875         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1876                 hammer2_chain_t *scan = parent;
1877                 hammer2_inode_t *ip = chain->u.ip;
1878
1879                 while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1880                         scan = scan->parent;
1881                 if (scan->bref.type == HAMMER2_BREF_TYPE_INODE) {
1882                         ip->pip = scan->u.ip;
1883                         ip->pmp = scan->u.ip->pmp;
1884                         ip->pip->delta_icount += ip->ip_data.inode_count;
1885                         ip->pip->delta_dcount += ip->ip_data.data_count;
1886                         ++ip->pip->delta_icount;
1887                         ccms_cst_init(&ip->topo_cst, &ip->chain);
1888                 }
1889         }
1890 #endif
1891         /*
1892          * (allocated) indicates that this is a newly-created chain element
1893          * rather than a renamed chain element.  In this situation we want
1894          * to place the chain element in the MODIFIED state.
1895          *
1896          * The data area will be set up as follows:
1897          *
1898          *      VOLUME          not allowed here.
1899          *
1900          *      INODE           embedded data are will be set-up.
1901          *
1902          *      INDIRECT        not allowed here.
1903          *
1904          *      DATA            no data area will be set-up (caller is expected
1905          *                      to have logical buffers, we don't want to alias
1906          *                      the data onto device buffers!).
1907          */
1908         if (allocated) {
1909                 switch(chain->bref.type) {
1910                 case HAMMER2_BREF_TYPE_DATA:
1911                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1912                         hammer2_chain_modify(hmp, chain,
1913                                              HAMMER2_MODIFY_OPTDATA);
1914                         break;
1915                 case HAMMER2_BREF_TYPE_INDIRECT:
1916                 case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
1917                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1918                         /* not supported in this function */
1919                         panic("hammer2_chain_create: bad type");
1920                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1921                         hammer2_chain_modify(hmp, chain,
1922                                              HAMMER2_MODIFY_OPTDATA);
1923                         break;
1924                 default:
1925                         hammer2_chain_modify(hmp, chain, 0);
1926                         break;
1927                 }
1928         } else {
1929                 /*
1930                  * When reconnecting inodes we have to call setsubmod()
1931                  * to ensure that its state propagates up the newly
1932                  * connected parent.
1933                  *
1934                  * Make sure MOVED is set but do not update bref_flush.  If
1935                  * the chain is undergoing modification bref_flush will be
1936                  * updated when it gets flushed.  If it is not then the
1937                  * bref may not have been flushed yet and we do not want to
1938                  * set MODIFIED here as this could result in unnecessary
1939                  * reallocations.
1940                  */
1941                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1942                         hammer2_chain_ref(hmp, chain);
1943                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1944                 }
1945                 hammer2_chain_parent_setsubmod(hmp, chain);
1946         }
1947
1948 done:
1949         if (unlock_parent)
1950                 hammer2_chain_unlock(hmp, parent);
1951         return (chain);
1952 }
1953
1954 /*
1955  * Create an indirect block that covers one or more of the elements in the
1956  * current parent.  Either returns the existing parent with no locking or
1957  * ref changes or returns the new indirect block locked and referenced
1958  * and leaving the original parent lock/ref intact as well.
1959  *
1960  * If an error occurs, NULL is returned and *errorp is set to the error.
1961  * EAGAIN can be returned to indicate a flush collision which requires the
1962  * caller to retry.
1963  *
1964  * The returned chain depends on where the specified key falls.
1965  *
1966  * The key/keybits for the indirect mode only needs to follow three rules:
1967  *
1968  * (1) That all elements underneath it fit within its key space and
1969  *
1970  * (2) That all elements outside it are outside its key space.
1971  *
1972  * (3) When creating the new indirect block any elements in the current
1973  *     parent that fit within the new indirect block's keyspace must be
1974  *     moved into the new indirect block.
1975  *
1976  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
1977  *     keyspace the the current parent, but lookup/iteration rules will
1978  *     ensure (and must ensure) that rule (2) for all parents leading up
1979  *     to the nearest inode or the root volume header is adhered to.  This
1980  *     is accomplished by always recursing through matching keyspaces in
1981  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
1982  *
1983  * The current implementation calculates the current worst-case keyspace by
1984  * iterating the current parent and then divides it into two halves, choosing
1985  * whichever half has the most elements (not necessarily the half containing
1986  * the requested key).
1987  *
1988  * We can also opt to use the half with the least number of elements.  This
1989  * causes lower-numbered keys (aka logical file offsets) to recurse through
1990  * fewer indirect blocks and higher-numbered keys to recurse through more.
1991  * This also has the risk of not moving enough elements to the new indirect
1992  * block and being forced to create several indirect blocks before the element
1993  * can be inserted.
1994  *
1995  * Must be called with an exclusively locked parent.
1996  */
1997 static
1998 hammer2_chain_t *
1999 hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
2000                               hammer2_key_t create_key, int create_bits,
2001                               int *errorp)
2002 {
2003         hammer2_blockref_t *base;
2004         hammer2_blockref_t *bref;
2005         hammer2_chain_t *chain;
2006         hammer2_chain_t *ichain;
2007         hammer2_chain_t dummy;
2008         hammer2_key_t key = create_key;
2009         int keybits = create_bits;
2010         int locount = 0;
2011         int hicount = 0;
2012         int count;
2013         int nbytes;
2014         int i;
2015
2016         /*
2017          * Calculate the base blockref pointer or NULL if the chain
2018          * is known to be empty.  We need to calculate the array count
2019          * for RB lookups either way.
2020          */
2021         KKASSERT(ccms_thread_lock_owned(&parent->cst));
2022         *errorp = 0;
2023
2024         hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
2025         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2026                 base = NULL;
2027
2028                 switch(parent->bref.type) {
2029                 case HAMMER2_BREF_TYPE_INODE:
2030                         count = HAMMER2_SET_COUNT;
2031                         break;
2032                 case HAMMER2_BREF_TYPE_INDIRECT:
2033                 case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
2034                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2035                         count = parent->bytes / sizeof(hammer2_blockref_t);
2036                         break;
2037                 case HAMMER2_BREF_TYPE_VOLUME:
2038                         count = HAMMER2_SET_COUNT;
2039                         break;
2040                 default:
2041                         panic("hammer2_chain_create_indirect: "
2042                               "unrecognized blockref type: %d",
2043                               parent->bref.type);
2044                         count = 0;
2045                         break;
2046                 }
2047         } else {
2048                 switch(parent->bref.type) {
2049                 case HAMMER2_BREF_TYPE_INODE:
2050                         base = &parent->data->ipdata.u.blockset.blockref[0];
2051                         count = HAMMER2_SET_COUNT;
2052                         break;
2053                 case HAMMER2_BREF_TYPE_INDIRECT:
2054                 case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
2055                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2056                         base = &parent->data->npdata.blockref[0];
2057                         count = parent->bytes / sizeof(hammer2_blockref_t);
2058                         break;
2059                 case HAMMER2_BREF_TYPE_VOLUME:
2060                         base = &hmp->voldata.sroot_blockset.blockref[0];
2061                         count = HAMMER2_SET_COUNT;
2062                         break;
2063                 default:
2064                         panic("hammer2_chain_create_indirect: "
2065                               "unrecognized blockref type: %d",
2066                               parent->bref.type);
2067                         count = 0;
2068                         break;
2069                 }
2070         }
2071
2072         /*
2073          * Scan for an unallocated bref, also skipping any slots occupied
2074          * by in-memory chain elements which may not yet have been updated
2075          * in the parent's bref array.
2076          */
2077         bzero(&dummy, sizeof(dummy));
2078         for (i = 0; i < count; ++i) {
2079                 int nkeybits;
2080
2081                 dummy.index = i;
2082                 chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
2083                 if (chain) {
2084                         /*
2085                          * NOTE! CHAIN_DELETED elements have to be adjusted
2086                          *       too, they cannot be ignored.
2087                          */
2088                         bref = &chain->bref;
2089                 } else if (base && base[i].type) {
2090                         bref = &base[i];
2091                 } else {
2092                         continue;
2093                 }
2094
2095                 /*
2096                  * Expand our calculated key range (key, keybits) to fit
2097                  * the scanned key.  nkeybits represents the full range
2098                  * that we will later cut in half (two halves @ nkeybits - 1).
2099                  */
2100                 nkeybits = keybits;
2101                 if (nkeybits < bref->keybits)
2102                         nkeybits = bref->keybits;
2103                 while (nkeybits < 64 &&
2104                        (~(((hammer2_key_t)1 << nkeybits) - 1) &
2105                         (key ^ bref->key)) != 0) {
2106                         ++nkeybits;
2107                 }
2108
2109                 /*
2110                  * If the new key range is larger we have to determine
2111                  * which side of the new key range the existing keys fall
2112                  * under by checking the high bit, then collapsing the
2113                  * locount into the hicount or vise-versa.
2114                  */
2115                 if (keybits != nkeybits) {
2116                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
2117                                 hicount += locount;
2118                                 locount = 0;
2119                         } else {
2120                                 locount += hicount;
2121                                 hicount = 0;
2122                         }
2123                         keybits = nkeybits;
2124                 }
2125
2126                 /*
2127                  * The newly scanned key will be in the lower half or the
2128                  * higher half of the (new) key range.
2129                  */
2130                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
2131                         ++hicount;
2132                 else
2133                         ++locount;
2134         }
2135
2136         /*
2137          * Adjust keybits to represent half of the full range calculated
2138          * above (radix 63 max)
2139          */
2140         --keybits;
2141
2142         /*
2143          * Select whichever half contains the most elements.  Theoretically
2144          * we can select either side as long as it contains at least one
2145          * element (in order to ensure that a free slot is present to hold
2146          * the indirect block).
2147          */
2148         key &= ~(((hammer2_key_t)1 << keybits) - 1);
2149         if (hammer2_indirect_optimize) {
2150                 /*
2151                  * Insert node for least number of keys, this will arrange
2152                  * the first few blocks of a large file or the first few
2153                  * inodes in a directory with fewer indirect blocks when
2154                  * created linearly.
2155                  */
2156                 if (hicount < locount && hicount != 0)
2157                         key |= (hammer2_key_t)1 << keybits;
2158                 else
2159                         key &= ~(hammer2_key_t)1 << keybits;
2160         } else {
2161                 /*
2162                  * Insert node for most number of keys, best for heavily
2163                  * fragmented files.
2164                  */
2165                 if (hicount > locount)
2166                         key |= (hammer2_key_t)1 << keybits;
2167                 else
2168                         key &= ~(hammer2_key_t)1 << keybits;
2169         }
2170
2171         /*
2172          * How big should our new indirect block be?  It has to be at least
2173          * as large as its parent.
2174          */
2175         if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
2176                 nbytes = HAMMER2_IND_BYTES_MIN;
2177         else
2178                 nbytes = HAMMER2_IND_BYTES_MAX;
2179         if (nbytes < count * sizeof(hammer2_blockref_t))
2180                 nbytes = count * sizeof(hammer2_blockref_t);
2181
2182         /*
2183          * Ok, create our new indirect block
2184          */
2185         switch(parent->bref.type) {
2186         case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
2187         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2188                 dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
2189                 break;
2190         default:
2191                 dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
2192                 break;
2193         }
2194         dummy.bref.key = key;
2195         dummy.bref.keybits = keybits;
2196         dummy.bref.data_off = hammer2_allocsize(nbytes);
2197         dummy.bref.methods = parent->bref.methods;
2198         ichain = hammer2_chain_alloc(hmp, &dummy.bref);
2199         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
2200
2201         /*
2202          * Iterate the original parent and move the matching brefs into
2203          * the new indirect block.
2204          */
2205         for (i = 0; i < count; ++i) {
2206                 /*
2207                  * For keying purposes access the bref from the media or
2208                  * from our in-memory cache.  In cases where the in-memory
2209                  * cache overrides the media the keyrefs will be the same
2210                  * anyway so we can avoid checking the cache when the media
2211                  * has a key.
2212                  */
2213                 dummy.index = i;
2214                 chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
2215                 if (chain) {
2216                         /*
2217                          * NOTE! CHAIN_DELETED elements have to be adjusted
2218                          *       too, they cannot be ignored.
2219                          */
2220                         bref = &chain->bref;
2221                 } else if (base && base[i].type) {
2222                         bref = &base[i];
2223                 } else {
2224                         if (ichain->index < 0)
2225                                 ichain->index = i;
2226                         continue;
2227                 }
2228
2229                 /*
2230                  * Skip keys not in the chosen half (low or high), only bit
2231                  * (keybits - 1) needs to be compared but for safety we
2232                  * will compare all msb bits plus that bit again.
2233                  */
2234                 if ((~(((hammer2_key_t)1 << keybits) - 1) &
2235                     (key ^ bref->key)) != 0) {
2236                         continue;
2237                 }
2238
2239                 /*
2240                  * This element is being moved from the parent, its slot
2241                  * is available for our new indirect block.
2242                  */
2243                 if (ichain->index < 0)
2244                         ichain->index = i;
2245
2246                 /*
2247                  * Load the new indirect block by acquiring or allocating
2248                  * the related chain entries, then simply move them to the
2249                  * new parent (ichain).  We cannot move chains which are
2250                  * undergoing flushing and will break out of the loop in
2251                  * that case.
2252                  *
2253                  * When adjusting the parent/child relationship we must
2254                  * set the MOVED bit but we do NOT update bref_flush
2255                  * because otherwise we might synchronize a bref that has
2256                  * not yet been flushed.  We depend on chain's bref_flush
2257                  * either being correct or the chain being in a MODIFIED
2258                  * state.
2259                  *
2260                  * We do not want to set MODIFIED here as this would result
2261                  * in unnecessary reallocations.
2262                  *
2263                  * We must still set SUBMODIFIED in the parent but we do
2264                  * that after the loop.
2265                  *
2266                  * WARNING! chain->cst.spin must be held when chain->parent is
2267                  *          modified, even though we own the full blown lock,
2268                  *          to deal with setsubmod and rename races.
2269                  */
2270                 chain = hammer2_chain_get(hmp, parent, i,
2271                                           HAMMER2_LOOKUP_NODATA);
2272                 if (chain->flushing) {
2273                         hammer2_chain_unlock(hmp, chain);
2274                         break;
2275                 }
2276
2277                 spin_lock(&chain->cst.spin);
2278                 RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
2279                 if (RB_INSERT(hammer2_chain_tree, &ichain->rbhead, chain))
2280                         panic("hammer2_chain_create_indirect: collision");
2281                 chain->parent = ichain;
2282                 spin_unlock(&chain->cst.spin);
2283
2284                 if (base)
2285                         bzero(&base[i], sizeof(base[i]));
2286                 atomic_add_int(&parent->refs, -1);
2287                 atomic_add_int(&ichain->refs, 1);
2288                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2289                         hammer2_chain_ref(hmp, chain);
2290                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2291                 }
2292                 hammer2_chain_unlock(hmp, chain);
2293                 KKASSERT(parent->refs > 0);
2294                 chain = NULL;
2295         }
2296
2297         /*
2298          * If we hit a chain that is undergoing flushing we're screwed and
2299          * we have to duno the whole mess.  Since ichain has not been linked
2300          * in yet, the moved chains are not reachable and will not have been
2301          * disposed of.
2302          *
2303          * WARNING! This code is pretty hairy because the flusher is sitting
2304          *          on the parent processing one of the children that we
2305          *          haven't yet moved, and will do a RB_NEXT loop on that
2306          *          child.  So the children we're moving back have to be
2307          *          returned to the same place in the iteration that they
2308          *          were removed from.
2309          */
2310         if (i != count) {
2311                 kprintf("hammer2_chain_create_indirect: EAGAIN\n");
2312                 *errorp = EAGAIN;
2313                 while ((chain = RB_ROOT(&ichain->rbhead)) != NULL) {
2314                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_NEVER);
2315                         KKASSERT(chain->flushing == 0);
2316                         RB_REMOVE(hammer2_chain_tree, &ichain->rbhead, chain);
2317                         if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
2318                                 panic("hammer2_chain_create_indirect: collision");
2319                         chain->parent = parent;
2320                         atomic_add_int(&parent->refs, 1);
2321                         atomic_add_int(&ichain->refs, -1);
2322                         /* MOVED bit might have been inherited, cannot undo */
2323                         hammer2_chain_unlock(hmp, chain);
2324                 }
2325                 hammer2_chain_free(hmp, ichain);
2326                 return(NULL);
2327         }
2328
2329         /*
2330          * Insert the new indirect block into the parent now that we've
2331          * cleared out some entries in the parent.  We calculated a good
2332          * insertion index in the loop above (ichain->index).
2333          *
2334          * We don't have to set MOVED here because we mark ichain modified
2335          * down below (so the normal modified -> flush -> set-moved sequence
2336          * applies).
2337          */
2338         KKASSERT(ichain->index >= 0);
2339         if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, ichain))
2340                 panic("hammer2_chain_create_indirect: ichain insertion");
2341         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_ONRBTREE);
2342         ichain->parent = parent;
2343         atomic_add_int(&parent->refs, 1);
2344
2345         /*
2346          * Mark the new indirect block modified after insertion, which
2347          * will propagate up through parent all the way to the root and
2348          * also allocate the physical block in ichain for our caller,
2349          * and assign ichain->data to a pre-zero'd space (because there
2350          * is not prior data to copy into it).
2351          *
2352          * We have to set SUBMODIFIED in ichain's flags manually so the
2353          * flusher knows it has to recurse through it to get to all of
2354          * our moved blocks, then call setsubmod() to set the bit
2355          * recursively.
2356          */
2357         hammer2_chain_modify(hmp, ichain, HAMMER2_MODIFY_OPTDATA);
2358         hammer2_chain_parent_setsubmod(hmp, ichain);
2359         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2360
2361         /*
2362          * Figure out what to return.
2363          */
2364         if (create_bits > keybits) {
2365                 /*
2366                  * Key being created is way outside the key range,
2367                  * return the original parent.
2368                  */
2369                 hammer2_chain_unlock(hmp, ichain);
2370         } else if (~(((hammer2_key_t)1 << keybits) - 1) &
2371                    (create_key ^ key)) {
2372                 /*
2373                  * Key being created is outside the key range,
2374                  * return the original parent.
2375                  */
2376                 hammer2_chain_unlock(hmp, ichain);
2377         } else {
2378                 /*
2379                  * Otherwise its in the range, return the new parent.
2380                  * (leave both the new and old parent locked).
2381                  */
2382                 parent = ichain;
2383         }
2384
2385         return(parent);
2386 }
2387
2388 /*
2389  * Physically delete the specified chain element.  Note that inodes with
2390  * open descriptors should not be deleted (as with other filesystems) until
2391  * the last open descriptor is closed.
2392  *
2393  * This routine will remove the chain element from its parent and potentially
2394  * also recurse upward and delete indirect blocks which become empty as a
2395  * side effect.
2396  *
2397  * The caller must pass a pointer to the chain's parent, also locked and
2398  * referenced.  (*parentp) will be modified in a manner similar to a lookup
2399  * or iteration when indirect blocks are also deleted as a side effect.
2400  *
2401  * XXX This currently does not adhere to the MOVED flag protocol in that
2402  *     the removal is immediately indicated in the parent's blockref[]
2403  *     array.
2404  *
2405  * Must be called with an exclusively locked parent and chain.
2406  */
2407 void
2408 hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
2409                      hammer2_chain_t *chain, int retain)
2410 {
2411         hammer2_blockref_t *base;
2412         int count;
2413
2414         if (chain->parent != parent)
2415                 panic("hammer2_chain_delete: parent mismatch");
2416         KKASSERT(ccms_thread_lock_owned(&parent->cst));
2417
2418         /*
2419          * Mark the parent modified so our base[] pointer remains valid
2420          * while we move entries.  For the optimized indirect block
2421          * case mark the parent moved instead.
2422          *
2423          * Calculate the blockref reference in the parent
2424          */
2425         switch(parent->bref.type) {
2426         case HAMMER2_BREF_TYPE_INODE:
2427                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2428                 base = &parent->data->ipdata.u.blockset.blockref[0];
2429                 count = HAMMER2_SET_COUNT;
2430                 break;
2431         case HAMMER2_BREF_TYPE_INDIRECT:
2432         case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
2433         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2434                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA |
2435                                                   HAMMER2_MODIFY_NO_MODIFY_TID);
2436                 if (parent->flags & HAMMER2_CHAIN_INITIAL)
2437                         base = NULL;
2438                 else
2439                         base = &parent->data->npdata.blockref[0];
2440                 count = parent->bytes / sizeof(hammer2_blockref_t);
2441                 break;
2442         case HAMMER2_BREF_TYPE_VOLUME:
2443                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2444                 base = &hmp->voldata.sroot_blockset.blockref[0];
2445                 count = HAMMER2_SET_COUNT;
2446                 break;
2447         default:
2448                 panic("hammer2_chain_delete: unrecognized blockref type: %d",
2449                       parent->bref.type);
2450                 count = 0;
2451                 break;
2452         }
2453         KKASSERT(chain->index >= 0 && chain->index < count);
2454
2455         /*
2456          * We may not be able to immediately disconnect the chain if a
2457          * flush is in progress.  If retain is non-zero we MUST disconnect
2458          * the chain now and callers are responsible for making sure that
2459          * flushing is zero.
2460          */
2461         spin_lock(&chain->cst.spin);
2462         if ((retain || chain->flushing == 0) &&
2463             (chain->flags & HAMMER2_CHAIN_ONRBTREE)) {
2464                 if (base)
2465                         bzero(&base[chain->index], sizeof(*base));
2466                 KKASSERT(chain->flushing == 0);
2467                 RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
2468                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
2469                 atomic_add_int(&parent->refs, -1);   /* for red-black entry */
2470                 chain->index = -1;
2471                 chain->parent = NULL;
2472         }
2473         spin_unlock(&chain->cst.spin);
2474
2475         /*
2476          * Cumulative adjustments must be propagated to the parent inode
2477          * when deleting and synchronized to ip.  This occurs even if we
2478          * cannot detach the chain from its parent.
2479          *
2480          * NOTE:  We do not propagate ip->delta_*count to the parent because
2481          *        these represent adjustments that have not yet been
2482          *        propagated upward, so we don't need to remove them from
2483          *        the parent.
2484          *
2485          * Clear the pointer to the parent inode.
2486          */
2487         if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
2488             chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2489                 KKASSERT(chain->u.ip == NULL);
2490         }
2491
2492         /*
2493          * If retain is 0 the deletion is permanent.  Because the chain is
2494          * no longer connected to the topology a flush will have no
2495          * visibility into it.  We must dispose of the references related
2496          * to the MODIFIED and MOVED flags, otherwise the ref count will
2497          * never transition to 0.
2498          *
2499          * If retain is non-zero the deleted element is likely an inode
2500          * which the vnops frontend will mark DESTROYED and flush.  In that
2501          * situation we must retain the flags for any open file descriptors
2502          * on the (removed) inode.  The final close will destroy the
2503          * disconnected chain.
2504          */
2505         if (retain == 0) {
2506                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
2507                 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
2508                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
2509                         hammer2_chain_drop(hmp, chain);
2510                 }
2511                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2512                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2513                         hammer2_chain_drop(hmp, chain);
2514                 }
2515         }
2516
2517         /*
2518          * The chain is still likely referenced, possibly even by a vnode
2519          * (if an inode), so defer further action until the chain gets
2520          * dropped.
2521          */
2522 }
2523
2524 void
2525 hammer2_chain_wait(hammer2_mount_t *hmp, hammer2_chain_t *chain)
2526 {
2527         tsleep(chain, 0, "chnflw", 1);
2528 }