hammer2 - Cleanup various races, better flush
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem handles direct and indirect block searches, recursions,
37  * creation, and deletion.  Chains of blockrefs are tracked and modifications
 * are flagged for propagation... eventually all the way back to the volume
39  * header.
40  */
41
42 #include <sys/cdefs.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/types.h>
46 #include <sys/lock.h>
47 #include <sys/uuid.h>
48
49 #include "hammer2.h"
50
51 static int hammer2_indirect_optimize;   /* XXX SYSCTL */
52
53 static hammer2_chain_t *hammer2_chain_create_indirect(
54                         hammer2_mount_t *hmp, hammer2_chain_t *parent,
55                         hammer2_key_t key, int keybits);
56 static void hammer2_chain_movelock(hammer2_mount_t *hmp,
57                         hammer2_chain_t *chain);
58 static void hammer2_chain_moveunlock(hammer2_mount_t *hmp,
59                         hammer2_chain_t *chain);
60 static void hammer2_chain_movewait(hammer2_mount_t *hmp,
61                         hammer2_chain_t *chain);
62
63 /*
64  * We use a red-black tree to guarantee safe lookups under shared locks.
65  */
66 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
67
68 int
69 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
70 {
71         return(chain2->index - chain1->index);
72 }
73
74 /*
75  * Recursively mark the parent chain elements so flushes can find
76  * modified elements.  Stop when we hit a chain already flagged
77  * SUBMODIFIED, but ignore the SUBMODIFIED bit that might be set
78  * in chain itself.
79  *
80  * SUBMODIFIED is not set on the chain passed in.
81  *
82  * The chain->cst.spin lock can be held to stabilize the chain->parent
83  * pointer.  The first parent is stabilized by virtue of chain being
84  * fully locked.
85  */
static void
hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	/*
	 * The first ->parent read is stable because the caller holds
	 * chain fully locked (see function header comment); no spinlock
	 * is required for this initial test.
	 */
	parent = chain->parent;
	if (parent && (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
		spin_lock(&parent->cst.spin);
		for (;;) {
			/*
			 * NOTE(review): only the first parent is tested
			 * for SUBMODIFIED above; once inside the loop we
			 * flag and ascend all the way to the root even if
			 * an ancestor is already flagged — confirm this
			 * matches the "stop when already flagged" intent
			 * described in the header comment.
			 */
			atomic_set_int(&parent->flags,
				       HAMMER2_CHAIN_SUBMODIFIED);
			if ((chain = parent->parent) == NULL)
				break;
			/*
			 * Hand-over-hand ascent: acquire the grandparent's
			 * spinlock before releasing the parent's so the
			 * ->parent linkage cannot be changed out from
			 * under us by a concurrent rename/setsubmod.
			 */
			spin_lock(&chain->cst.spin);	/* upward interlock */
			spin_unlock(&parent->cst.spin);
			parent = chain;
		}
		spin_unlock(&parent->cst.spin);
	}
}
106
107 /*
108  * Allocate a new disconnected chain element representing the specified
109  * bref.  The chain element is locked exclusively and refs is set to 1.
110  *
111  * This essentially allocates a system memory structure representing one
112  * of the media structure types, including inodes.
113  */
hammer2_chain_t *
hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
{
	hammer2_chain_t *chain;
	hammer2_inode_t *ip;
	hammer2_indblock_t *np;
	hammer2_data_t *dp;
	/* media size is encoded as a power-of-2 radix in data_off's low bits */
	u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);

	/*
	 * Construct the appropriate system structure.  The chain element
	 * is embedded in a type-specific wrapper (inode, indirect block,
	 * or data block) and chain->u points back at the wrapper so it
	 * can be recovered later (see hammer2_chain_free()).
	 *
	 * Inodes come from the hmp->minode zone, everything else from
	 * hmp->mchain.  Both panic branches do not return, so chain is
	 * always valid below.
	 */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_INODE:
		ip = kmalloc(sizeof(*ip), hmp->minode, M_WAITOK | M_ZERO);
		chain = &ip->chain;
		chain->u.ip = ip;
		ip->hmp = hmp;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
		chain = &np->chain;
		chain->u.np = np;
		break;
	case HAMMER2_BREF_TYPE_DATA:
		dp = kmalloc(sizeof(*dp), hmp->mchain, M_WAITOK | M_ZERO);
		chain = &dp->chain;
		chain->u.dp = dp;
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		/* the volume header's chain is never allocated here */
		chain = NULL;
		panic("hammer2_chain_alloc volume type illegal for op");
	default:
		chain = NULL;
		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
		      bref->type);
	}

	/*
	 * Only set bref_flush if the bref has a real media offset, otherwise
	 * the caller has to wait for the chain to be modified/block-allocated
	 * before a blockref can be synchronized with its (future) parent.
	 */
	chain->bref = *bref;
	if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
		chain->bref_flush = *bref;
	chain->index = -1;		/* not yet assigned */
	chain->refs = 1;
	chain->bytes = bytes;
	/* returned locked exclusive with refs == 1, per the contract above */
	ccms_cst_init(&chain->cst, chain);
	ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);

	return (chain);
}
168
169 /*
170  * Deallocate a chain (the step before freeing it).  Remove the chain from
171  * its parent's tree.
172  *
173  * Caller must hold the parent and the chain exclusively locked, and
174  * chain->refs must be 0.
175  *
176  * This function unlocks, removes, and destroys chain, and will recursively
 * destroy any sub-chains under chain (whose refs must also be 0 at this
178  * point).
179  *
180  * parent can be NULL.
181  */
static void
hammer2_chain_dealloc(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	hammer2_inode_t *ip;
	hammer2_chain_t *parent;
	hammer2_chain_t *child;

	/*
	 * NOTE: All movelock holders hold a ref so it shouldn't be possible
	 *	 for movelock to be non-zero here.
	 */
	KKASSERT(chain->refs == 0);
	KKASSERT((chain->flags &
		  (HAMMER2_CHAIN_MOVED | HAMMER2_CHAIN_MODIFIED)) == 0);
	KKASSERT(chain->movelock == 0);

	/*
	 * Remember the inode pointer (if any) so inode-specific teardown
	 * can be performed after the generic teardown below.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
		ip = chain->u.ip;
	else
		ip = NULL;

	/*
	 * If the sub-tree is not empty all the elements on it must have
	 * 0 refs and be deallocatable.  Each child is locked exclusively
	 * before recursing, matching this function's entry contract.
	 */
	while ((child = RB_ROOT(&chain->rbhead)) != NULL) {
		ccms_thread_lock(&child->cst, CCMS_STATE_EXCLUSIVE);
		hammer2_chain_dealloc(hmp, child);
	}

	/*
	 * If the DELETED flag is not set the chain must be removed from
	 * its parent's tree.  (A non-DELETED chain is presumed to have a
	 * non-NULL parent here; RB_REMOVE dereferences it.)
	 *
	 * WARNING! chain->cst.spin must be held when chain->parent is
	 *	    modified, even though we own the full blown lock,
	 *	    to deal with setsubmod and rename races.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
		spin_lock(&chain->cst.spin);	/* shouldn't be needed */
		parent = chain->parent;
		RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		if (ip)
			ip->pip = NULL;
		chain->parent = NULL;
		spin_unlock(&chain->cst.spin);
	}

	/*
	 * When cleaning out a hammer2_inode we must
	 * also clean out the related ccms_inode.
	 */
	if (ip)
		ccms_cst_uninit(&ip->topo_cst);
	hammer2_chain_free(hmp, chain);
}
239
240 /*
241  * Free a disconnected chain element
242  */
void
hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	void *mem;

	/*
	 * Inodes and the volume header keep their data embedded in the
	 * related structure; NULL the pointer so the generic assertion
	 * below does not trip over that embedded data.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
	    chain->bref.type == HAMMER2_BREF_TYPE_VOLUME) {
		chain->data = NULL;
	}

	/* caller must already have disposed of any buffer and vnode refs */
	KKASSERT(chain->bp == NULL);
	KKASSERT(chain->data == NULL);
	KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
		 chain->u.ip->vp == NULL);
	ccms_thread_unlock(&chain->cst);
	KKASSERT(chain->cst.count == 0);
	KKASSERT(chain->cst.upgrade == 0);

	/*
	 * Free the wrapper structure.  u.mem is presumably the generic
	 * alias of the u.ip/u.np/u.dp union members set in
	 * hammer2_chain_alloc() — TODO confirm against hammer2.h.  The
	 * zone choice mirrors the allocation (inodes from minode,
	 * everything else from mchain).
	 */
	if ((mem = chain->u.mem) != NULL) {
		chain->u.mem = NULL;
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
			kfree(mem, hmp->minode);
		else
			kfree(mem, hmp->mchain);
	}
}
269
270 /*
271  * Add a reference to a chain element, preventing its destruction.
272  *
273  * The parent chain must be locked shared or exclusive or otherwise be
274  * stable and already have a reference.
275  */
void
hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	u_int refs;

	while (chain) {
		/* snapshot refs; the CAS below validates the snapshot */
		refs = chain->refs;
		/*
		 * NOTE(review): refs is u_int so this assertion is a
		 * tautology; it was likely meant for a signed field or
		 * to re-check the live value — confirm intent.
		 */
		KKASSERT(chain->refs >= 0);
		cpu_ccfence();
		if (refs == 0) {
			/*
			 * 0 -> 1 transition must bump the refs on the parent
			 * too.  The caller has stabilized the parent.
			 */
			if (atomic_cmpset_int(&chain->refs, 0, 1)) {
				chain = chain->parent;
				KKASSERT(chain == NULL || chain->refs > 0);
			}
			/* retry or continue along the parent chain */
		} else {
			/*
			 * N -> N+1
			 */
			if (atomic_cmpset_int(&chain->refs, refs, refs + 1))
				break;
			/* retry */
		}
	}
}
305
306 /*
307  * Drop the callers reference to the chain element.  If the ref count
308  * reaches zero we attempt to recursively drop the parent.
309  *
310  * MOVED and MODIFIED elements hold additional references so it should not
311  * be possible for the count on a modified element to drop to 0.
312  *
313  * The chain element must NOT be locked by the caller on the 1->0 transition.
314  *
315  * The parent might or might not be locked by the caller.  If we are unable
316  * to lock the parent on the 1->0 transition the destruction of the chain
317  * will be deferred but we still recurse upward and drop the ref on the
318  * parent (see the lastdrop() function)
319  */
320 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_mount_t *hmp,
321                                                 hammer2_chain_t *chain);
322
void
hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	u_int refs;

	while (chain) {
		/* snapshot refs; the CAS below validates the snapshot */
		refs = chain->refs;
		cpu_ccfence();
		KKASSERT(refs > 0);
		if (refs == 1) {
			/*
			 * (1) lastdrop successfully drops the chain to 0
			 *     refs and may or may not have destroyed it.
			 *     lastdrop will return the parent so we can
			 *     recursively drop the implied ref from the
			 *     1->0 transition.
			 *
			 * (2) lastdrop fails to transition refs from 1 to 0
			 *     and returns the same chain, we retry.
			 */
			chain = hammer2_chain_lastdrop(hmp, chain);
		} else {
			if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
				/*
				 * Succeeded, count did not reach zero so
				 * cut out of the loop.
				 */
				break;
			}
			/* retry the same chain */
		}
	}
}
356
357 /*
358  * Handle SMP races during the last drop.  We must obtain a lock on
359  * chain->parent to stabilize the last pointer reference to chain
360  * (if applicable).  This reference does not have a parallel ref count,
361  * that is idle chains in the topology can have a ref count of 0.
362  *
363  * The 1->0 transition implies a ref on the parent.
364  */
static
hammer2_chain_t *
hammer2_chain_lastdrop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	/*
	 * Stabilize chain->parent with the chain cst's spinlock.
	 * (parent can be NULL here).
	 */
	spin_lock(&chain->cst.spin);
	parent = chain->parent;

	/*
	 * CST spin locks are allowed to be held recursively bottom-up
	 * (whereas full CST thread locks can only be held recursively
	 * top-down).
	 *
	 * This makes things fairly easy.  We still must not block while
	 * obtaining the CST lock on the parent.  If this fails we have to
	 * unwind.
	 */
	if (parent) {
		if (ccms_thread_lock_nonblock(&parent->cst,
					      CCMS_STATE_EXCLUSIVE)) {
			/*
			 * Parent cst lock attempt failed.  If we can still
			 * perform the 1->0 transition the chain is left at
			 * 0 refs WITHOUT being destroyed (deferred
			 * destruction, see the comment above this
			 * function's prototype) and we return parent so
			 * the caller can recurse the implied ref drop.
			 * Otherwise return the same chain so the caller
			 * retries.
			 */
			if (atomic_cmpset_int(&chain->refs, 1, 0)) {
				spin_unlock(&chain->cst.spin);	/* success */
				return(parent);
			} else {
				spin_unlock(&chain->cst.spin);	/* failure */
				return(chain);
			}
		}
	}

	/*
	 * With the parent now held we control the last pointer reference
	 * to chain ONLY IF this is the 1->0 drop.  If we fail to transition
	 * from 1->0 we unwind and retry at chain.
	 */
	spin_unlock(&chain->cst.spin);
	if (!atomic_cmpset_int(&chain->refs, 1, 0)) {
		if (parent)
			ccms_thread_unlock(&parent->cst);
		return(chain);
	}

	/*
	 * The flusher is allowed to set movelock on a child and then release
	 * the parent's lock, and ultimately also release the child's lock
	 * while still holding it referenced.  The reference should prevent
	 * this case from being hit.
	 */
	KKASSERT(chain->movelock == 0);

	/*
	 * Ok, we succeeded.  We now own the implied ref on the parent
	 * associated with the 1->0 transition of the child.  It should not
	 * be possible for ANYTHING to access the child now, as we own the
	 * lock on the parent, so we should be able to safely lock the
	 * child and destroy it.
	 */
	ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
	hammer2_chain_dealloc(hmp, chain);

	/*
	 * We want to return parent with its implied ref to the caller
	 * to recurse and drop the parent.
	 */
	if (parent)
		ccms_thread_unlock(&parent->cst);
	return (parent);
}
439
440 /*
441  * Ref and lock a chain element, acquiring its data with I/O if necessary,
442  * and specify how you would like the data to be resolved.
443  *
444  * Returns 0 on success or an error code if the data could not be acquired.
445  * The chain element is locked either way.
446  *
447  * The lock is allowed to recurse, multiple locking ops will aggregate
448  * the requested resolve types.  Once data is assigned it will not be
449  * removed until the last unlock.
450  *
451  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
452  *                         (typically used to avoid device/logical buffer
453  *                          aliasing for data)
454  *
455  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
456  *                         the INITIAL-create state (indirect blocks only).
457  *
458  *                         Do not resolve data elements for DATA chains.
459  *                         (typically used to avoid device/logical buffer
460  *                          aliasing for data)
461  *
462  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
463  *
464  * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
465  *                         it will be locked exclusive.
466  *
467  * HAMMER2_RESOLVE_MAYDELETE - The caller may attempt to delete the element
468  *                             being locked.  We block as long as the
469  *                             parent's movelock is non-zero.
470  *
471  * NOTE: Embedded elements (volume header, inodes) are always resolved
472  *       regardless.
473  *
474  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
475  *       element will instantiate and zero its buffer, and flush it on
476  *       release.
477  *
478  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
479  *       so as not to instantiate a device buffer, which could alias against
480  *       a logical file buffer.  However, if ALWAYS is specified the
481  *       device buffer will be instantiated anyway.
482  */
int
hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
{
	hammer2_blockref_t *bref;
	hammer2_off_t pbase;	/* device-aligned base of the backing block */
	hammer2_off_t peof;	/* clustering end-of-file boundary */
	ccms_state_t ostate;
	size_t boff;		/* offset of our data within the buffer */
	size_t bbytes;		/* physical buffer size */
	int error;
	char *bdata;

	/*
	 * Ref and lock the element.  Recursive locks are allowed.
	 *
	 * For MAYDELETE callers we must also wait for the parent's
	 * movelock to drain; drop the CST lock (the ref is retained)
	 * and block in movewait, then retry the lock.
	 */
	hammer2_chain_ref(hmp, chain);
	for (;;) {
		if (how & HAMMER2_RESOLVE_SHARED)
			ccms_thread_lock(&chain->cst, CCMS_STATE_SHARED);
		else
			ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
		if ((how & HAMMER2_RESOLVE_MAYDELETE) == 0 ||
		    chain->parent == NULL ||
		    chain->parent->movelock == 0) {
			break;
		}
		ccms_thread_unlock(&chain->cst);
		hammer2_chain_movewait(hmp, chain);
	}

	/*
	 * If we already have a valid data pointer no further action is
	 * necessary.
	 */
	if (chain->data)
		return (0);

	/*
	 * Do we have to resolve the data?
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return(0);
	case HAMMER2_RESOLVE_MAYBE:
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return(0);
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return(0);
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
		break;
	}

	/*
	 * Upgrade to an exclusive lock so we can safely manipulate the
	 * buffer cache.  If another thread got to it before us we
	 * can just return.
	 */
	ostate = ccms_thread_lock_upgrade(&chain->cst);
	if (chain->data) {
		ccms_thread_lock_restore(&chain->cst, ostate);
		return (0);
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MINALLOCSIZE (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
		bbytes = HAMMER2_MINIOSIZE;
	pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
	peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
	boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
	KKASSERT(pbase != 0);

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request
	 * (no read needed: INITIAL data is zero-filled below).
	 */
	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
	    chain->bytes == bbytes) {
		chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
		error = 0;
	} else if (hammer2_cluster_enable) {
		error = cluster_read(hmp->devvp, peof, pbase, bbytes,
				     HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
				     &chain->bp);
	} else {
		error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
	}

	if (error) {
		kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
			(intmax_t)pbase, error);
		bqrelse(chain->bp);
		chain->bp = NULL;
		ccms_thread_lock_restore(&chain->cst, ostate);
		return (error);
	}

	/*
	 * Zero the data area if the chain is in the INITIAL-create state.
	 * Mark the buffer for bdwrite().
	 */
	bdata = (char *)chain->bp->b_data + boff;
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		bzero(bdata, chain->bytes);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
	}

	/*
	 * Setup the data pointer, either pointing it to an embedded data
	 * structure and copying the data from the buffer, or pointing it
	 * into the buffer.
	 *
	 * The buffer is not retained when copying to an embedded data
	 * structure in order to avoid potential deadlocks or recursions
	 * on the same physical buffer.
	 */
	switch (bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
		/*
		 * Copy data from bp to embedded buffer
		 */
		panic("hammer2_chain_lock: called on unresolved volume header");
#if 0
		/* NOT YET */
		KKASSERT(pbase == 0);
		KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
		bcopy(bdata, &hmp->voldata, chain->bytes);
		chain->data = (void *)&hmp->voldata;
		bqrelse(chain->bp);
		chain->bp = NULL;
#endif
		break;
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Copy data from bp to embedded buffer, do not retain the
		 * device buffer.
		 */
		bcopy(bdata, &chain->u.ip->ip_data, chain->bytes);
		chain->data = (void *)&chain->u.ip->ip_data;
		bqrelse(chain->bp);
		chain->bp = NULL;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	default:
		/*
		 * Point data at the device buffer and leave bp intact.
		 */
		chain->data = (void *)bdata;
		break;
	}

	/*
	 * Make sure the bp is not specifically owned by this thread before
	 * restoring to a possibly shared lock, so another hammer2 thread
	 * can release it.
	 */
	if (chain->bp)
		BUF_KERNPROC(chain->bp);
	ccms_thread_lock_restore(&chain->cst, ostate);
	return (0);
}
659
660 /*
661  * Similar to the normal chain_lock but handles HAMMER2_RESOLVE_MAYDELETE
662  * in a more intelligent fashion.
663  */
664 int
665 hammer2_chain_lock_pair(hammer2_mount_t *hmp, hammer2_chain_t *parent,
666                         hammer2_chain_t *chain, int how)
667 {
668         int error;
669
670         error = hammer2_chain_lock(hmp, parent,
671                                    how & ~HAMMER2_RESOLVE_MAYDELETE);
672         if (error == 0) {
673                 error = hammer2_chain_lock(hmp, chain, how);
674                 if (error) {
675                         hammer2_chain_unlock(hmp, parent);
676                 }
677         }
678         return error;
679 }
680
681 /*
682  * Unlock and deref a chain element.
683  *
684  * On the last lock release any non-embedded data (chain->bp) will be
685  * retired.
686  */
void
hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	long *counterp;

	/*
	 * Release the CST lock but with a special 1->0 transition case.
	 *
	 * Returns non-zero if lock references remain.  When zero is
	 * returned the last lock reference is retained and any shared
	 * lock is upgraded to an exclusive lock for final disposition.
	 */
	if (ccms_thread_unlock_zero(&chain->cst)) {
		KKASSERT(chain->refs > 1);
		atomic_add_int(&chain->refs, -1);
		return;
	}

	/*
	 * Shortcut the case if the data is embedded or not resolved.
	 *
	 * Do NOT null-out pointers to embedded data (e.g. inode).
	 *
	 * The DIRTYBP flag is non-applicable in this situation and can
	 * be cleared to keep the flags state clean.
	 */
	if (chain->bp == NULL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		ccms_thread_unlock(&chain->cst);
		hammer2_chain_drop(hmp, chain);
		return;
	}

	/*
	 * Statistics.  Only dirty buffers are counted; IOFLUSH selects
	 * the ioa_* (async/flush) counters over the iod_* counters.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
		;
	} else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
			counterp = &hammer2_ioa_file_write;
			break;
		case HAMMER2_BREF_TYPE_INODE:
			counterp = &hammer2_ioa_meta_write;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			counterp = &hammer2_ioa_indr_write;
			break;
		default:
			counterp = &hammer2_ioa_volu_write;
			break;
		}
		++*counterp;
	} else {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
			counterp = &hammer2_iod_file_write;
			break;
		case HAMMER2_BREF_TYPE_INODE:
			counterp = &hammer2_iod_meta_write;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			counterp = &hammer2_iod_indr_write;
			break;
		default:
			counterp = &hammer2_iod_volu_write;
			break;
		}
		++*counterp;
	}

	/*
	 * Clean out the bp.
	 *
	 * If a device buffer was used for data be sure to destroy the
	 * buffer when we are done to avoid aliases (XXX what about the
	 * underlying VM pages?).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
		chain->bp->b_flags |= B_RELBUF;

	/*
	 * The DIRTYBP flag tracks whether we have to bdwrite() the buffer
	 * or not.  The flag will get re-set when chain_modify() is called,
	 * even if MODIFIED is already set, allowing the OS to retire the
	 * buffer independent of a hammer2 flush.
	 */
	chain->data = NULL;
	if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
			/* write asynchronously and toss the buffer */
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_IOFLUSH);
			chain->bp->b_flags |= B_RELBUF;
			cluster_awrite(chain->bp);
		} else {
			/* delayed write, buffer stays cached */
			chain->bp->b_flags |= B_CLUSTEROK;
			bdwrite(chain->bp);
		}
	} else {
		if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
			/* not dirty here but toss the buffer anyway */
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_IOFLUSH);
			chain->bp->b_flags |= B_RELBUF;
			brelse(chain->bp);
		} else {
			/* bp might still be dirty */
			bqrelse(chain->bp);
		}
	}
	chain->bp = NULL;
	ccms_thread_unlock(&chain->cst);
	hammer2_chain_drop(hmp, chain);
}
802
/*
 * Called on a locked chain element.  Allows the caller to temporarily
 * unlock the element (caller must be sure to hold an extra ref on it
 * to prevent destruction), thus allowing other accessors to lock it,
 * but disallowing deletions while any move-lock is held.
 *
 * Pairs with hammer2_chain_moveunlock().  hammer2_chain_movewait()
 * blocks until the count returns to zero.
 */
static void
hammer2_chain_movelock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	/*
	 * Bump the move-lock count.  Bit 31 of movelock is reserved as
	 * a waiter flag (see movewait/moveunlock).  hmp is currently
	 * unused by this function.
	 */
	atomic_add_int(&chain->movelock, 1);
}
814
815 static void
816 hammer2_chain_moveunlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
817 {
818         u_int omovelock;
819         u_int nmovelock;
820
821         for (;;) {
822                 omovelock = chain->movelock;
823                 cpu_ccfence();
824                 nmovelock = (omovelock - 1) & 0x7FFFFFFF;
825                 if (atomic_cmpset_int(&chain->movelock, omovelock, nmovelock)) {
826                         if (omovelock & 0x80000000)
827                                 wakeup(&chain->movelock);
828                         break;
829                 }
830         }
831 }
832
833 static void
834 hammer2_chain_movewait(hammer2_mount_t *hmp, hammer2_chain_t *chain)
835 {
836         u_int omovelock;
837         u_int nmovelock;
838
839         for (;;) {
840                 omovelock = chain->movelock;
841                 cpu_ccfence();
842                 if (omovelock == 0)
843                         break;
844                 nmovelock = omovelock | 0x80000000;
845                 tsleep_interlock(&chain->movelock, 0);
846                 if (atomic_cmpset_int(&chain->movelock, omovelock, nmovelock)) {
847                         tsleep(&chain->movelock, PINTERLOCKED, "movelk", 0);
848                 }
849         }
850 }
851
852 /*
853  * Resize the chain's physical storage allocation.  Chains can be resized
854  * smaller without reallocating the storage.  Resizing larger will reallocate
855  * the storage.
856  *
857  * Must be passed a locked chain.
858  *
859  * If you want the resize code to copy the data to the new block then the
860  * caller should lock the chain RESOLVE_MAYBE or RESOLVE_ALWAYS.
861  *
862  * If the caller already holds a logical buffer containing the data and
863  * intends to bdwrite() that buffer resolve with RESOLVE_NEVER.  The resize
864  * operation will then not copy the data.
865  *
866  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
867  * to avoid instantiating a device buffer that conflicts with the vnode
868  * data buffer.
869  *
870  * XXX flags currently ignored, uses chain->bp to detect data/no-data.
871  */
void
hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
		     int nradix, int flags)
{
	hammer2_mount_t *hmp = ip->hmp;
	struct buf *nbp;	/* device buffer backing the new allocation */
	hammer2_off_t pbase;	/* device-aligned base of the new block */
	size_t obytes;		/* old allocation size */
	size_t nbytes;		/* new allocation size (1 << nradix) */
	size_t bbytes;		/* device buffer size (>= HAMMER2_MINIOSIZE) */
	int boff;		/* offset of our block within the buffer */
	char *bdata;
	int error;

	/*
	 * Only data and indirect blocks can be resized for now
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);

	/*
	 * Nothing to do if the element is already the proper size.
	 * nradix specifies the new size as a power of 2.
	 */
	obytes = chain->bytes;
	nbytes = 1U << nradix;
	if (obytes == nbytes)
		return;

	/*
	 * Set MODIFIED and add a chain ref to prevent destruction.  Both
	 * modified flags share the same ref.
	 *
	 * If the chain is already marked MODIFIED then we can safely
	 * return the previous allocation to the pool without having to
	 * worry about snapshots.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
					      HAMMER2_CHAIN_MODIFY_TID);
		hammer2_chain_ref(hmp, chain);
	} else {
		hammer2_freemap_free(hmp, chain->bref.data_off,
				     chain->bref.type);
	}

	/*
	 * Relocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 */
	chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bref.type,
						     nbytes);
	chain->bytes = nbytes;
	ip->delta_dcount += (ssize_t)(nbytes - obytes); /* XXX atomic */

	/*
	 * The device buffer may be larger than the allocation size.
	 * Compute the aligned buffer base (pbase) and the offset of our
	 * block within it (boff).
	 */
	if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
		bbytes = HAMMER2_MINIOSIZE;
	pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
	boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);

	/*
	 * Only copy the data if resolved, otherwise the caller is
	 * responsible.
	 */
	if (chain->bp) {
		KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
			 chain->bref.type == HAMMER2_BREF_TYPE_DATA);
		KKASSERT(chain != &hmp->vchain);	/* safety */

		/*
		 * The getblk() optimization can only be used if the
		 * physical block size matches the request.
		 */
		if (nbytes == bbytes) {
			nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
			error = 0;
		} else {
			error = bread(hmp->devvp, pbase, bbytes, &nbp);
			KKASSERT(error == 0);
		}
		bdata = (char *)nbp->b_data + boff;

		/*
		 * Copy the old data into the new block, zero-filling
		 * any grown portion.
		 */
		if (nbytes < obytes) {
			bcopy(chain->data, bdata, nbytes);
		} else {
			bcopy(chain->data, bdata, obytes);
			bzero(bdata + obytes, nbytes - obytes);
		}

		/*
		 * NOTE: The INITIAL state of the chain is left intact.
		 *       We depend on hammer2_chain_modify() to do the
		 *       right thing.
		 *
		 * NOTE: We set B_NOCACHE to throw away the previous bp and
		 *       any VM backing store, even if it was dirty.
		 *       Otherwise we run the risk of a logical/device
		 *       conflict on reallocation.
		 */
		chain->bp->b_flags |= B_RELBUF | B_NOCACHE;
		brelse(chain->bp);
		chain->bp = nbp;
		chain->data = (void *)bdata;
		hammer2_chain_modify(hmp, chain, 0);
	}

	/*
	 * Make sure the chain is marked MOVED and SUBMOD is set in the
	 * parent(s) so the adjustments are picked up by flush.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(hmp, chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
	hammer2_chain_parent_setsubmod(hmp, chain);
}
991
992 /*
993  * Convert a locked chain that was retrieved read-only to read-write.
994  *
995  * If not already marked modified a new physical block will be allocated
996  * and assigned to the bref.
997  *
998  * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
999  *                   level or the COW operation will not work.
1000  *
1001  * Data blocks     - The chain is usually locked RESOLVE_NEVER so as not to
1002  *                   run the data through the device buffers.
1003  */
1004 void
1005 hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
1006 {
1007         struct buf *nbp;
1008         int error;
1009         hammer2_off_t pbase;
1010         size_t bbytes;
1011         size_t boff;
1012         void *bdata;
1013
1014         /*
1015          * Tells flush that modify_tid must be updated, otherwise only
1016          * mirror_tid is updated.  This is the default.
1017          */
1018         if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
1019                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFY_TID);
1020
1021         /*
1022          * If the chain is already marked MODIFIED we can just return.
1023          *
1024          * However, it is possible that a prior lock/modify sequence
1025          * retired the buffer.  During this lock/modify sequence MODIFIED
1026          * may still be set but the buffer could wind up clean.  Since
1027          * the caller is going to modify the buffer further we have to
1028          * be sure that DIRTYBP is set again.
1029          */
1030         if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
1031                 if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
1032                     chain->bp == NULL) {
1033                         goto skip1;
1034                 }
1035                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
1036                 return;
1037         }
1038
1039         /*
1040          * Set MODIFIED and add a chain ref to prevent destruction.  Both
1041          * modified flags share the same ref.
1042          */
1043         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
1044         hammer2_chain_ref(hmp, chain);
1045
1046         /*
1047          * We must allocate the copy-on-write block.
1048          *
1049          * If the data is embedded no other action is required.
1050          *
1051          * If the data is not embedded we acquire and clear the
1052          * new block.  If chain->data is not NULL we then do the
1053          * copy-on-write.  chain->data will then be repointed to the new
1054          * buffer and the old buffer will be released.
1055          *
1056          * For newly created elements with no prior allocation we go
1057          * through the copy-on-write steps except without the copying part.
1058          */
1059         if (chain != &hmp->vchain) {
1060                 if ((hammer2_debug & 0x0001) &&
1061                     (chain->bref.data_off & HAMMER2_OFF_MASK)) {
1062                         kprintf("Replace %d\n", chain->bytes);
1063                 }
1064                 chain->bref.data_off =
1065                         hammer2_freemap_alloc(hmp, chain->bref.type,
1066                                               chain->bytes);
1067                 /* XXX failed allocation */
1068         }
1069
1070         /*
1071          * If data instantiation is optional and the chain has no current
1072          * data association (typical for DATA and newly-created INDIRECT
1073          * elements), don't instantiate the buffer now.
1074          */
1075         if ((flags & HAMMER2_MODIFY_OPTDATA) && chain->bp == NULL)
1076                 goto skip2;
1077
1078 skip1:
1079         /*
1080          * Setting the DIRTYBP flag will cause the buffer to be dirtied or
1081          * written-out on unlock.  This bit is independent of the MODIFIED
1082          * bit because the chain may still need meta-data adjustments done
1083          * by virtue of MODIFIED for its parent, and the buffer can be
1084          * flushed out (possibly multiple times) by the OS before that.
1085          *
1086          * Clearing the INITIAL flag (for indirect blocks) indicates that
1087          * a zero-fill buffer has been instantiated.
1088          */
1089         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
1090         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1091
1092         /*
1093          * We currently should never instantiate a device buffer for a
1094          * data chain.
1095          */
1096         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
1097
1098         /*
1099          * Execute COW operation
1100          */
1101         switch(chain->bref.type) {
1102         case HAMMER2_BREF_TYPE_VOLUME:
1103         case HAMMER2_BREF_TYPE_INODE:
1104                 /*
1105                  * The data is embedded, no copy-on-write operation is
1106                  * needed.
1107                  */
1108                 KKASSERT(chain->bp == NULL);
1109                 break;
1110         case HAMMER2_BREF_TYPE_DATA:
1111         case HAMMER2_BREF_TYPE_INDIRECT:
1112                 /*
1113                  * Perform the copy-on-write operation
1114                  */
1115                 KKASSERT(chain != &hmp->vchain);        /* safety */
1116                 /*
1117                  * The device buffer may be larger than the allocation size.
1118                  */
1119                 if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
1120                         bbytes = HAMMER2_MINIOSIZE;
1121                 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
1122                 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
1123
1124                 /*
1125                  * The getblk() optimization can only be used if the
1126                  * physical block size matches the request.
1127                  */
1128                 if (chain->bytes == bbytes) {
1129                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
1130                         error = 0;
1131                 } else {
1132                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
1133                         KKASSERT(error == 0);
1134                 }
1135                 bdata = (char *)nbp->b_data + boff;
1136
1137                 /*
1138                  * Copy or zero-fill on write depending on whether
1139                  * chain->data exists or not.
1140                  */
1141                 if (chain->data) {
1142                         bcopy(chain->data, bdata, chain->bytes);
1143                         KKASSERT(chain->bp != NULL);
1144                 } else {
1145                         bzero(bdata, chain->bytes);
1146                 }
1147                 if (chain->bp) {
1148                         chain->bp->b_flags |= B_RELBUF;
1149                         brelse(chain->bp);
1150                 }
1151                 chain->bp = nbp;
1152                 chain->data = bdata;
1153                 break;
1154         default:
1155                 panic("hammer2_chain_modify: illegal non-embedded type %d",
1156                       chain->bref.type);
1157                 break;
1158
1159         }
1160 skip2:
1161         if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
1162                 hammer2_chain_parent_setsubmod(hmp, chain);
1163 }
1164
/*
 * Mark the volume as having been modified.  This short-cut version
 * does not have to lock the volume's chain, which allows the ioctl
 * code to make adjustments to connections without deadlocking.
 */
void
hammer2_modify_volume(hammer2_mount_t *hmp)
{
	/*
	 * Set MODIFIED_AUX on the volume chain while holding only the
	 * volume data lock (not the chain's ccms lock).
	 */
	hammer2_voldata_lock(hmp);
	atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
	hammer2_voldata_unlock(hmp);
}
1177
1178 /*
1179  * Locate an in-memory chain.  The parent must be locked.  The in-memory
1180  * chain is returned or NULL if no in-memory chain is present.
1181  *
1182  * NOTE: A chain on-media might exist for this index when NULL is returned.
1183  */
1184 hammer2_chain_t *
1185 hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
1186 {
1187         hammer2_chain_t dummy;
1188         hammer2_chain_t *chain;
1189
1190         dummy.index = index;
1191         chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
1192         return (chain);
1193 }
1194
1195 /*
1196  * Return a locked chain structure with all associated data acquired.
1197  *
1198  * Caller must lock the parent on call, the returned child will be locked.
1199  */
1200 hammer2_chain_t *
1201 hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1202                   int index, int flags)
1203 {
1204         hammer2_blockref_t *bref;
1205         hammer2_inode_t *ip;
1206         hammer2_chain_t *chain;
1207         hammer2_chain_t dummy;
1208         int how;
1209         ccms_state_t ostate;
1210
1211         /*
1212          * Figure out how to lock.  MAYBE can be used to optimized
1213          * the initial-create state for indirect blocks.
1214          */
1215         if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
1216                 how = HAMMER2_RESOLVE_NEVER;
1217         else
1218                 how = HAMMER2_RESOLVE_MAYBE;
1219         if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
1220                 how |= HAMMER2_RESOLVE_SHARED;
1221         if (flags & HAMMER2_LOOKUP_MAYDELETE)
1222                 how |= HAMMER2_RESOLVE_MAYDELETE;
1223
1224         /*
1225          * First see if we have a (possibly modified) chain element cached
1226          * for this (parent, index).  Acquire the data if necessary.
1227          *
1228          * If chain->data is non-NULL the chain should already be marked
1229          * modified.
1230          */
1231         dummy.index = index;
1232         chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
1233         if (chain) {
1234                 if (flags & HAMMER2_LOOKUP_NOLOCK)
1235                         hammer2_chain_ref(hmp, chain);
1236                 else
1237                         hammer2_chain_lock(hmp, chain, how);
1238                 return(chain);
1239         }
1240
1241         /*
1242          * Upgrade our thread lock and handle any race that may have
1243          * occurred.  Leave the lock upgraded for the rest of the get.
1244          * We have to do this because we will be modifying the chain
1245          * structure.
1246          */
1247         ostate = ccms_thread_lock_upgrade(&parent->cst);
1248         chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
1249         if (chain) {
1250                 if (flags & HAMMER2_LOOKUP_NOLOCK)
1251                         hammer2_chain_ref(hmp, chain);
1252                 else
1253                         hammer2_chain_lock(hmp, chain, how);
1254                 ccms_thread_lock_restore(&parent->cst, ostate);
1255                 return(chain);
1256         }
1257
1258         /*
1259          * The get function must always succeed, panic if there's no
1260          * data to index.
1261          */
1262         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1263                 ccms_thread_lock_restore(&parent->cst, ostate);
1264                 panic("hammer2_chain_get: Missing bref(1)");
1265                 /* NOT REACHED */
1266         }
1267
1268         /*
1269          * Otherwise lookup the bref and issue I/O (switch on the parent)
1270          */
1271         switch(parent->bref.type) {
1272         case HAMMER2_BREF_TYPE_INODE:
1273                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1274                 bref = &parent->data->ipdata.u.blockset.blockref[index];
1275                 break;
1276         case HAMMER2_BREF_TYPE_INDIRECT:
1277                 KKASSERT(parent->data != NULL);
1278                 KKASSERT(index >= 0 &&
1279                          index < parent->bytes / sizeof(hammer2_blockref_t));
1280                 bref = &parent->data->npdata.blockref[index];
1281                 break;
1282         case HAMMER2_BREF_TYPE_VOLUME:
1283                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1284                 bref = &hmp->voldata.sroot_blockset.blockref[index];
1285                 break;
1286         default:
1287                 bref = NULL;
1288                 panic("hammer2_chain_get: unrecognized blockref type: %d",
1289                       parent->bref.type);
1290         }
1291         if (bref->type == 0) {
1292                 panic("hammer2_chain_get: Missing bref(2)");
1293                 /* NOT REACHED */
1294         }
1295
1296         /*
1297          * Allocate a chain structure representing the existing media
1298          * entry.
1299          *
1300          * The locking operation we do later will issue I/O to read it.
1301          */
1302         chain = hammer2_chain_alloc(hmp, bref);
1303
1304         /*
1305          * Link the chain into its parent.  Caller is expected to hold an
1306          * exclusive lock on the parent.
1307          */
1308         chain->parent = parent;
1309         chain->index = index;
1310         if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
1311                 panic("hammer2_chain_link: collision");
1312         KKASSERT(parent->refs > 0);
1313         atomic_add_int(&parent->refs, 1);       /* for red-black entry */
1314         ccms_thread_lock_restore(&parent->cst, ostate);
1315
1316         /*
1317          * Additional linkage for inodes.  Reuse the parent pointer to
1318          * find the parent directory.
1319          *
1320          * The ccms_inode is initialized from its parent directory.  The
1321          * chain of ccms_inode's is seeded by the mount code.
1322          */
1323         if (bref->type == HAMMER2_BREF_TYPE_INODE) {
1324                 ip = chain->u.ip;
1325                 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1326                         parent = parent->parent;
1327                 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
1328                         ip->pip = parent->u.ip;
1329                         ip->pmp = parent->u.ip->pmp;
1330                         ccms_cst_init(&ip->topo_cst, &ip->chain);
1331                 }
1332         }
1333
1334         /*
1335          * Our new chain structure has already been referenced and locked
1336          * but the lock code handles the I/O so call it to resolve the data.
1337          * Then release one of our two exclusive locks.
1338          *
1339          * If NOLOCK is set the release will release the one-and-only lock.
1340          */
1341         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1342                 hammer2_chain_lock(hmp, chain, how);    /* recusive lock */
1343                 hammer2_chain_drop(hmp, chain);         /* excess ref */
1344         }
1345         ccms_thread_unlock(&chain->cst);                        /* from alloc */
1346
1347         return (chain);
1348 }
1349
1350 /*
1351  * Locate any key between key_beg and key_end inclusive.  (*parentp)
1352  * typically points to an inode but can also point to a related indirect
1353  * block and this function will recurse upwards and find the inode again.
1354  *
1355  * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
1356  *           WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
1357  *           WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
1358  *
1359  * (*parentp) must be exclusively locked and referenced and can be an inode
1360  * or an existing indirect block within the inode.
1361  *
1362  * On return (*parentp) will be modified to point at the deepest parent chain
1363  * element encountered during the search, as a helper for an insertion or
1364  * deletion.   The new (*parentp) will be locked and referenced and the old
1365  * will be unlocked and dereferenced (no change if they are both the same).
1366  *
1367  * The matching chain will be returned exclusively locked and referenced.
1368  *
1369  * NULL is returned if no match was found, but (*parentp) will still
1370  * potentially be adjusted.
1371  *
1372  * This function will also recurse up the chain if the key is not within the
1373  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
1374  * can simply allow (*parentp) to float inside the loop.
1375  */
hammer2_chain_t *
hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int flags)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_chain_t *tmp;
	hammer2_blockref_t *base;	/* on-media blockref array, or NULL */
	hammer2_blockref_t *bref;	/* blockref selected for the key range */
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int count = 0;
	int i;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;

	/*
	 * Propagate caller lock preferences into both resolve modes.
	 */
	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
	}
	if (flags & HAMMER2_LOOKUP_MAYDELETE) {
		how_maybe |= HAMMER2_RESOLVE_MAYDELETE;
		how_always |= HAMMER2_RESOLVE_MAYDELETE;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 *
	 * The old parent is referenced before being unlocked so it cannot
	 * be destroyed during the handoff to the new parent.
	 */
	parent = *parentp;
	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if (key_beg >= scan_beg && key_end <= scan_end)
			break;
		hammer2_chain_ref(hmp, parent);		/* ref old parent */
		hammer2_chain_unlock(hmp, parent);	/* unlock old parent */
		parent = parent->parent;
						/* lock new parent */
		hammer2_chain_lock(hmp, parent, how_maybe);
		hammer2_chain_drop(hmp, *parentp);	/* drop old parent */
		*parentp = parent;			/* new parent */
	}

again:
	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_ref(hmp, parent);
			else
				hammer2_chain_lock(hmp, parent, how_always);
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata.blockref[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_lookup: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * If the element and key overlap we use the element.
	 *
	 * An in-memory chain (if present) overrides the media blockref
	 * at the same index.
	 */
	bref = NULL;
	for (i = 0; i < count; ++i) {
		tmp = hammer2_chain_find(hmp, parent, i);
		if (tmp) {
			bref = &tmp->bref;
			KKASSERT(bref->type != 0);
		} else if (base == NULL || base[i].type == 0) {
			continue;
		} else {
			bref = &base[i];
		}
		scan_beg = bref->key;
		scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
		if (key_beg <= scan_end && key_end >= scan_beg)
			break;
	}
	if (i == count) {
		/*
		 * Nothing in this parent matched.  For a point lookup we
		 * are done; for a range lookup continue at the next parent.
		 */
		if (key_beg == key_end)
			return (NULL);
		return (hammer2_chain_next(hmp, parentp, NULL,
					   key_beg, key_end, flags));
	}

	/*
	 * Acquire the new chain element.  If the chain element is an
	 * indirect block we must search recursively.
	 */
	chain = hammer2_chain_get(hmp, parent, i, flags);
	if (chain == NULL)
		return (NULL);

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE,
	 * so it might need a fixup if the caller passed incompatible flags.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
		hammer2_chain_unlock(hmp, parent);
		*parentp = parent = chain;
		if (flags & HAMMER2_LOOKUP_NOLOCK) {
			hammer2_chain_lock(hmp, chain, how_maybe);
			hammer2_chain_drop(hmp, chain);	/* excess ref */
		} else if (flags & HAMMER2_LOOKUP_NODATA) {
			hammer2_chain_lock(hmp, chain, how_maybe);
			hammer2_chain_unlock(hmp, chain);
		}
		goto again;
	}

	/*
	 * All done, return chain
	 */
	return (chain);
}
1530
1531 /*
1532  * After having issued a lookup we can iterate all matching keys.
1533  *
 * If chain is non-NULL we continue the iteration from just after its index.
1535  *
1536  * If chain is NULL we assume the parent was exhausted and continue the
1537  * iteration at the next parent.
1538  *
1539  * parent must be locked on entry and remains locked throughout.  chain's
1540  * lock status must match flags.
1541  */
hammer2_chain_t *
hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
		   hammer2_chain_t *chain,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int flags)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *tmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int i;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int count;

	/*
	 * Calculate the locking mode used when (re)locking parents and
	 * indirect blocks.  SHARED and NOLOCK iterations use shared locks;
	 * MAYDELETE is propagated so deletions remain visible.
	 */
	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
		how_maybe |= HAMMER2_RESOLVE_SHARED;
	if (flags & HAMMER2_LOOKUP_MAYDELETE)
		how_maybe |= HAMMER2_RESOLVE_MAYDELETE;

	parent = *parentp;

again:
	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		/*
		 * Continue iteration within current parent.  If not NULL
		 * the passed-in chain may or may not be locked, based on
		 * the LOOKUP_NOLOCK flag (passed in as returned from lookup
		 * or a prior next).
		 *
		 * NOLOCK iterations hold only a ref on chain, otherwise we
		 * hold a full lock; release whichever we were given.
		 */
		i = chain->index + 1;
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_drop(hmp, chain);
		else
			hammer2_chain_unlock(hmp, chain);

		/*
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 */
		hammer2_chain_t *nparent;

		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			    ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if (key_beg >= scan_beg && key_end <= scan_end)
			return (NULL);

		/*
		 * Pop up one level.  Ref the new parent before unlocking
		 * the old one so it cannot be torn down in the window
		 * between the unlock and the relock, then drop the
		 * temporary ref once the lock (which itself refs) is held.
		 */
		i = parent->index + 1;
		nparent = parent->parent;
		hammer2_chain_ref(hmp, nparent);	/* ref new parent */
		hammer2_chain_unlock(hmp, parent);	/* unlock old parent */
						  	/* lock new parent */
		hammer2_chain_lock(hmp, nparent, how_maybe);
		hammer2_chain_drop(hmp, nparent);	/* drop excess ref */
		*parentp = parent = nparent;
	}

again2:
	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * An INITIAL indirect block has no media backing yet; all
		 * of its children live solely in the in-memory RB tree so
		 * base is left NULL.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			KKASSERT(parent->data != NULL);
			base = &parent->data->npdata.blockref[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_next: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}
	KKASSERT(i <= count);

	/*
	 * Look for the key.  If we are unable to find a match and an exact
	 * match was requested we return NULL.  If a range was requested we
	 * run hammer2_chain_next() to iterate.
	 *
	 * In-memory chains (hammer2_chain_find) override the media bref at
	 * the same index since they may be more up-to-date.
	 */
	bref = NULL;
	while (i < count) {
		tmp = hammer2_chain_find(hmp, parent, i);
		if (tmp) {
			bref = &tmp->bref;
		} else if (base == NULL || base[i].type == 0) {
			++i;
			continue;
		} else {
			bref = &base[i];
		}
		scan_beg = bref->key;
		scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
		if (key_beg <= scan_end && key_end >= scan_beg)
			break;
		++i;
	}

	/*
	 * If we couldn't find a match recurse up a parent to continue the
	 * search.
	 */
	if (i == count)
		goto again;

	/*
	 * Acquire the new chain element.  If the chain element is an
	 * indirect block we must search recursively.
	 */
	chain = hammer2_chain_get(hmp, parent, i, flags);
	if (chain == NULL)
		return (NULL);

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE,
	 * so it might need a fixup if the caller passed incompatible flags.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
		hammer2_chain_unlock(hmp, parent);
		*parentp = parent = chain;
		chain = NULL;
		if (flags & HAMMER2_LOOKUP_NOLOCK) {
			hammer2_chain_lock(hmp, parent, how_maybe);
			hammer2_chain_drop(hmp, parent);	/* excess ref */
		} else if (flags & HAMMER2_LOOKUP_NODATA) {
			hammer2_chain_lock(hmp, parent, how_maybe);
			hammer2_chain_unlock(hmp, parent);
		}
		i = 0;
		goto again2;
	}

	/*
	 * All done, return chain
	 */
	return (chain);
}
1714
1715 /*
1716  * Create and return a new hammer2 system memory structure of the specified
1717  * key, type and size and insert it RELATIVE TO (PARENT).
1718  *
 * (parent) is typically either an inode or an indirect block, acquired
 * as a side effect of issuing a prior failed lookup.  parent
1721  * must be locked and held.  Do not pass the inode chain to this function
1722  * unless that is the chain returned by the failed lookup.
1723  *
1724  * Non-indirect types will automatically allocate indirect blocks as required
1725  * if the new item does not fit in the current (parent).
1726  *
1727  * Indirect types will move a portion of the existing blockref array in
1728  * (parent) into the new indirect type and then use one of the free slots
1729  * to emplace the new indirect type.
1730  *
1731  * A new locked, referenced chain element is returned of the specified type.
1732  * The element may or may not have a data area associated with it:
1733  *
1734  *      VOLUME          not allowed here
 *      INODE           embedded data area will be set-up
1736  *      INDIRECT        not allowed here
1737  *      DATA            no data area will be set-up (caller is expected
1738  *                      to have logical buffers, we don't want to alias
1739  *                      the data onto device buffers!).
1740  *
1741  * Requires an exclusively locked parent.
1742  */
1743 hammer2_chain_t *
1744 hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1745                      hammer2_chain_t *chain,
1746                      hammer2_key_t key, int keybits, int type, size_t bytes)
1747 {
1748         hammer2_blockref_t dummy;
1749         hammer2_blockref_t *base;
1750         hammer2_chain_t dummy_chain;
1751         int unlock_parent = 0;
1752         int allocated = 0;
1753         int count;
1754         int i;
1755
1756         KKASSERT(ccms_thread_lock_owned(&parent->cst));
1757
1758         if (chain == NULL) {
1759                 /*
1760                  * First allocate media space and construct the dummy bref,
1761                  * then allocate the in-memory chain structure.
1762                  */
1763                 bzero(&dummy, sizeof(dummy));
1764                 dummy.type = type;
1765                 dummy.key = key;
1766                 dummy.keybits = keybits;
1767                 dummy.data_off = hammer2_bytes_to_radix(bytes);
1768                 chain = hammer2_chain_alloc(hmp, &dummy);
1769                 allocated = 1;
1770
1771                 /*
1772                  * We do NOT set INITIAL here (yet).  INITIAL is only
1773                  * used for indirect blocks.
1774                  *
1775                  * Recalculate bytes to reflect the actual media block
1776                  * allocation.
1777                  */
1778                 bytes = (hammer2_off_t)1 <<
1779                         (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
1780                 chain->bytes = bytes;
1781
1782                 switch(type) {
1783                 case HAMMER2_BREF_TYPE_VOLUME:
1784                         panic("hammer2_chain_create: called with volume type");
1785                         break;
1786                 case HAMMER2_BREF_TYPE_INODE:
1787                         KKASSERT(bytes == HAMMER2_INODE_BYTES);
1788                         chain->data = (void *)&chain->u.ip->ip_data;
1789                         break;
1790                 case HAMMER2_BREF_TYPE_INDIRECT:
1791                         panic("hammer2_chain_create: cannot be used to"
1792                               "create indirect block");
1793                         break;
1794                 case HAMMER2_BREF_TYPE_DATA:
1795                 default:
1796                         /* leave chain->data NULL */
1797                         KKASSERT(chain->data == NULL);
1798                         break;
1799                 }
1800         } else {
1801                 /*
1802                  * Potentially update the chain's key/keybits.
1803                  */
1804                 chain->bref.key = key;
1805                 chain->bref.keybits = keybits;
1806         }
1807
1808 again:
1809         /*
1810          * Locate a free blockref in the parent's array
1811          */
1812         switch(parent->bref.type) {
1813         case HAMMER2_BREF_TYPE_INODE:
1814                 KKASSERT((parent->u.ip->ip_data.op_flags &
1815                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
1816                 KKASSERT(parent->data != NULL);
1817                 base = &parent->data->ipdata.u.blockset.blockref[0];
1818                 count = HAMMER2_SET_COUNT;
1819                 break;
1820         case HAMMER2_BREF_TYPE_INDIRECT:
1821                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1822                         base = NULL;
1823                 } else {
1824                         KKASSERT(parent->data != NULL);
1825                         base = &parent->data->npdata.blockref[0];
1826                 }
1827                 count = parent->bytes / sizeof(hammer2_blockref_t);
1828                 break;
1829         case HAMMER2_BREF_TYPE_VOLUME:
1830                 KKASSERT(parent->data != NULL);
1831                 base = &hmp->voldata.sroot_blockset.blockref[0];
1832                 count = HAMMER2_SET_COUNT;
1833                 break;
1834         default:
1835                 panic("hammer2_chain_create: unrecognized blockref type: %d",
1836                       parent->bref.type);
1837                 count = 0;
1838                 break;
1839         }
1840
1841         /*
1842          * Scan for an unallocated bref, also skipping any slots occupied
1843          * by in-memory chain elements that may not yet have been updated
1844          * in the parent's bref array.
1845          */
1846         bzero(&dummy_chain, sizeof(dummy_chain));
1847         for (i = 0; i < count; ++i) {
1848                 if (base == NULL) {
1849                         dummy_chain.index = i;
1850                         if (RB_FIND(hammer2_chain_tree,
1851                                     &parent->rbhead, &dummy_chain) == NULL) {
1852                                 break;
1853                         }
1854                 } else if (base[i].type == 0) {
1855                         dummy_chain.index = i;
1856                         if (RB_FIND(hammer2_chain_tree,
1857                                     &parent->rbhead, &dummy_chain) == NULL) {
1858                                 break;
1859                         }
1860                 }
1861         }
1862
1863         /*
1864          * If no free blockref could be found we must create an indirect
1865          * block and move a number of blockrefs into it.  With the parent
1866          * locked we can safely lock each child in order to move it without
1867          * causing a deadlock.
1868          *
1869          * This may return the new indirect block or the old parent depending
1870          * on where the key falls.
1871          */
1872         if (i == count) {
1873                 hammer2_chain_t *nparent;
1874
1875                 nparent = hammer2_chain_create_indirect(hmp, parent,
1876                                                         key, keybits);
1877                 if (nparent == NULL) {
1878                         if (allocated)
1879                                 hammer2_chain_free(hmp, chain);
1880                         chain = NULL;
1881                         goto done;
1882                 }
1883                 if (parent != nparent) {
1884                         if (unlock_parent)
1885                                 hammer2_chain_unlock(hmp, parent);
1886                         parent = nparent;
1887                         unlock_parent = 1;
1888                 }
1889                 goto again;
1890         }
1891
1892         /*
1893          * Link the chain into its parent.  Later on we will have to set
1894          * the MOVED bit in situations where we don't mark the new chain
1895          * as being modified.
1896          */
1897         if (chain->parent != NULL)
1898                 panic("hammer2: hammer2_chain_create: chain already connected");
1899         KKASSERT(chain->parent == NULL);
1900         chain->parent = parent;
1901         chain->index = i;
1902         if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
1903                 panic("hammer2_chain_link: collision");
1904         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
1905         KKASSERT(parent->refs > 0);
1906         atomic_add_int(&parent->refs, 1);
1907
1908         /*
1909          * Additional linkage for inodes.  Reuse the parent pointer to
1910          * find the parent directory.
1911          *
1912          * Cumulative adjustments are inherited on [re]attach and will
1913          * propagate up the tree on the next flush.
1914          *
1915          * The ccms_inode is initialized from its parent directory.  The
1916          * chain of ccms_inode's is seeded by the mount code.
1917          */
1918         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1919                 hammer2_chain_t *scan = parent;
1920                 hammer2_inode_t *ip = chain->u.ip;
1921
1922                 while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1923                         scan = scan->parent;
1924                 if (scan->bref.type == HAMMER2_BREF_TYPE_INODE) {
1925                         ip->pip = scan->u.ip;
1926                         ip->pmp = scan->u.ip->pmp;
1927                         ip->pip->delta_icount += ip->ip_data.inode_count;
1928                         ip->pip->delta_dcount += ip->ip_data.data_count;
1929                         ++ip->pip->delta_icount;
1930                         ccms_cst_init(&ip->topo_cst, &ip->chain);
1931                 }
1932         }
1933
1934         /*
1935          * (allocated) indicates that this is a newly-created chain element
1936          * rather than a renamed chain element.  In this situation we want
1937          * to place the chain element in the MODIFIED state.
1938          *
1939          * The data area will be set up as follows:
1940          *
1941          *      VOLUME          not allowed here.
1942          *
1943          *      INODE           embedded data are will be set-up.
1944          *
1945          *      INDIRECT        not allowed here.
1946          *
1947          *      DATA            no data area will be set-up (caller is expected
1948          *                      to have logical buffers, we don't want to alias
1949          *                      the data onto device buffers!).
1950          */
1951         if (allocated) {
1952                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1953                         hammer2_chain_modify(hmp, chain,
1954                                              HAMMER2_MODIFY_OPTDATA);
1955                 } else if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1956                         /* not supported in this function */
1957                         panic("hammer2_chain_create: bad type");
1958                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1959                         hammer2_chain_modify(hmp, chain,
1960                                              HAMMER2_MODIFY_OPTDATA);
1961                 } else {
1962                         hammer2_chain_modify(hmp, chain, 0);
1963                 }
1964         } else {
1965                 /*
1966                  * When reconnecting inodes we have to call setsubmod()
1967                  * to ensure that its state propagates up the newly
1968                  * connected parent.
1969                  *
1970                  * Make sure MOVED is set but do not update bref_flush.  If
1971                  * the chain is undergoing modification bref_flush will be
1972                  * updated when it gets flushed.  If it is not then the
1973                  * bref may not have been flushed yet and we do not want to
1974                  * set MODIFIED here as this could result in unnecessary
1975                  * reallocations.
1976                  */
1977                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1978                         hammer2_chain_ref(hmp, chain);
1979                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1980                 }
1981                 hammer2_chain_parent_setsubmod(hmp, chain);
1982         }
1983
1984 done:
1985         if (unlock_parent)
1986                 hammer2_chain_unlock(hmp, parent);
1987         return (chain);
1988 }
1989
1990 /*
1991  * Create an indirect block that covers one or more of the elements in the
1992  * current parent.  Either returns the existing parent with no locking or
1993  * ref changes or returns the new indirect block locked and referenced
1994  * and leaving the original parent lock/ref intact as well.
1995  *
1996  * The returned chain depends on where the specified key falls.
1997  *
 * The key/keybits for the indirect mode only needs to follow four rules:
1999  *
2000  * (1) That all elements underneath it fit within its key space and
2001  *
2002  * (2) That all elements outside it are outside its key space.
2003  *
2004  * (3) When creating the new indirect block any elements in the current
2005  *     parent that fit within the new indirect block's keyspace must be
2006  *     moved into the new indirect block.
2007  *
2008  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
2010  *     ensure (and must ensure) that rule (2) for all parents leading up
2011  *     to the nearest inode or the root volume header is adhered to.  This
2012  *     is accomplished by always recursing through matching keyspaces in
2013  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
2014  *
2015  * The current implementation calculates the current worst-case keyspace by
2016  * iterating the current parent and then divides it into two halves, choosing
2017  * whichever half has the most elements (not necessarily the half containing
2018  * the requested key).
2019  *
2020  * We can also opt to use the half with the least number of elements.  This
2021  * causes lower-numbered keys (aka logical file offsets) to recurse through
2022  * fewer indirect blocks and higher-numbered keys to recurse through more.
2023  * This also has the risk of not moving enough elements to the new indirect
2024  * block and being forced to create several indirect blocks before the element
2025  * can be inserted.
2026  *
2027  * Must be called with an exclusively locked parent
2028  */
2029 static
2030 hammer2_chain_t *
2031 hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
2032                               hammer2_key_t create_key, int create_bits)
2033 {
2034         hammer2_blockref_t *base;
2035         hammer2_blockref_t *bref;
2036         hammer2_chain_t *chain;
2037         hammer2_chain_t *ichain;
2038         hammer2_chain_t dummy;
2039         hammer2_key_t key = create_key;
2040         int keybits = create_bits;
2041         int locount = 0;
2042         int hicount = 0;
2043         int count;
2044         int nbytes;
2045         int i;
2046
2047         /*
2048          * Calculate the base blockref pointer or NULL if the chain
2049          * is known to be empty.  We need to calculate the array count
2050          * for RB lookups either way.
2051          */
2052         KKASSERT(ccms_thread_lock_owned(&parent->cst));
2053
2054         hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
2055         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2056                 base = NULL;
2057
2058                 switch(parent->bref.type) {
2059                 case HAMMER2_BREF_TYPE_INODE:
2060                         count = HAMMER2_SET_COUNT;
2061                         break;
2062                 case HAMMER2_BREF_TYPE_INDIRECT:
2063                         count = parent->bytes / sizeof(hammer2_blockref_t);
2064                         break;
2065                 case HAMMER2_BREF_TYPE_VOLUME:
2066                         count = HAMMER2_SET_COUNT;
2067                         break;
2068                 default:
2069                         panic("hammer2_chain_create_indirect: "
2070                               "unrecognized blockref type: %d",
2071                               parent->bref.type);
2072                         count = 0;
2073                         break;
2074                 }
2075         } else {
2076                 switch(parent->bref.type) {
2077                 case HAMMER2_BREF_TYPE_INODE:
2078                         base = &parent->data->ipdata.u.blockset.blockref[0];
2079                         count = HAMMER2_SET_COUNT;
2080                         break;
2081                 case HAMMER2_BREF_TYPE_INDIRECT:
2082                         base = &parent->data->npdata.blockref[0];
2083                         count = parent->bytes / sizeof(hammer2_blockref_t);
2084                         break;
2085                 case HAMMER2_BREF_TYPE_VOLUME:
2086                         base = &hmp->voldata.sroot_blockset.blockref[0];
2087                         count = HAMMER2_SET_COUNT;
2088                         break;
2089                 default:
2090                         panic("hammer2_chain_create_indirect: "
2091                               "unrecognized blockref type: %d",
2092                               parent->bref.type);
2093                         count = 0;
2094                         break;
2095                 }
2096         }
2097
2098         /*
2099          * Scan for an unallocated bref, also skipping any slots occupied
2100          * by in-memory chain elements which may not yet have been updated
2101          * in the parent's bref array.
2102          */
2103         bzero(&dummy, sizeof(dummy));
2104         for (i = 0; i < count; ++i) {
2105                 int nkeybits;
2106
2107                 dummy.index = i;
2108                 chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
2109                 if (chain) {
2110                         bref = &chain->bref;
2111                 } else if (base && base[i].type) {
2112                         bref = &base[i];
2113                 } else {
2114                         continue;
2115                 }
2116
2117                 /*
2118                  * Expand our calculated key range (key, keybits) to fit
2119                  * the scanned key.  nkeybits represents the full range
2120                  * that we will later cut in half (two halves @ nkeybits - 1).
2121                  */
2122                 nkeybits = keybits;
2123                 if (nkeybits < bref->keybits)
2124                         nkeybits = bref->keybits;
2125                 while (nkeybits < 64 &&
2126                        (~(((hammer2_key_t)1 << nkeybits) - 1) &
2127                         (key ^ bref->key)) != 0) {
2128                         ++nkeybits;
2129                 }
2130
2131                 /*
2132                  * If the new key range is larger we have to determine
2133                  * which side of the new key range the existing keys fall
2134                  * under by checking the high bit, then collapsing the
2135                  * locount into the hicount or vise-versa.
2136                  */
2137                 if (keybits != nkeybits) {
2138                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
2139                                 hicount += locount;
2140                                 locount = 0;
2141                         } else {
2142                                 locount += hicount;
2143                                 hicount = 0;
2144                         }
2145                         keybits = nkeybits;
2146                 }
2147
2148                 /*
2149                  * The newly scanned key will be in the lower half or the
2150                  * higher half of the (new) key range.
2151                  */
2152                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
2153                         ++hicount;
2154                 else
2155                         ++locount;
2156         }
2157
2158         /*
2159          * Adjust keybits to represent half of the full range calculated
2160          * above (radix 63 max)
2161          */
2162         --keybits;
2163
2164         /*
2165          * Select whichever half contains the most elements.  Theoretically
2166          * we can select either side as long as it contains at least one
2167          * element (in order to ensure that a free slot is present to hold
2168          * the indirect block).
2169          */
2170         key &= ~(((hammer2_key_t)1 << keybits) - 1);
2171         if (hammer2_indirect_optimize) {
2172                 /*
2173                  * Insert node for least number of keys, this will arrange
2174                  * the first few blocks of a large file or the first few
2175                  * inodes in a directory with fewer indirect blocks when
2176                  * created linearly.
2177                  */
2178                 if (hicount < locount && hicount != 0)
2179                         key |= (hammer2_key_t)1 << keybits;
2180                 else
2181                         key &= ~(hammer2_key_t)1 << keybits;
2182         } else {
2183                 /*
2184                  * Insert node for most number of keys, best for heavily
2185                  * fragmented files.
2186                  */
2187                 if (hicount > locount)
2188                         key |= (hammer2_key_t)1 << keybits;
2189                 else
2190                         key &= ~(hammer2_key_t)1 << keybits;
2191         }
2192
2193         /*
2194          * How big should our new indirect block be?  It has to be at least
2195          * as large as its parent.
2196          */
2197         if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
2198                 nbytes = HAMMER2_IND_BYTES_MIN;
2199         else
2200                 nbytes = HAMMER2_IND_BYTES_MAX;
2201         if (nbytes < count * sizeof(hammer2_blockref_t))
2202                 nbytes = count * sizeof(hammer2_blockref_t);
2203
2204         /*
2205          * Ok, create our new indirect block
2206          */
2207         dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
2208         dummy.bref.key = key;
2209         dummy.bref.keybits = keybits;
2210         dummy.bref.data_off = hammer2_bytes_to_radix(nbytes);
2211         ichain = hammer2_chain_alloc(hmp, &dummy.bref);
2212         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
2213
2214         /*
2215          * Iterate the original parent and move the matching brefs into
2216          * the new indirect block.
2217          */
2218         for (i = 0; i < count; ++i) {
2219                 /*
2220                  * For keying purposes access the bref from the media or
2221                  * from our in-memory cache.  In cases where the in-memory
2222                  * cache overrides the media the keyrefs will be the same
2223                  * anyway so we can avoid checking the cache when the media
2224                  * has a key.
2225                  */
2226                 dummy.index = i;
2227                 chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
2228                 if (chain) {
2229                         bref = &chain->bref;
2230                 } else if (base && base[i].type) {
2231                         bref = &base[i];
2232                 } else {
2233                         if (ichain->index < 0)
2234                                 ichain->index = i;
2235                         continue;
2236                 }
2237
2238                 /*
2239                  * Skip keys not in the chosen half (low or high), only bit
2240                  * (keybits - 1) needs to be compared but for safety we
2241                  * will compare all msb bits plus that bit again.
2242                  */
2243                 if ((~(((hammer2_key_t)1 << keybits) - 1) &
2244                     (key ^ bref->key)) != 0) {
2245                         continue;
2246                 }
2247
2248                 /*
2249                  * This element is being moved from the parent, its slot
2250                  * is available for our new indirect block.
2251                  */
2252                 if (ichain->index < 0)
2253                         ichain->index = i;
2254
2255                 /*
2256                  * Load the new indirect block by acquiring or allocating
2257                  * the related chain entries, then simply move them to the
2258                  * new parent (ichain).
2259                  *
2260                  * When adjusting the parent/child relationship we must
2261                  * set the MOVED bit but we do NOT update bref_flush
2262                  * because otherwise we might synchronize a bref that has
2263                  * not yet been flushed.  We depend on chain's bref_flush
2264                  * either being correct or the chain being in a MODIFIED
2265                  * state.
2266                  *
2267                  * We do not want to set MODIFIED here as this would result
2268                  * in unnecessary reallocations.
2269                  *
2270                  * We must still set SUBMODIFIED in the parent but we do
2271                  * that after the loop.
2272                  *
2273                  * WARNING! chain->cst.spin must be held when chain->parent is
2274                  *          modified, even though we own the full blown lock,
2275                  *          to deal with setsubmod and rename races.
2276                  *
2277                  * XXX we really need a lock here but we don't need the
2278                  *     data.  NODATA feature needed.
2279                  */
2280                 chain = hammer2_chain_get(hmp, parent, i,
2281                                           HAMMER2_LOOKUP_NODATA);
2282                 spin_lock(&chain->cst.spin);
2283                 RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
2284                 if (RB_INSERT(hammer2_chain_tree, &ichain->rbhead, chain))
2285                         panic("hammer2_chain_create_indirect: collision");
2286                 chain->parent = ichain;
2287                 spin_unlock(&chain->cst.spin);
2288                 if (base)
2289                         bzero(&base[i], sizeof(base[i]));
2290                 atomic_add_int(&parent->refs, -1);
2291                 atomic_add_int(&ichain->refs, 1);
2292                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2293                         hammer2_chain_ref(hmp, chain);
2294                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2295                 }
2296                 hammer2_chain_unlock(hmp, chain);
2297                 KKASSERT(parent->refs > 0);
2298                 chain = NULL;
2299         }
2300
2301         /*
2302          * Insert the new indirect block into the parent now that we've
2303          * cleared out some entries in the parent.  We calculated a good
2304          * insertion index in the loop above (ichain->index).
2305          *
2306          * We don't have to set MOVED here because we mark ichain modified
2307          * down below (so the normal modified -> flush -> set-moved sequence
2308          * applies).
2309          */
2310         KKASSERT(ichain->index >= 0);
2311         if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, ichain))
2312                 panic("hammer2_chain_create_indirect: ichain insertion");
2313         ichain->parent = parent;
2314         atomic_add_int(&parent->refs, 1);
2315
2316         /*
2317          * Mark the new indirect block modified after insertion, which
2318          * will propagate up through parent all the way to the root and
2319          * also allocate the physical block in ichain for our caller,
2320          * and assign ichain->data to a pre-zero'd space (because there
2321          * is not prior data to copy into it).
2322          *
2323          * We have to set SUBMODIFIED in ichain's flags manually so the
2324          * flusher knows it has to recurse through it to get to all of
2325          * our moved blocks, then call setsubmod() to set the bit
2326          * recursively.
2327          */
2328         hammer2_chain_modify(hmp, ichain, HAMMER2_MODIFY_OPTDATA);
2329         hammer2_chain_parent_setsubmod(hmp, ichain);
2330         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2331
2332         /*
2333          * Figure out what to return.
2334          */
2335         if (create_bits > keybits) {
2336                 /*
2337                  * Key being created is way outside the key range,
2338                  * return the original parent.
2339                  */
2340                 hammer2_chain_unlock(hmp, ichain);
2341         } else if (~(((hammer2_key_t)1 << keybits) - 1) &
2342                    (create_key ^ key)) {
2343                 /*
2344                  * Key being created is outside the key range,
2345                  * return the original parent.
2346                  */
2347                 hammer2_chain_unlock(hmp, ichain);
2348         } else {
2349                 /*
         * Otherwise it's in the range, return the new parent.
2351                  * (leave both the new and old parent locked).
2352                  */
2353                 parent = ichain;
2354         }
2355
2356         return(parent);
2357 }
2358
2359 /*
2360  * Physically delete the specified chain element.  Note that inodes with
2361  * open descriptors should not be deleted (as with other filesystems) until
2362  * the last open descriptor is closed.
2363  *
2364  * This routine will remove the chain element from its parent and potentially
2365  * also recurse upward and delete indirect blocks which become empty as a
2366  * side effect.
2367  *
2368  * The caller must pass a pointer to the chain's parent, also locked and
2369  * referenced.  (*parentp) will be modified in a manner similar to a lookup
2370  * or iteration when indirect blocks are also deleted as a side effect.
2371  *
2372  * The caller must ensure that the chain is locked with the MAYDELETE
2373  * flag to interlock chain->movelock.
2374  *
2375  * XXX This currently does not adhere to the MOVED flag protocol in that
2376  *     the removal is immediately indicated in the parent's blockref[]
2377  *     array.
2378  *
2379  * Must be called with an exclusively locked parent and chain.
2380  */
void
hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                     hammer2_chain_t *chain, int retain)
{
        hammer2_blockref_t *base;       /* parent's blockref array (NULL if INITIAL) */
        hammer2_inode_t *ip;
        int count;                      /* number of blockref slots in base[] */

        /*
         * NOTE: Caller is responsible for using MAYDELETE flags when
         *       acquiring chain elements that it desires to delete.
         *       This flag should interlock the movelock flag.  Parent
         *       chain must remain locked (the flusher can set movelock
         *       on children otherwise).
         */
        if (chain->parent != parent)
                panic("hammer2_chain_delete: parent mismatch");
        KKASSERT(ccms_thread_lock_owned(&parent->cst));
        KKASSERT(chain->movelock == 0);

        /*
         * Mark the parent modified so our base[] pointer remains valid
         * while we move entries.  For the optimized indirect block
         * case mark the parent moved instead.
         *
         * Calculate the blockref reference in the parent
         */
        switch(parent->bref.type) {
        case HAMMER2_BREF_TYPE_INODE:
                hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
                base = &parent->data->ipdata.u.blockset.blockref[0];
                count = HAMMER2_SET_COUNT;
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
                hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA |
                                                  HAMMER2_MODIFY_NO_MODIFY_TID);
                /*
                 * An INITIAL indirect block has no instantiated media
                 * data yet, so there is no bref slot to clear below
                 * (base stays NULL).
                 */
                if (parent->flags & HAMMER2_CHAIN_INITIAL)
                        base = NULL;
                else
                        base = &parent->data->npdata.blockref[0];
                count = parent->bytes / sizeof(hammer2_blockref_t);
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
                hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
                base = &hmp->voldata.sroot_blockset.blockref[0];
                count = HAMMER2_SET_COUNT;
                break;
        default:
                panic("hammer2_chain_delete: unrecognized blockref type: %d",
                      parent->bref.type);
                count = 0;      /* NOT REACHED; quiets uninitialized-use warning */
                break;
        }

        /*
         * Disconnect the bref in the parent, remove the chain, and
         * disconnect in-memory fields from the parent.
         *
         * WARNING! chain->cst.spin must be held when chain->parent is
         *          modified, even though we own the full blown lock,
         *          to deal with setsubmod and rename races.
         */
        KKASSERT(chain->index >= 0 && chain->index < count);
        if (base)
                bzero(&base[chain->index], sizeof(*base));

        spin_lock(&chain->cst.spin);
        RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
        atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
        atomic_add_int(&parent->refs, -1);      /* for red-black entry */
        chain->index = -1;
        chain->parent = NULL;
        spin_unlock(&chain->cst.spin);

        /*
         * Cumulative adjustments must be propagated to the parent inode
         * when deleting and synchronized to ip.
         *
         * NOTE:  We do not propagate ip->delta_*count to the parent because
         *        these represent adjustments that have not yet been
         *        propagated upward, so we don't need to remove them from
         *        the parent.
         *
         * Clear the pointer to the parent inode.
         */
        if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
                ip = chain->u.ip;
                if (ip->pip) {
                        ip->pip->delta_icount -= ip->ip_data.inode_count;
                        ip->pip->delta_dcount -= ip->ip_data.data_count;
                        ip->ip_data.inode_count += ip->delta_icount;
                        ip->ip_data.data_count += ip->delta_dcount;
                        ip->delta_icount = 0;
                        ip->delta_dcount = 0;
                        /* account for the removal of this inode itself */
                        --ip->pip->delta_icount;
                        /*
                         * NOTE(review): chain->cst.spin is reused here to
                         * interlock the ip->pip update -- confirm this is
                         * the intended lock for pip.
                         */
                        spin_lock(&chain->cst.spin); /* XXX */
                        ip->pip = NULL;
                        spin_unlock(&chain->cst.spin);
                }
        }

        /*
         * If retain is 0 the deletion is permanent.  Because the chain is
         * no longer connected to the topology a flush will have no
         * visibility into it.  We must dispose of the references related
         * to the MODIFIED and MOVED flags, otherwise the ref count will
         * never transition to 0.
         *
         * If retain is non-zero the deleted element is likely an inode
         * which the vnops frontend will mark DESTROYED and flush.  In that
         * situation we must retain the flags for any open file descriptors
         * on the (removed) inode.  The final close will destroy the
         * disconnected chain.
         */
        if (retain == 0) {
                if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                        hammer2_chain_drop(hmp, chain);
                }
                if (chain->flags & HAMMER2_CHAIN_MOVED) {
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
                        hammer2_chain_drop(hmp, chain);
                }
        }

        /*
         * The chain is still likely referenced, possibly even by a vnode
         * (if an inode), so defer further action until the chain gets
         * dropped.
         */
}
2512
2513 /*
2514  * Recursively flush the specified chain.  The chain is locked and
2515  * referenced by the caller and will remain so on return.  The chain
2516  * will remain referenced throughout but can temporarily lose its
2517  * lock during the recursion to avoid unnecessarily stalling user
2518  * processes.
2519  */
/*
 * Per-flush state threaded through the recursive flush pass.
 */
struct hammer2_flush_info {
        struct flush_deferral_list flush_list;  /* chains deferred when the
                                                   recursion depth limit is
                                                   hit; retried by the
                                                   controller */
        int             depth;                  /* current recursion depth,
                                                   compared against
                                                   HAMMER2_FLUSH_DEPTH_LIMIT */
        hammer2_tid_t   modify_tid;             /* transaction id assigned to
                                                   bref.mirror_tid (and
                                                   modify_tid when MODIFY_TID
                                                   is set) on flush */
};

typedef struct hammer2_flush_info hammer2_flush_info_t;
2527
2528 static void
2529 hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
2530                           hammer2_flush_info_t *info)
2531 {
2532         hammer2_blockref_t *bref;
2533         hammer2_off_t pbase;
2534         size_t bbytes;
2535         size_t boff;
2536         char *bdata;
2537         struct buf *bp;
2538         int error;
2539         int wasmodified;
2540
2541         /*
2542          * If we hit the stack recursion depth limit defer the operation.
2543          * The controller of the info structure will execute the deferral
2544          * list and then retry.
2545          *
2546          * This is only applicable if SUBMODIFIED is set.  After a reflush
2547          * SUBMODIFIED will probably be cleared and we want to drop through
2548          * to finish processing the current element so our direct parent
2549          * can process the results.
2550          */
2551         if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT &&
2552             (chain->flags & HAMMER2_CHAIN_SUBMODIFIED)) {
2553                 if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
2554                         hammer2_chain_ref(hmp, chain);
2555                         TAILQ_INSERT_TAIL(&info->flush_list,
2556                                           chain, flush_node);
2557                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
2558                 }
2559                 return;
2560         }
2561
2562         if (hammer2_debug & 0x0008)
2563                 kprintf("%*.*sCHAIN type=%d@%08jx %p/%d %04x {\n",
2564                         info->depth, info->depth, "",
2565                         chain->bref.type, chain->bref.data_off,
2566                         chain, chain->refs, chain->flags);
2567
2568         /*
2569          * If SUBMODIFIED is set we recurse the flush and adjust the
2570          * blockrefs accordingly.
2571          *
2572          * NOTE: Looping on SUBMODIFIED can prevent a flush from ever
2573          *       finishing in the face of filesystem activity.
2574          */
2575         if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
2576                 hammer2_chain_t *child;
2577                 hammer2_chain_t *saved;
2578                 hammer2_blockref_t *base;
2579                 int count;
2580
2581                 /*
2582                  * Clear SUBMODIFIED to catch races.  Note that if any
2583                  * child has to be flushed SUBMODIFIED will wind up being
2584                  * set again (for next time), but this does not stop us from
2585                  * synchronizing block updates which occurred.
2586                  *
2587                  * We don't want to set our chain to MODIFIED gratuitously.
2588                  *
2589                  * We need an extra ref on chain because we are going to
2590                  * release its lock temporarily in our child loop.
2591                  */
2592                 /* XXX SUBMODIFIED not interlocked, can race */
2593                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2594                 hammer2_chain_ref(hmp, chain);
2595
2596                 /*
2597                  * Flush the children and update the blockrefs in the chain.
2598                  * Be careful of ripouts during the loop.
2599                  */
2600                 saved = NULL;
2601                 RB_FOREACH(child, hammer2_chain_tree, &chain->rbhead) {
2602                         KKASSERT(child->parent == chain);
2603
2604                         if (saved) {
2605                                 hammer2_chain_moveunlock(hmp, saved);
2606                                 hammer2_chain_drop(hmp, saved);
2607                                 saved = NULL;
2608                         }
2609
2610                         /*
2611                          * We only recurse if SUBMODIFIED (internal node)
2612                          * or MODIFIED (internal node or leaf) is set.
2613                          * However, we must still track whether any MOVED
2614                          * entries are present to determine if the chain's
2615                          * blockref's need updating or not.
2616                          */
2617                         if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2618                                              HAMMER2_CHAIN_MODIFIED |
2619                                             HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2620                                 continue;
2621                         }
2622                         saved = child;
2623                         hammer2_chain_ref(hmp, child);
2624                         hammer2_chain_movelock(hmp, child);
2625                         hammer2_chain_unlock(hmp, chain);
2626                         hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_MAYBE);
2627                         KKASSERT(child->parent == chain);
2628                         if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2629                                              HAMMER2_CHAIN_MODIFIED |
2630                                             HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2631                                 hammer2_chain_unlock(hmp, child);
2632                                 hammer2_chain_lock(hmp, chain,
2633                                                    HAMMER2_RESOLVE_MAYBE);
2634                                 continue;
2635                         }
2636
2637                         /*
2638                          * Propagate the DESTROYED flag if found set, then
2639                          * recurse the flush.
2640                          */
2641                         if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
2642                             (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
2643                                 atomic_set_int(&child->flags,
2644                                                HAMMER2_CHAIN_DESTROYED |
2645                                                HAMMER2_CHAIN_SUBMODIFIED);
2646                         }
2647                         ++info->depth;
2648                         hammer2_chain_flush_pass1(hmp, child, info);
2649                         --info->depth;
2650                         hammer2_chain_unlock(hmp, child);
2651                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
2652                         KKASSERT(child->parent == chain);
2653                 }
2654                 if (saved) {
2655                         hammer2_chain_moveunlock(hmp, saved);
2656                         hammer2_chain_drop(hmp, saved);
2657                         /*saved = NULL; not needed */
2658                 }
2659
2660                 /*
2661                  * Now synchronize any block updates.
2662                  */
2663                 RB_FOREACH(child, hammer2_chain_tree, &chain->rbhead) {
2664                         if ((child->flags & HAMMER2_CHAIN_MOVED) == 0)
2665                                 continue;
2666                         hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_NEVER);
2667                         KKASSERT(child->parent == chain);
2668                         if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
2669                                 hammer2_chain_unlock(hmp, child);
2670                                 continue;
2671                         }
2672
2673                         hammer2_chain_modify(hmp, chain,
2674                                              HAMMER2_MODIFY_NO_MODIFY_TID);
2675
2676                         switch(chain->bref.type) {
2677                         case HAMMER2_BREF_TYPE_INODE:
2678                                 KKASSERT((chain->data->ipdata.op_flags &
2679                                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
2680                                 base = &chain->data->ipdata.u.blockset.
2681                                         blockref[0];
2682                                 count = HAMMER2_SET_COUNT;
2683                                 break;
2684                         case HAMMER2_BREF_TYPE_INDIRECT:
2685                                 base = &chain->data->npdata.blockref[0];
2686                                 count = chain->bytes /
2687                                         sizeof(hammer2_blockref_t);
2688                                 break;
2689                         case HAMMER2_BREF_TYPE_VOLUME:
2690                                 base = &hmp->voldata.sroot_blockset.blockref[0];
2691                                 count = HAMMER2_SET_COUNT;
2692                                 break;
2693                         default:
2694                                 base = NULL;
2695                                 panic("hammer2_chain_get: "
2696                                       "unrecognized blockref type: %d",
2697                                       chain->bref.type);
2698                         }
2699
2700                         KKASSERT(child->index >= 0);
2701                         base[child->index] = child->bref_flush;
2702
2703                         if (chain->bref.mirror_tid <
2704                             child->bref_flush.mirror_tid) {
2705                                 chain->bref.mirror_tid =
2706                                         child->bref_flush.mirror_tid;
2707                         }
2708
2709                         if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME &&
2710                             hmp->voldata.mirror_tid <
2711                             child->bref_flush.mirror_tid) {
2712                                 hmp->voldata.mirror_tid =
2713                                         child->bref_flush.mirror_tid;
2714                         }
2715                         atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
2716                         hammer2_chain_drop(hmp, child); /* MOVED flag */
2717                         hammer2_chain_unlock(hmp, child);
2718                 }
2719                 hammer2_chain_drop(hmp, chain);
2720         }
2721
2722         /*
         * If destroying the object we unconditionally clear the MODIFIED
2724          * and MOVED bits, and we destroy the buffer without writing it
2725          * out.
2726          *
2727          * We don't bother updating the hash/crc or the chain bref.
2728          *
2729          * NOTE: The destroy'd object's bref has already been updated.
2730          *       so we can clear MOVED without propagating mirror_tid
2731          *       or modify_tid upward.
2732          *
2733          * XXX allocations for unflushed data can be returned to the
2734          *     free pool.
2735          */
2736         if (chain->flags & HAMMER2_CHAIN_DESTROYED) {
2737                 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
2738                         if (chain->bp) {
2739                                 chain->bp->b_flags |= B_INVAL|B_RELBUF;
2740                         }
2741                         atomic_clear_int(&chain->flags,
2742                                          HAMMER2_CHAIN_MODIFIED |
2743                                          HAMMER2_CHAIN_MODIFY_TID);
2744                         hammer2_chain_drop(hmp, chain);
2745                 }
2746                 if (chain->flags & HAMMER2_CHAIN_MODIFIED_AUX) {
2747                         atomic_clear_int(&chain->flags,
2748                                          HAMMER2_CHAIN_MODIFIED_AUX);
2749                 }
2750                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2751                         atomic_clear_int(&chain->flags,
2752                                          HAMMER2_CHAIN_MOVED);
2753                         hammer2_chain_drop(hmp, chain);
2754                 }
2755                 return;
2756         }
2757
2758         /*
2759          * Flush this chain entry only if it is marked modified.
2760          */
2761         if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
2762                              HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2763                 goto done;
2764         }
2765
2766         /*
2767          * Synchronize cumulative data and inode count adjustments to
2768          * the inode and propagate the deltas upward to the parent.
2769          */
2770         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2771                 hammer2_inode_t *ip;
2772
2773                 ip = chain->u.ip;
2774                 ip->ip_data.inode_count += ip->delta_icount;
2775                 ip->ip_data.data_count += ip->delta_dcount;
2776                 if (ip->pip) {
2777                         ip->pip->delta_icount += ip->delta_icount;
2778                         ip->pip->delta_dcount += ip->delta_dcount;
2779                 }
2780                 ip->delta_icount = 0;
2781                 ip->delta_dcount = 0;
2782         }
2783
2784         /*
2785          * Flush if MODIFIED or MODIFIED_AUX is set.  MODIFIED_AUX is only
2786          * used by the volume header (&hmp->vchain).
2787          */
2788         if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
2789                              HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2790                 goto done;
2791         }
2792         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED_AUX);
2793
2794         /*
2795          * Clear MODIFIED and set HAMMER2_CHAIN_MOVED.  The caller
2796          * will re-test the MOVED bit.  We must also update the mirror_tid
2797          * and modify_tid fields as appropriate.
2798          *
2799          * bits own a single chain ref and the MOVED bit owns its own
2800          * chain ref.
2801          */
2802         chain->bref.mirror_tid = info->modify_tid;
2803         if (chain->flags & HAMMER2_CHAIN_MODIFY_TID)
2804                 chain->bref.modify_tid = info->modify_tid;
2805         wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0;
2806         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
2807                                         HAMMER2_CHAIN_MODIFY_TID);
2808
2809         if (chain->flags & HAMMER2_CHAIN_MOVED) {
2810                 /*
2811                  * Drop the ref from the MODIFIED bit we cleared.
2812                  */
2813                 if (wasmodified)
2814                         hammer2_chain_drop(hmp, chain);
2815         } else {
2816                 /*
2817                  * If we were MODIFIED we inherit the ref from clearing
2818                  * that bit, otherwise we need another ref.
2819                  */
2820                 if (wasmodified == 0)
2821                         hammer2_chain_ref(hmp, chain);
2822                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2823         }
2824         chain->bref_flush = chain->bref;
2825
2826         /*
2827          * If this is part of a recursive flush we can go ahead and write
2828          * out the buffer cache buffer and pass a new bref back up the chain.
2829          *
2830          * This will never be a volume header.
2831          */
2832         switch(chain->bref.type) {
2833         case HAMMER2_BREF_TYPE_VOLUME:
2834                 /*
2835                  * The volume header is flushed manually by the syncer, not
2836                  * here.
2837                  */
2838                 break;
2839         case HAMMER2_BREF_TYPE_DATA:
2840                 /*
2841                  * Data elements have already been flushed via the logical
2842                  * file buffer cache.  Their hash was set in the bref by
2843                  * the vop_write code.
2844                  *
2845                  * Make sure the buffer(s) have been flushed out here.
2846                  */
2847                 bbytes = chain->bytes;
2848                 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
2849                 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
2850
2851                 bp = getblk(hmp->devvp, pbase, bbytes, GETBLK_NOWAIT, 0);
2852                 if (bp) {
2853                         if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
2854                             (B_CACHE | B_DIRTY)) {
2855                                 kprintf("x");
2856                                 cluster_awrite(bp);
2857                         } else {
2858                                 bp->b_flags |= B_RELBUF;
2859                                 brelse(bp);
2860                         }
2861                 }
2862                 break;
2863         case HAMMER2_BREF_TYPE_INDIRECT:
2864                 /*
2865                  * Indirect blocks may be in an INITIAL state.  Use the
2866                  * chain_lock() call to ensure that the buffer has been
2867                  * instantiated (even though it is already locked the buffer
2868                  * might not have been instantiated).
2869                  *
2870                  * Only write the buffer out if it is dirty, it is possible
2871                  * the operating system had already written out the buffer.
2872                  */
2873                 hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_ALWAYS);
2874                 KKASSERT(chain->bp != NULL);
2875
2876                 bp = chain->bp;
2877                 if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) ||
2878                     (bp->b_flags & B_DIRTY)) {
2879                         bawrite(chain->bp);
2880                 } else {
2881                         brelse(chain->bp);
2882                 }
2883                 chain->bp = NULL;
2884                 chain->data = NULL;
2885                 hammer2_chain_unlock(hmp, chain);
2886                 break;
2887         default:
2888                 /*
2889                  * Embedded elements have to be flushed out.
2890                  */
2891                 KKASSERT(chain->data != NULL);
2892                 KKASSERT(chain->bp == NULL);
2893                 bref = &chain->bref;
2894
2895                 KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
2896
2897                 if (chain->bp == NULL) {
2898                         /*
2899                          * The data is embedded, we have to acquire the
2900                          * buffer cache buffer and copy the data into it.
2901                          */
2902                         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
2903                                 bbytes = HAMMER2_MINIOSIZE;
2904                         pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
2905                         boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
2906
2907                         /*
2908                          * The getblk() optimization can only be used if the
2909                          * physical block size matches the request.
2910                          */
2911                         if (chain->bytes == bbytes) {
2912                                 bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
2913                                 error = 0;
2914                         } else {
2915                                 error = bread(hmp->devvp, pbase, bbytes, &bp);
2916                                 KKASSERT(error == 0);
2917                         }
2918                         bdata = (char *)bp->b_data + boff;
2919
2920                         /*
2921                          * Copy the data to the buffer, mark the buffer
2922                          * dirty, and convert the chain to unmodified.
2923                          *
2924                          * We expect we might have to make adjustments to
2925                          * non-data delayed-write buffers when doing an
2926                          * actual flush so use bawrite() instead of
2927                          * cluster_awrite() here.
2928                          */
2929                         bcopy(chain->data, bdata, chain->bytes);
2930                         bp->b_flags |= B_CLUSTEROK;
2931                         bawrite(bp);
2932                         bp = NULL;
2933                         chain->bref.check.iscsi32.value =
2934                                 hammer2_icrc32(chain->data, chain->bytes);
2935                         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
2936                                 ++hammer2_iod_meta_write;
2937                         else
2938                                 ++hammer2_iod_indr_write;
2939                 } else {
2940                         chain->bref.check.iscsi32.value =
2941                                 hammer2_icrc32(chain->data, chain->bytes);
2942                 }
2943         }
2944
2945         /*
2946          * Adjustments to the bref.  The caller will use this to adjust
2947          * our chain's pointer to this chain element.
2948          */
2949         bref = &chain->bref;
2950
2951         switch(bref->type) {
2952         case HAMMER2_BREF_TYPE_VOLUME:
2953                 KKASSERT(chain->data != NULL);
2954                 KKASSERT(chain->bp == NULL);
2955
2956                 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
2957                         hammer2_icrc32(
2958                                 (char *)&hmp->voldata +
2959                                  HAMMER2_VOLUME_ICRC1_OFF,
2960                                 HAMMER2_VOLUME_ICRC1_SIZE);
2961                 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
2962                         hammer2_icrc32(
2963                                 (char *)&hmp->voldata +
2964                                  HAMMER2_VOLUME_ICRC0_OFF,
2965                                 HAMMER2_VOLUME_ICRC0_SIZE);
2966                 hmp->voldata.icrc_volheader =
2967                         hammer2_icrc32(
2968                                 (char *)&hmp->voldata +
2969                                  HAMMER2_VOLUME_ICRCVH_OFF,
2970                                 HAMMER2_VOLUME_ICRCVH_SIZE);
2971                 break;
2972         default:
2973                 break;
2974
2975         }
2976 done:
2977         if (hammer2_debug & 0x0008) {
2978                 kprintf("%*.*s} %p/%d %04x ",
2979                         info->depth, info->depth, "",
2980                         chain, chain->refs, chain->flags);
2981         }
2982 }
2983
#if 0
/*
 * PASS2 - placeholder for a second flush pass; not yet implemented
 * (should be called only with the root chain?).  Kept compiled out
 * via #if 0 until the flush design settles.
 */
static void
hammer2_chain_flush_pass2(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
}
#endif
2993
/*
 * Stand-alone flush.  If the chain is unable to completely flush we have
 * to be sure that SUBMODIFIED propagates up the parent chain.  We must not
 * clear the MOVED bit after flushing in this situation or our desynchronized
 * bref will not properly update in the parent.
 *
 * This routine can be called from several places but the most important
 * is from the hammer2_vop_reclaim() function.  We want to try to completely
 * clean out the inode structure to prevent disconnected inodes from
 * building up and blowing out the kmalloc pool.
 *
 * If modify_tid is 0 (usual case), a new modify_tid is allocated and
 * applied to the flush.  The depth-limit handling code is the only
 * code which passes a non-zero modify_tid to hammer2_chain_flush().
 *
 * chain is locked on call and will remain locked on return.
 *
 * Parameters:
 *	hmp		- the hammer2 mount the chain belongs to.
 *	chain		- locked chain to flush (remains locked on return).
 *	modify_tid	- 0 to allocate a fresh transaction id, otherwise
 *			  the tid to flush under (depth-limit re-entry only).
 */
void
hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
		    hammer2_tid_t modify_tid)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *scan;
	hammer2_blockref_t *base;
	hammer2_flush_info_t info;
	int count;
	int reflush;

	/*
	 * Execute the recursive flush and handle deferrals.
	 *
	 * Chains can be ridiculously long (thousands deep), so to
	 * avoid blowing out the kernel stack the recursive flush has a
	 * depth limit.  Elements at the limit are placed on a list
	 * for re-execution after the stack has been popped.
	 */
	bzero(&info, sizeof(info));
	TAILQ_INIT(&info.flush_list);

	/*
	 * Allocate a fresh transaction id under the volume-data lock when
	 * the caller did not supply one, and flag the volume chain
	 * (MODIFIED_AUX) so the volume header update is picked up.
	 */
	if (modify_tid == 0) {
		hammer2_voldata_lock(hmp);
		info.modify_tid = hmp->voldata.alloc_tid++;
		atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
		hammer2_voldata_unlock(hmp);
	} else {
		info.modify_tid = modify_tid;
	}
	reflush = 1;	/* run the flush loop at least once */

	while (reflush) {
		/*
		 * Primary recursion
		 */
		hammer2_chain_flush_pass1(hmp, chain, &info);
		reflush = 0;

		while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
			/*
			 * Secondary recursion.  Note that a reference is
			 * retained from the element's presence on the
			 * deferral list.
			 */
			KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
			TAILQ_REMOVE(&info.flush_list, scan, flush_node);
			atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

			/*
			 * Now that we've popped back up we can do a secondary
			 * recursion on the deferred elements.
			 *
			 * NOTE(review): "defered" below is a typo in the
			 * debug message text (cosmetic only).
			 */
			if (hammer2_debug & 0x0040)
				kprintf("defered flush %p\n", scan);
			hammer2_chain_lock(hmp, scan, HAMMER2_RESOLVE_MAYBE);
			hammer2_chain_flush(hmp, scan, info.modify_tid);
			hammer2_chain_unlock(hmp, scan);

			/*
			 * Only flag a reflush if SUBMODIFIED is no longer
			 * set.  If SUBMODIFIED is set the element will just
			 * wind up on our flush_list again.
			 */
			if ((scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
					    HAMMER2_CHAIN_MODIFIED |
					    HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
				reflush = 1;
			}
			/* drop the reference held by the deferral list */
			hammer2_chain_drop(hmp, scan);
		}
		if ((hammer2_debug & 0x0040) && reflush)
			kprintf("reflush %p\n", chain);
	}

	/*
	 * The SUBMODIFIED bit must propagate upward if the chain could not
	 * be completely flushed.
	 */
	if (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
			    HAMMER2_CHAIN_MODIFIED |
			    HAMMER2_CHAIN_MODIFIED_AUX |
			    HAMMER2_CHAIN_MOVED)) {
		hammer2_chain_parent_setsubmod(hmp, chain);
	}

	/*
	 * If the only thing left is a simple bref update try to
	 * pro-actively update the parent, otherwise return early.
	 *
	 * The pro-active path is taken only for an INODE chain whose sole
	 * remaining flag is MOVED (i.e. nothing left but the bref update).
	 */
	parent = chain->parent;
	if (parent == NULL) {
		/* topmost chain, nothing above us to update */
		return;
	}
	if (chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
	    (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
			     HAMMER2_CHAIN_MODIFIED |
			     HAMMER2_CHAIN_MODIFIED_AUX |
			     HAMMER2_CHAIN_MOVED)) != HAMMER2_CHAIN_MOVED) {
		return;
	}

	/*
	 * We are locking backwards so allow the lock to fail.
	 */
	if (ccms_thread_lock_nonblock(&parent->cst, CCMS_STATE_EXCLUSIVE))
		return;

	/*
	 * We are updating brefs but we have to call chain_modify()
	 * because our caller is not being run from a recursive flush.
	 *
	 * This will also chain up the parent list and set the SUBMODIFIED
	 * flag.
	 *
	 * We do not want to set HAMMER2_CHAIN_MODIFY_TID here because the
	 * modification is only related to updating a bref in the parent.
	 *
	 * When updating the blockset embedded in the volume header we must
	 * also update voldata.mirror_tid.
	 */
	hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
	hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);

	/*
	 * Locate the blockref array embedded in the parent (its layout
	 * depends on the parent's bref type) so our slot can be updated.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		base = &parent->data->ipdata.u.blockset.
			blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		base = &parent->data->npdata.blockref[0];
		count = parent->bytes /
			sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		if (chain->flags & HAMMER2_CHAIN_MOVED) {
			if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
				hmp->voldata.mirror_tid =
					chain->bref.mirror_tid;
			}
		}
		break;
	default:
		base = NULL;
		panic("hammer2_chain_flush: "
		      "unrecognized blockref type: %d",
		      parent->bref.type);
	}

	/*
	 * Update the blockref in the parent.  We do not have to set
	 * MOVED in the parent because the parent has been marked modified,
	 * so the flush sequence will pick up the bref change.
	 *
	 * We do have to propagate mirror_tid upward.
	 */
	KKASSERT(chain->index >= 0 &&
		 chain->index < count);
	KKASSERT(chain->parent == parent);
	if (chain->flags & HAMMER2_CHAIN_MOVED) {
		base[chain->index] = chain->bref_flush;
		if (parent->bref.mirror_tid < chain->bref_flush.mirror_tid)
			parent->bref.mirror_tid = chain->bref_flush.mirror_tid;
		/* MOVED resolved; drop the ref that accompanied the flag */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
		hammer2_chain_drop(hmp, chain);
	} else if (bcmp(&base[chain->index], &chain->bref_flush,
		   sizeof(chain->bref)) != 0) {
		panic("hammer2: unflagged bref update(2)");
	}
	ccms_thread_unlock(&parent->cst);		/* release manual op */
	hammer2_chain_unlock(hmp, parent);
}