hammer2 - bigger stabilization & performance pass
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem handles direct and indirect block searches, recursions,
37  * creation, and deletion.  Chains of blockrefs are tracked and modifications
38  * are flagged for propagation... eventually all the way back to the volume
39  * header.
40  */
41
42 #include <sys/cdefs.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/types.h>
46 #include <sys/lock.h>
47 #include <sys/uuid.h>
48
49 #include "hammer2.h"
50
51 static int hammer2_indirect_optimize;   /* XXX SYSCTL */
52
53 static hammer2_chain_t *hammer2_chain_create_indirect(
54                         hammer2_mount_t *hmp, hammer2_chain_t *parent,
55                         hammer2_key_t key, int keybits,
56                         int *errorp);
57
58 /*
59  * We use a red-black tree to guarantee safe lookups under shared locks.
60  */
61 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
62
63 int
64 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
65 {
66         return(chain2->index - chain1->index);
67 }
68
69 /*
70  * Recursively mark the parent chain elements so flushes can find
71  * modified elements.  Stop when we hit a chain already flagged
72  * SUBMODIFIED, but ignore the SUBMODIFIED bit that might be set
73  * in chain itself.
74  *
75  * SUBMODIFIED is not set on the chain passed in.
76  *
77  * The chain->cst.spin lock can be held to stabilize the chain->parent
78  * pointer.  The first parent is stabilized by virtue of chain being
79  * fully locked.
80  */
81 static void
82 hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
83 {
84         hammer2_chain_t *parent;
85
86         parent = chain->parent;
87         if (parent && (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
88                 spin_lock(&parent->cst.spin);
89                 for (;;) {
90                         atomic_set_int(&parent->flags,
91                                        HAMMER2_CHAIN_SUBMODIFIED);
92                         if ((chain = parent->parent) == NULL)
93                                 break;
94                         spin_lock(&chain->cst.spin);    /* upward interlock */
95                         spin_unlock(&parent->cst.spin);
96                         parent = chain;
97                 }
98                 spin_unlock(&parent->cst.spin);
99         }
100 }
101
102 /*
103  * Allocate a new disconnected chain element representing the specified
104  * bref.  The chain element is locked exclusively and refs is set to 1.
105  *
106  * This essentially allocates a system memory structure representing one
107  * of the media structure types, including inodes.
108  */
109 hammer2_chain_t *
110 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
111 {
112         hammer2_chain_t *chain;
113         hammer2_inode_t *ip;
114         hammer2_indblock_t *np;
115         hammer2_data_t *dp;
116         u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
117
118         /*
119          * Construct the appropriate system structure.
120          */
121         switch(bref->type) {
122         case HAMMER2_BREF_TYPE_INODE:
123                 ip = kmalloc(sizeof(*ip), hmp->minode, M_WAITOK | M_ZERO);
124                 chain = &ip->chain;
125                 chain->u.ip = ip;
126                 ip->hmp = hmp;
127                 break;
128         case HAMMER2_BREF_TYPE_INDIRECT:
129                 np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
130                 chain = &np->chain;
131                 chain->u.np = np;
132                 break;
133         case HAMMER2_BREF_TYPE_DATA:
134                 dp = kmalloc(sizeof(*dp), hmp->mchain, M_WAITOK | M_ZERO);
135                 chain = &dp->chain;
136                 chain->u.dp = dp;
137                 break;
138         case HAMMER2_BREF_TYPE_VOLUME:
139                 chain = NULL;
140                 panic("hammer2_chain_alloc volume type illegal for op");
141         default:
142                 chain = NULL;
143                 panic("hammer2_chain_alloc: unrecognized blockref type: %d",
144                       bref->type);
145         }
146
147         /*
148          * Only set bref_flush if the bref has a real media offset, otherwise
149          * the caller has to wait for the chain to be modified/block-allocated
150          * before a blockref can be synchronized with its (future) parent.
151          */
152         chain->bref = *bref;
153         if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
154                 chain->bref_flush = *bref;
155         chain->index = -1;              /* not yet assigned */
156         chain->refs = 1;
157         chain->bytes = bytes;
158         ccms_cst_init(&chain->cst, chain);
159         ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
160
161         return (chain);
162 }
163
164 /*
165  * Deallocate a chain (the step before freeing it).  Remove the chain from
166  * its parent's tree.
167  *
168  * Caller must hold the parent and the chain exclusively locked, and
169  * chain->refs must be 0.
170  *
171  * This function unlocks, removes, and destroys chain, and will recursively
172  * destroy any sub-chains under chain (whose refs must also be 0 at this
173  * point).
174  *
175  * parent can be NULL.
176  */
177 static void
178 hammer2_chain_dealloc(hammer2_mount_t *hmp, hammer2_chain_t *chain)
179 {
180         hammer2_inode_t *ip;
181         hammer2_chain_t *parent;
182         hammer2_chain_t *child;
183
184         KKASSERT(chain->refs == 0);
185         KKASSERT(chain->flushing == 0);
186         KKASSERT((chain->flags &
187                   (HAMMER2_CHAIN_MOVED | HAMMER2_CHAIN_MODIFIED)) == 0);
188
189         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
190                 ip = chain->u.ip;
191         else
192                 ip = NULL;
193
194         /*
195          * If the sub-tree is not empty all the elements on it must have
196          * 0 refs and be deallocatable.
197          */
198         while ((child = RB_ROOT(&chain->rbhead)) != NULL) {
199                 ccms_thread_lock(&child->cst, CCMS_STATE_EXCLUSIVE);
200                 hammer2_chain_dealloc(hmp, child);
201         }
202
203         /*
204          * If the DELETED flag is not set the chain must be removed from
205          * its parent's tree.
206          *
207          * WARNING! chain->cst.spin must be held when chain->parent is
208          *          modified, even though we own the full blown lock,
209          *          to deal with setsubmod and rename races.
210          */
211         if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
212                 spin_lock(&chain->cst.spin);    /* shouldn't be needed */
213                 parent = chain->parent;
214                 RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
215                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
216                 if (ip)
217                         ip->pip = NULL;
218                 chain->parent = NULL;
219                 spin_unlock(&chain->cst.spin);
220         }
221
222         /*
223          * When cleaning out a hammer2_inode we must
224          * also clean out the related ccms_inode.
225          */
226         if (ip)
227                 ccms_cst_uninit(&ip->topo_cst);
228         hammer2_chain_free(hmp, chain);
229 }
230
231 /*
232  * Free a disconnected chain element
233  */
234 void
235 hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
236 {
237         void *mem;
238
239         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
240             chain->bref.type == HAMMER2_BREF_TYPE_VOLUME) {
241                 chain->data = NULL;
242         }
243
244         KKASSERT(chain->bp == NULL);
245         KKASSERT(chain->data == NULL);
246         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
247                  chain->u.ip->vp == NULL);
248         ccms_thread_unlock(&chain->cst);
249         KKASSERT(chain->cst.count == 0);
250         KKASSERT(chain->cst.upgrade == 0);
251
252         if ((mem = chain->u.mem) != NULL) {
253                 chain->u.mem = NULL;
254                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
255                         kfree(mem, hmp->minode);
256                 else
257                         kfree(mem, hmp->mchain);
258         }
259 }
260
261 /*
262  * Add a reference to a chain element, preventing its destruction.
263  *
264  * The parent chain must be locked shared or exclusive or otherwise be
265  * stable and already have a reference.
266  */
267 void
268 hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
269 {
270         u_int refs;
271
272         while (chain) {
273                 refs = chain->refs;
274                 KKASSERT(chain->refs >= 0);
275                 cpu_ccfence();
276                 if (refs == 0) {
277                         /*
278                          * 0 -> 1 transition must bump the refs on the parent
279                          * too.  The caller has stabilized the parent.
280                          */
281                         if (atomic_cmpset_int(&chain->refs, 0, 1)) {
282                                 chain = chain->parent;
283                                 KKASSERT(chain == NULL || chain->refs > 0);
284                         }
285                         /* retry or continue along the parent chain */
286                 } else {
287                         /*
288                          * N -> N+1
289                          */
290                         if (atomic_cmpset_int(&chain->refs, refs, refs + 1))
291                                 break;
292                         /* retry */
293                 }
294         }
295 }
296
297 /*
298  * Drop the caller's reference to the chain element.  If the ref count
299  * reaches zero we attempt to recursively drop the parent.
300  *
301  * MOVED and MODIFIED elements hold additional references so it should not
302  * be possible for the count on a modified element to drop to 0.
303  *
304  * The chain element must NOT be locked by the caller on the 1->0 transition.
305  *
306  * The parent might or might not be locked by the caller.  If we are unable
307  * to lock the parent on the 1->0 transition the destruction of the chain
308  * will be deferred but we still recurse upward and drop the ref on the
309  * parent (see the lastdrop() function).
310  */
311 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_mount_t *hmp,
312                                                 hammer2_chain_t *chain);
313
314 void
315 hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
316 {
317         u_int refs;
318
319         while (chain) {
320                 refs = chain->refs;
321                 cpu_ccfence();
322                 KKASSERT(refs > 0);
323                 if (refs == 1) {
324                         /*
325                          * (1) lastdrop successfully drops the chain to 0
326          *     refs and may or may not have destroyed it.
327                          *     lastdrop will return the parent so we can
328                          *     recursively drop the implied ref from the
329                          *     1->0 transition.
330                          *
331                          * (2) lastdrop fails to transition refs from 1 to 0
332                          *     and returns the same chain, we retry.
333                          */
334                         chain = hammer2_chain_lastdrop(hmp, chain);
335                 } else {
336                         if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
337                                 /*
338                                  * Succeeded, count did not reach zero so
339                                  * cut out of the loop.
340                                  */
341                                 break;
342                         }
343                         /* retry the same chain */
344                 }
345         }
346 }
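
/*
 * Rough usage sketch (a typical pattern): the ref/drop pairing used to
 * keep a chain structure alive across a temporary unlock.
 * hammer2_chain_lock()/unlock() manage their own ref internally, so an
 * explicit ref is only needed when the caller wants the pointer to stay
 * valid while the chain is unlocked, similar to what the lookup code
 * below does when stepping between parents:
 *
 *	hammer2_chain_ref(hmp, chain);		(pin the structure)
 *	hammer2_chain_unlock(hmp, chain);	(drops the lock's own ref)
 *	...					(chain memory stays valid)
 *	hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
 *	hammer2_chain_drop(hmp, chain);		(release the explicit ref)
 */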
347
348 /*
349  * Handle SMP races during the last drop.  We must obtain a lock on
350  * chain->parent to stabilize the last pointer reference to chain
351  * (if applicable).  This reference does not have a parallel ref count,
352  * that is, idle chains in the topology can have a ref count of 0.
353  *
354  * The 1->0 transition implies a ref on the parent.
355  */
356 static
357 hammer2_chain_t *
358 hammer2_chain_lastdrop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
359 {
360         hammer2_chain_t *parent;
361
362         /*
363          * Stablize chain->parent with the chain cst's spinlock.
364          * (parent can be NULL here).
365          *
366          * cst.spin locks are allowed to be nested bottom-up (the reverse
367          * of the normal top-down for full-blown cst locks), so this also
368          * allows us to attempt to obtain the parent's cst lock non-blocking
369          * (which must acquire the parent's spinlock unconditionally) while
370          * we are still holding the chain's spinlock.
371          */
372         spin_lock(&chain->cst.spin);
373         parent = chain->parent;
374
375         /*
376          * If chain->flushing is non-zero we cannot deallocate the chain
377          * here.  The flushing field will be serialized for the inline
378          * unlock made by the flusher itself and we don't care about races
379          * in any other situation because the thread lock on the parent
380          * will fail in those cases.
381          *
382          * If we have a non-NULL parent but cannot acquire its thread
383          * lock, we also cannot deallocate the chain.
384          */
385         if (chain->flushing ||
386             (parent && ccms_thread_lock_nonblock(&parent->cst,
387                                                  CCMS_STATE_EXCLUSIVE))) {
388                 if (atomic_cmpset_int(&chain->refs, 1, 0)) {
389                         spin_unlock(&chain->cst.spin);  /* success */
390                         return(parent);
391                 } else {
392                         spin_unlock(&chain->cst.spin);  /* failure */
393                         return(chain);
394                 }
395         }
396         spin_unlock(&chain->cst.spin);
397
398         /*
399          * With the parent now held we control the last pointer reference
400          * to chain ONLY IF this is the 1->0 drop.  If we fail to transition
401          * from 1->0 we raced a refs change and must retry at chain.
402          */
403         if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
404                 /* failure */
405                 if (parent)
406                         ccms_thread_unlock(&parent->cst);
407                 return(chain);
408         }
409
410         /*
411          * Ok, we succeeded.  We now own the implied ref on the parent
412          * associated with the 1->0 transition of the child.  It should not
413          * be possible for ANYTHING to access the child now, as we own the
414          * lock on the parent, so we should be able to safely lock the
415          * child and destroy it.
416          */
417         ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
418         hammer2_chain_dealloc(hmp, chain);
419
420         /*
421          * We want to return parent with its implied ref to the caller
422          * to recurse and drop the parent.
423          */
424         if (parent)
425                 ccms_thread_unlock(&parent->cst);
426         return (parent);
427 }
428
429 /*
430  * Ref and lock a chain element, acquiring its data with I/O if necessary,
431  * and specify how you would like the data to be resolved.
432  *
433  * Returns 0 on success or an error code if the data could not be acquired.
434  * The chain element is locked either way.
435  *
436  * The lock is allowed to recurse, multiple locking ops will aggregate
437  * the requested resolve types.  Once data is assigned it will not be
438  * removed until the last unlock.
439  *
440  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
441  *                         (typically used to avoid device/logical buffer
442  *                          aliasing for data)
443  *
444  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
445  *                         the INITIAL-create state (indirect blocks only).
446  *
447  *                         Do not resolve data elements for DATA chains.
448  *                         (typically used to avoid device/logical buffer
449  *                          aliasing for data)
450  *
451  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
452  *
453  * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
454  *                         it will be locked exclusive.
455  *
456  * NOTE: Embedded elements (volume header, inodes) are always resolved
457  *       regardless.
458  *
459  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
460  *       element will instantiate and zero its buffer, and flush it on
461  *       release.
462  *
463  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
464  *       so as not to instantiate a device buffer, which could alias against
465  *       a logical file buffer.  However, if ALWAYS is specified the
466  *       device buffer will be instantiated anyway.
467  */
468 int
469 hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
470 {
471         hammer2_blockref_t *bref;
472         hammer2_off_t pbase;
473         hammer2_off_t peof;
474         ccms_state_t ostate;
475         size_t boff;
476         size_t bbytes;
477         int error;
478         char *bdata;
479
480         /*
481          * Ref and lock the element.  Recursive locks are allowed.
482          */
483         hammer2_chain_ref(hmp, chain);
484         if (how & HAMMER2_RESOLVE_SHARED)
485                 ccms_thread_lock(&chain->cst, CCMS_STATE_SHARED);
486         else
487                 ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
488
489         /*
490          * If we already have a valid data pointer no further action is
491          * necessary.
492          */
493         if (chain->data)
494                 return (0);
495
496         /*
497          * Do we have to resolve the data?
498          */
499         switch(how & HAMMER2_RESOLVE_MASK) {
500         case HAMMER2_RESOLVE_NEVER:
501                 return(0);
502         case HAMMER2_RESOLVE_MAYBE:
503                 if (chain->flags & HAMMER2_CHAIN_INITIAL)
504                         return(0);
505                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
506                         return(0);
507                 /* fall through */
508         case HAMMER2_RESOLVE_ALWAYS:
509                 break;
510         }
511
512         /*
513          * Upgrade to an exclusive lock so we can safely manipulate the
514          * buffer cache.  If another thread got to it before us we
515          * can just return.
516          */
517         ostate = ccms_thread_lock_upgrade(&chain->cst);
518         if (chain->data) {
519                 ccms_thread_lock_restore(&chain->cst, ostate);
520                 return (0);
521         }
522
523         /*
524          * We must resolve to a device buffer, either by issuing I/O or
525          * by creating a zero-fill element.  We do not mark the buffer
526          * dirty when creating a zero-fill element (the hammer2_chain_modify()
527          * API must still be used to do that).
528          *
529          * The device buffer is variable-sized in powers of 2 down
530          * to HAMMER2_MINALLOCSIZE (typically 1K).  A 64K physical storage
531          * chunk always contains buffers of the same size. (XXX)
532          *
533          * The minimum physical IO size may be larger than the variable
534          * block size.
535          */
536         bref = &chain->bref;
537
538         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
539                 bbytes = HAMMER2_MINIOSIZE;
540         pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
541         peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
542         boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
543         KKASSERT(pbase != 0);
544
545         /*
546          * The getblk() optimization can only be used on newly created
547          * elements if the physical block size matches the request.
548          */
549         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
550             chain->bytes == bbytes) {
551                 chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
552                 error = 0;
553         } else if (hammer2_cluster_enable) {
554                 error = cluster_read(hmp->devvp, peof, pbase, bbytes,
555                                      HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
556                                      &chain->bp);
557         } else {
558                 error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
559         }
560
561         if (error) {
562                 kprintf("hammer2_chain_lock: I/O error %016jx: %d\n",
563                         (intmax_t)pbase, error);
564                 bqrelse(chain->bp);
565                 chain->bp = NULL;
566                 ccms_thread_lock_restore(&chain->cst, ostate);
567                 return (error);
568         }
569
570         /*
571          * Zero the data area if the chain is in the INITIAL-create state.
572          * Mark the buffer for bdwrite().
573          */
574         bdata = (char *)chain->bp->b_data + boff;
575         if (chain->flags & HAMMER2_CHAIN_INITIAL) {
576                 bzero(bdata, chain->bytes);
577                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
578         }
579
580         /*
581          * Setup the data pointer, either pointing it to an embedded data
582          * structure and copying the data from the buffer, or pointing it
583          * into the buffer.
584          *
585          * The buffer is not retained when copying to an embedded data
586          * structure in order to avoid potential deadlocks or recursions
587          * on the same physical buffer.
588          */
589         switch (bref->type) {
590         case HAMMER2_BREF_TYPE_VOLUME:
591                 /*
592                  * Copy data from bp to embedded buffer
593                  */
594                 panic("hammer2_chain_lock: called on unresolved volume header");
595 #if 0
596                 /* NOT YET */
597                 KKASSERT(pbase == 0);
598                 KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
599                 bcopy(bdata, &hmp->voldata, chain->bytes);
600                 chain->data = (void *)&hmp->voldata;
601                 bqrelse(chain->bp);
602                 chain->bp = NULL;
603 #endif
604                 break;
605         case HAMMER2_BREF_TYPE_INODE:
606                 /*
607                  * Copy data from bp to embedded buffer, do not retain the
608                  * device buffer.
609                  */
610                 bcopy(bdata, &chain->u.ip->ip_data, chain->bytes);
611                 chain->data = (void *)&chain->u.ip->ip_data;
612                 bqrelse(chain->bp);
613                 chain->bp = NULL;
614                 break;
615         case HAMMER2_BREF_TYPE_INDIRECT:
616         case HAMMER2_BREF_TYPE_DATA:
617         default:
618                 /*
619                  * Point data at the device buffer and leave bp intact.
620                  */
621                 chain->data = (void *)bdata;
622                 break;
623         }
624
625         /*
626          * Make sure the bp is not specifically owned by this thread before
627          * restoring to a possibly shared lock, so another hammer2 thread
628          * can release it.
629          */
630         if (chain->bp)
631                 BUF_KERNPROC(chain->bp);
632         ccms_thread_lock_restore(&chain->cst, ostate);
633         return (0);
634 }
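
/*
 * Rough usage sketch of the lock/unlock bracket (a typical pattern):
 * metadata consumers usually resolve the data, while file data paths
 * usually lock RESOLVE_NEVER to avoid device/logical buffer aliasing.
 * The chain is locked even when an error is returned, so the unlock is
 * unconditional.
 *
 *	error = hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
 *	if (error == 0) {
 *		...		(chain->data is valid here unless the
 *				 chain is INITIAL or a DATA chain under
 *				 the MAYBE policy)
 *	}
 *	hammer2_chain_unlock(hmp, chain);
 */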
635
636 /*
637  * Unlock and deref a chain element.
638  *
639  * On the last lock release any non-embedded data (chain->bp) will be
640  * retired.
641  */
642 void
643 hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
644 {
645         long *counterp;
646
647         /*
648          * Release the CST lock but with a special 1->0 transition case.
649          *
650          * Returns non-zero if lock references remain.  When zero is
651          * returned the last lock reference is retained and any shared
652          * lock is upgraded to an exclusive lock for final disposition.
653          */
654         if (ccms_thread_unlock_zero(&chain->cst)) {
655                 KKASSERT(chain->refs > 1);
656                 atomic_add_int(&chain->refs, -1);
657                 return;
658         }
659
660         /*
661          * Shortcut the case if the data is embedded or not resolved.
662          *
663          * Do NOT null-out pointers to embedded data (e.g. inode).
664          *
665          * The DIRTYBP flag is non-applicable in this situation and can
666          * be cleared to keep the flags state clean.
667          */
668         if (chain->bp == NULL) {
669                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
670                 ccms_thread_unlock(&chain->cst);
671                 hammer2_chain_drop(hmp, chain);
672                 return;
673         }
674
675         /*
676          * Statistics
677          */
678         if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
679                 ;
680         } else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
681                 switch(chain->bref.type) {
682                 case HAMMER2_BREF_TYPE_DATA:
683                         counterp = &hammer2_ioa_file_write;
684                         break;
685                 case HAMMER2_BREF_TYPE_INODE:
686                         counterp = &hammer2_ioa_meta_write;
687                         break;
688                 case HAMMER2_BREF_TYPE_INDIRECT:
689                         counterp = &hammer2_ioa_indr_write;
690                         break;
691                 default:
692                         counterp = &hammer2_ioa_volu_write;
693                         break;
694                 }
695                 ++*counterp;
696         } else {
697                 switch(chain->bref.type) {
698                 case HAMMER2_BREF_TYPE_DATA:
699                         counterp = &hammer2_iod_file_write;
700                         break;
701                 case HAMMER2_BREF_TYPE_INODE:
702                         counterp = &hammer2_iod_meta_write;
703                         break;
704                 case HAMMER2_BREF_TYPE_INDIRECT:
705                         counterp = &hammer2_iod_indr_write;
706                         break;
707                 default:
708                         counterp = &hammer2_iod_volu_write;
709                         break;
710                 }
711                 ++*counterp;
712         }
713
714         /*
715          * Clean out the bp.
716          *
717          * If a device buffer was used for data be sure to destroy the
718          * buffer when we are done to avoid aliases (XXX what about the
719          * underlying VM pages?).
720          */
721         if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
722                 chain->bp->b_flags |= B_RELBUF;
723
724         /*
725          * The DIRTYBP flag tracks whether we have to bdwrite() the buffer
726          * or not.  The flag will get re-set when chain_modify() is called,
727          * even if MODIFIED is already set, allowing the OS to retire the
728          * buffer independent of a hammer2 flush.
729          */
730         chain->data = NULL;
731         if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
732                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
733                 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
734                         atomic_clear_int(&chain->flags,
735                                          HAMMER2_CHAIN_IOFLUSH);
736                         chain->bp->b_flags |= B_RELBUF;
737                         cluster_awrite(chain->bp);
738                 } else {
739                         chain->bp->b_flags |= B_CLUSTEROK;
740                         bdwrite(chain->bp);
741                 }
742         } else {
743                 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
744                         atomic_clear_int(&chain->flags,
745                                          HAMMER2_CHAIN_IOFLUSH);
746                         chain->bp->b_flags |= B_RELBUF;
747                         brelse(chain->bp);
748                 } else {
749                         /* bp might still be dirty */
750                         bqrelse(chain->bp);
751                 }
752         }
753         chain->bp = NULL;
754         ccms_thread_unlock(&chain->cst);
755         hammer2_chain_drop(hmp, chain);
756 }
757
758 /*
759  * Resize the chain's physical storage allocation.  Chains can be resized
760  * smaller or larger; the current implementation relocates the storage in
761  * either case (different block sizes may live in different regions).
762  *
763  * Must be passed a locked chain.
764  *
765  * If you want the resize code to copy the data to the new block then the
766  * caller should lock the chain RESOLVE_MAYBE or RESOLVE_ALWAYS.
767  *
768  * If the caller already holds a logical buffer containing the data and
769  * intends to bdwrite() that buffer, resolve with RESOLVE_NEVER.  The resize
770  * operation will then not copy the data.
771  *
772  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
773  * to avoid instantiating a device buffer that conflicts with the vnode
774  * data buffer.
775  *
776  * XXX flags currently ignored, uses chain->bp to detect data/no-data.
777  */
778 void
779 hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
780                      int nradix, int flags)
781 {
782         hammer2_mount_t *hmp = ip->hmp;
783         struct buf *nbp;
784         hammer2_off_t pbase;
785         size_t obytes;
786         size_t nbytes;
787         size_t bbytes;
788         int boff;
789         char *bdata;
790         int error;
791
792         /*
793          * Only data and indirect blocks can be resized for now
794          */
795         KKASSERT(chain != &hmp->vchain);
796         KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
797                  chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
798
799         /*
800          * Nothing to do if the element is already the proper size
801          */
802         obytes = chain->bytes;
803         nbytes = 1U << nradix;
804         if (obytes == nbytes)
805                 return;
806
807         /*
808          * Set MODIFIED and add a chain ref to prevent destruction.  Both
809          * modified flags share the same ref.
810          *
811          * If the chain is already marked MODIFIED then we can safely
812          * return the previous allocation to the pool without having to
813          * worry about snapshots.
814          */
815         if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
816                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
817                                               HAMMER2_CHAIN_MODIFY_TID);
818                 hammer2_chain_ref(hmp, chain);
819         } else {
820                 hammer2_freemap_free(hmp, chain->bref.data_off,
821                                      chain->bref.type);
822         }
823
824         /*
825          * Relocate the block, even if making it smaller (because different
826          * block sizes may be in different regions).
827          */
828         chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bref.type,
829                                                      nbytes);
830         chain->bytes = nbytes;
831         ip->delta_dcount += (ssize_t)(nbytes - obytes); /* XXX atomic */
832
833         /*
834          * The device buffer may be larger than the allocation size.
835          */
836         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
837                 bbytes = HAMMER2_MINIOSIZE;
838         pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
839         boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
840
841         /*
842          * Only copy the data if resolved, otherwise the caller is
843          * responsible.
844          */
845         if (chain->bp) {
846                 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
847                          chain->bref.type == HAMMER2_BREF_TYPE_DATA);
848                 KKASSERT(chain != &hmp->vchain);        /* safety */
849
850                 /*
851                  * The getblk() optimization can only be used if the
852                  * physical block size matches the request.
853                  */
854                 if (nbytes == bbytes) {
855                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
856                         error = 0;
857                 } else {
858                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
859                         KKASSERT(error == 0);
860                 }
861                 bdata = (char *)nbp->b_data + boff;
862
863                 if (nbytes < obytes) {
864                         bcopy(chain->data, bdata, nbytes);
865                 } else {
866                         bcopy(chain->data, bdata, obytes);
867                         bzero(bdata + obytes, nbytes - obytes);
868                 }
869
870                 /*
871                  * NOTE: The INITIAL state of the chain is left intact.
872                  *       We depend on hammer2_chain_modify() to do the
873                  *       right thing.
874                  *
875                  * NOTE: We set B_NOCACHE to throw away the previous bp and
876                  *       any VM backing store, even if it was dirty.
877                  *       Otherwise we run the risk of a logical/device
878                  *       conflict on reallocation.
879                  */
880                 chain->bp->b_flags |= B_RELBUF | B_NOCACHE;
881                 brelse(chain->bp);
882                 chain->bp = nbp;
883                 chain->data = (void *)bdata;
884                 hammer2_chain_modify(hmp, chain, 0);
885         }
886
887         /*
888          * Make sure the chain is marked MOVED and SUBMOD is set in the
889          * parent(s) so the adjustments are picked up by flush.
890          */
891         if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
892                 hammer2_chain_ref(hmp, chain);
893                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
894         }
895         hammer2_chain_parent_setsubmod(hmp, chain);
896 }
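
/*
 * Rough usage sketch of the resize contract (a typical pattern, assuming
 * nradix is log2 of the desired byte count): a caller which already
 * holds the logical file buffer and intends to bdwrite() it locks
 * RESOLVE_NEVER so the resize does not copy data through a device
 * buffer.
 *
 *	hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_NEVER);
 *	hammer2_chain_resize(ip, chain, nradix, 0);
 *	hammer2_chain_unlock(hmp, chain);
 */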
897
898 /*
899  * Convert a locked chain that was retrieved read-only to read-write.
900  *
901  * If not already marked modified a new physical block will be allocated
902  * and assigned to the bref.
903  *
904  * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
905  *                   level or the COW operation will not work.
906  *
907  * Data blocks     - The chain is usually locked RESOLVE_NEVER so as not to
908  *                   run the data through the device buffers.
909  */
910 void
911 hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
912 {
913         struct buf *nbp;
914         int error;
915         hammer2_off_t pbase;
916         size_t bbytes;
917         size_t boff;
918         void *bdata;
919
920         /*
921          * Tells flush that modify_tid must be updated, otherwise only
922          * mirror_tid is updated.  This is the default.
923          */
924         if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
925                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFY_TID);
926
927         /*
928          * If the chain is already marked MODIFIED we can just return.
929          *
930          * However, it is possible that a prior lock/modify sequence
931          * retired the buffer.  During this lock/modify sequence MODIFIED
932          * may still be set but the buffer could wind up clean.  Since
933          * the caller is going to modify the buffer further we have to
934          * be sure that DIRTYBP is set again.
935          */
936         if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
937                 if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
938                     chain->bp == NULL) {
939                         goto skip1;
940                 }
941                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
942                 return;
943         }
944
945         /*
946          * Set MODIFIED and add a chain ref to prevent destruction.  Both
947          * modified flags share the same ref.
948          */
949         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
950         hammer2_chain_ref(hmp, chain);
951
952         /*
953          * We must allocate the copy-on-write block.
954          *
955          * If the data is embedded no other action is required.
956          *
957          * If the data is not embedded we acquire and clear the
958          * new block.  If chain->data is not NULL we then do the
959          * copy-on-write.  chain->data will then be repointed to the new
960          * buffer and the old buffer will be released.
961          *
962          * For newly created elements with no prior allocation we go
963          * through the copy-on-write steps except without the copying part.
964          */
965         if (chain != &hmp->vchain) {
966                 if ((hammer2_debug & 0x0001) &&
967                     (chain->bref.data_off & HAMMER2_OFF_MASK)) {
968                         kprintf("Replace %d\n", chain->bytes);
969                 }
970                 chain->bref.data_off =
971                         hammer2_freemap_alloc(hmp, chain->bref.type,
972                                               chain->bytes);
973                 /* XXX failed allocation */
974         }
975
976         /*
977          * If data instantiation is optional and the chain has no current
978          * data association (typical for DATA and newly-created INDIRECT
979          * elements), don't instantiate the buffer now.
980          */
981         if ((flags & HAMMER2_MODIFY_OPTDATA) && chain->bp == NULL)
982                 goto skip2;
983
984 skip1:
985         /*
986          * Setting the DIRTYBP flag will cause the buffer to be dirtied or
987          * written-out on unlock.  This bit is independent of the MODIFIED
988          * bit because the chain may still need meta-data adjustments done
989          * by virtue of MODIFIED for its parent, and the buffer can be
990          * flushed out (possibly multiple times) by the OS before that.
991          *
992          * Clearing the INITIAL flag (for indirect blocks) indicates that
993          * a zero-fill buffer has been instantiated.
994          */
995         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
996         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
997
998         /*
999          * We currently should never instantiate a device buffer for a
1000          * data chain.
1001          */
1002         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
1003
1004         /*
1005          * Execute COW operation
1006          */
1007         switch(chain->bref.type) {
1008         case HAMMER2_BREF_TYPE_VOLUME:
1009         case HAMMER2_BREF_TYPE_INODE:
1010                 /*
1011                  * The data is embedded, no copy-on-write operation is
1012                  * needed.
1013                  */
1014                 KKASSERT(chain->bp == NULL);
1015                 break;
1016         case HAMMER2_BREF_TYPE_DATA:
1017         case HAMMER2_BREF_TYPE_INDIRECT:
1018                 /*
1019                  * Perform the copy-on-write operation
1020                  */
1021                 KKASSERT(chain != &hmp->vchain);        /* safety */
1022                 /*
1023                  * The device buffer may be larger than the allocation size.
1024                  */
1025                 if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
1026                         bbytes = HAMMER2_MINIOSIZE;
1027                 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
1028                 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
1029
1030                 /*
1031                  * The getblk() optimization can only be used if the
1032                  * physical block size matches the request.
1033                  */
1034                 if (chain->bytes == bbytes) {
1035                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
1036                         error = 0;
1037                 } else {
1038                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
1039                         KKASSERT(error == 0);
1040                 }
1041                 bdata = (char *)nbp->b_data + boff;
1042
1043                 /*
1044                  * Copy or zero-fill on write depending on whether
1045                  * chain->data exists or not.
1046                  */
1047                 if (chain->data) {
1048                         bcopy(chain->data, bdata, chain->bytes);
1049                         KKASSERT(chain->bp != NULL);
1050                 } else {
1051                         bzero(bdata, chain->bytes);
1052                 }
1053                 if (chain->bp) {
1054                         chain->bp->b_flags |= B_RELBUF;
1055                         brelse(chain->bp);
1056                 }
1057                 chain->bp = nbp;
1058                 chain->data = bdata;
1059                 break;
1060         default:
1061                 panic("hammer2_chain_modify: illegal non-embedded type %d",
1062                       chain->bref.type);
1063                 break;
1064
1065         }
1066 skip2:
1067         if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
1068                 hammer2_chain_parent_setsubmod(hmp, chain);
1069 }
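
/*
 * Rough usage sketch of the modify sequence (a typical pattern):
 * convert a read-only chain to writable before editing its media data.
 * Non-data chains are locked at least RESOLVE_MAYBE so the
 * copy-on-write step has source data to copy from; the DIRTYBP flag set
 * by the modify causes the buffer to be bdwrite()n on the final unlock.
 *
 *	hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
 *	hammer2_chain_modify(hmp, chain, 0);
 *	...			(edit chain->data)
 *	hammer2_chain_unlock(hmp, chain);
 */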
1070
1071 /*
1072  * Mark the volume as having been modified.  This short-cut version
1073  * does not have to lock the volume's chain, which allows the ioctl
1074  * code to make adjustments to connections without deadlocking.
1075  */
1076 void
1077 hammer2_modify_volume(hammer2_mount_t *hmp)
1078 {
1079         hammer2_voldata_lock(hmp);
1080         atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
1081         hammer2_voldata_unlock(hmp);
1082 }
1083
1084 /*
1085  * Locate an in-memory chain.  The parent must be locked.  The in-memory
1086  * chain is returned or NULL if no in-memory chain is present.
1087  *
1088  * NOTE: A chain on-media might exist for this index when NULL is returned.
1089  */
1090 hammer2_chain_t *
1091 hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
1092 {
1093         hammer2_chain_t dummy;
1094         hammer2_chain_t *chain;
1095
1096         dummy.index = index;
1097         chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
1098         return (chain);
1099 }
1100
1101 /*
1102  * Return a locked chain structure with all associated data acquired.
1103  *
1104  * Caller must lock the parent on call; the returned child will be locked.
1105  */
1106 hammer2_chain_t *
1107 hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1108                   int index, int flags)
1109 {
1110         hammer2_blockref_t *bref;
1111         hammer2_inode_t *ip;
1112         hammer2_chain_t *chain;
1113         hammer2_chain_t dummy;
1114         int how;
1115         ccms_state_t ostate;
1116
1117         /*
1118          * Figure out how to lock.  MAYBE can be used to optimize
1119          * the initial-create state for indirect blocks.
1120          */
1121         if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
1122                 how = HAMMER2_RESOLVE_NEVER;
1123         else
1124                 how = HAMMER2_RESOLVE_MAYBE;
1125         if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
1126                 how |= HAMMER2_RESOLVE_SHARED;
1127
1128         /*
1129          * First see if we have a (possibly modified) chain element cached
1130          * for this (parent, index).  Acquire the data if necessary.
1131          *
1132          * If chain->data is non-NULL the chain should already be marked
1133          * modified.
1134          */
1135         dummy.index = index;
1136         chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
1137         if (chain) {
1138                 if (flags & HAMMER2_LOOKUP_NOLOCK)
1139                         hammer2_chain_ref(hmp, chain);
1140                 else
1141                         hammer2_chain_lock(hmp, chain, how);
1142                 return(chain);
1143         }
1144
1145         /*
1146          * Upgrade our thread lock and handle any race that may have
1147          * occurred.  Leave the lock upgraded for the rest of the get.
1148          * We have to do this because we will be modifying the chain
1149          * structure.
1150          */
1151         ostate = ccms_thread_lock_upgrade(&parent->cst);
1152         chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
1153         if (chain) {
1154                 if (flags & HAMMER2_LOOKUP_NOLOCK)
1155                         hammer2_chain_ref(hmp, chain);
1156                 else
1157                         hammer2_chain_lock(hmp, chain, how);
1158                 ccms_thread_lock_restore(&parent->cst, ostate);
1159                 return(chain);
1160         }
1161
1162         /*
1163          * The get function must always succeed, panic if there's no
1164          * data to index.
1165          */
1166         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1167                 ccms_thread_lock_restore(&parent->cst, ostate);
1168                 panic("hammer2_chain_get: Missing bref(1)");
1169                 /* NOT REACHED */
1170         }
1171
1172         /*
1173          * Otherwise lookup the bref and issue I/O (switch on the parent)
1174          */
1175         switch(parent->bref.type) {
1176         case HAMMER2_BREF_TYPE_INODE:
1177                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1178                 bref = &parent->data->ipdata.u.blockset.blockref[index];
1179                 break;
1180         case HAMMER2_BREF_TYPE_INDIRECT:
1181                 KKASSERT(parent->data != NULL);
1182                 KKASSERT(index >= 0 &&
1183                          index < parent->bytes / sizeof(hammer2_blockref_t));
1184                 bref = &parent->data->npdata.blockref[index];
1185                 break;
1186         case HAMMER2_BREF_TYPE_VOLUME:
1187                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1188                 bref = &hmp->voldata.sroot_blockset.blockref[index];
1189                 break;
1190         default:
1191                 bref = NULL;
1192                 panic("hammer2_chain_get: unrecognized blockref type: %d",
1193                       parent->bref.type);
1194         }
1195         if (bref->type == 0) {
1196                 panic("hammer2_chain_get: Missing bref(2)");
1197                 /* NOT REACHED */
1198         }
1199
1200         /*
1201          * Allocate a chain structure representing the existing media
1202          * entry.
1203          *
1204          * The locking operation we do later will issue I/O to read it.
1205          */
1206         chain = hammer2_chain_alloc(hmp, bref);
1207
1208         /*
1209          * Link the chain into its parent.  Caller is expected to hold an
1210          * exclusive lock on the parent.
1211          */
1212         chain->parent = parent;
1213         chain->index = index;
1214         if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
1215                 panic("hammer2_chain_link: collision");
1216         atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
1217         KKASSERT(parent->refs > 0);
1218         atomic_add_int(&parent->refs, 1);       /* for red-black entry */
1219         ccms_thread_lock_restore(&parent->cst, ostate);
1220
1221         /*
1222          * Additional linkage for inodes.  Reuse the parent pointer to
1223          * find the parent directory.
1224          *
1225          * The ccms_inode is initialized from its parent directory.  The
1226          * chain of ccms_inode's is seeded by the mount code.
1227          */
1228         if (bref->type == HAMMER2_BREF_TYPE_INODE) {
1229                 ip = chain->u.ip;
1230                 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1231                         parent = parent->parent;
1232                 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
1233                         ip->pip = parent->u.ip;
1234                         ip->pmp = parent->u.ip->pmp;
1235                         ccms_cst_init(&ip->topo_cst, &ip->chain);
1236                 }
1237         }
1238
1239         /*
1240          * Our new chain structure has already been referenced and locked
1241          * but the lock code handles the I/O so call it to resolve the data.
1242          * Then release one of our two exclusive locks.
1243          *
1244          * If NOLOCK is set the release will release the one-and-only lock.
1245          */
1246         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1247                 hammer2_chain_lock(hmp, chain, how);    /* recursive lock */
1248                 hammer2_chain_drop(hmp, chain);         /* excess ref */
1249         }
1250         ccms_thread_unlock(&chain->cst);                        /* from alloc */
1251
1252         return (chain);
1253 }
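
/*
 * Rough usage sketch for hammer2_chain_get() (a typical pattern): fetch
 * the chain at a specific blockref index under a parent the caller has
 * locked, inspect it without resolving its data, then release it.
 *
 *	chain = hammer2_chain_get(hmp, parent, i, HAMMER2_LOOKUP_NODATA);
 *	...			(examine chain->bref)
 *	hammer2_chain_unlock(hmp, chain);
 */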
1254
1255 /*
1256  * Locate any key between key_beg and key_end inclusive.  (*parentp)
1257  * typically points to an inode but can also point to a related indirect
1258  * block and this function will recurse upwards and find the inode again.
1259  *
1260  * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
1261  *           WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
1262  *           WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
1263  *
1264  * (*parentp) must be exclusively locked and referenced and can be an inode
1265  * or an existing indirect block within the inode.
1266  *
1267  * On return (*parentp) will be modified to point at the deepest parent chain
1268  * element encountered during the search, as a helper for an insertion or
1269  * deletion.   The new (*parentp) will be locked and referenced and the old
1270  * will be unlocked and dereferenced (no change if they are both the same).
1271  *
1272  * The matching chain will be returned exclusively locked and referenced.
1273  *
1274  * NULL is returned if no match was found, but (*parentp) will still
1275  * potentially be adjusted.
1276  *
1277  * This function will also recurse up the chain if the key is not within the
1278  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
1279  * can simply allow (*parentp) to float inside the loop.
1280  */
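
/*
 * Rough usage sketch of a ranged scan (a typical pattern): the caller
 * starts with the inode or an indirect chain locked and referenced in
 * 'parent', then iterates with hammer2_chain_next(), which continues
 * the scan from the previously returned chain.  (*parentp) floats to
 * the deepest parent visited and is cleaned up by the caller when the
 * scan ends.
 *
 *	chain = hammer2_chain_lookup(hmp, &parent, key_beg, key_end, 0);
 *	while (chain) {
 *		...		(process chain)
 *		chain = hammer2_chain_next(hmp, &parent, chain,
 *					   key_beg, key_end, 0);
 *	}
 */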
1281 hammer2_chain_t *
1282 hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
1283                      hammer2_key_t key_beg, hammer2_key_t key_end,
1284                      int flags)
1285 {
1286         hammer2_chain_t *parent;
1287         hammer2_chain_t *chain;
1288         hammer2_chain_t *tmp;
1289         hammer2_blockref_t *base;
1290         hammer2_blockref_t *bref;
1291         hammer2_key_t scan_beg;
1292         hammer2_key_t scan_end;
1293         int count = 0;
1294         int i;
1295         int how_always = HAMMER2_RESOLVE_ALWAYS;
1296         int how_maybe = HAMMER2_RESOLVE_MAYBE;
1297
1298         if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) {
1299                 how_maybe |= HAMMER2_RESOLVE_SHARED;
1300                 how_always |= HAMMER2_RESOLVE_SHARED;
1301         }
1302
1303         /*
1304          * Recurse (*parentp) upward if necessary until the parent completely
1305          * encloses the key range or we hit the inode.
1306          */
1307         parent = *parentp;
1308         while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1309                 scan_beg = parent->bref.key;
1310                 scan_end = scan_beg +
1311                            ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1312                 if (key_beg >= scan_beg && key_end <= scan_end)
1313                         break;
1314                 hammer2_chain_ref(hmp, parent);         /* ref old parent */
1315                 hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
1316                 parent = parent->parent;
1317                                                         /* lock new parent */
1318                 hammer2_chain_lock(hmp, parent, how_maybe);
1319                 hammer2_chain_drop(hmp, *parentp);      /* drop old parent */
1320                 *parentp = parent;                      /* new parent */
1321         }
1322
1323 again:
1324         /*
1325          * Locate the blockref array.  Currently we do a fully associative
1326          * search through the array.
1327          */
1328         switch(parent->bref.type) {
1329         case HAMMER2_BREF_TYPE_INODE:
1330                 /*
1331                  * Special shortcut for embedded data returns the inode
1332                  * itself.  Callers must detect this condition and access
1333                  * the embedded data (the strategy code does this for us).
1334                  *
1335                  * This is only applicable to regular files and softlinks.
1336                  */
1337                 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1338                         if (flags & HAMMER2_LOOKUP_NOLOCK)
1339                                 hammer2_chain_ref(hmp, parent);
1340                         else
1341                                 hammer2_chain_lock(hmp, parent, how_always);
1342                         return (parent);
1343                 }
1344                 base = &parent->data->ipdata.u.blockset.blockref[0];
1345                 count = HAMMER2_SET_COUNT;
1346                 break;
1347         case HAMMER2_BREF_TYPE_INDIRECT:
1348                 /*
1349                  * Optimize indirect blocks in the INITIAL state to avoid
1350                  * I/O.
1351                  */
1352                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1353                         base = NULL;
1354                 } else {
1355                         if (parent->data == NULL)
1356                                 panic("parent->data is NULL");
1357                         base = &parent->data->npdata.blockref[0];
1358                 }
1359                 count = parent->bytes / sizeof(hammer2_blockref_t);
1360                 break;
1361         case HAMMER2_BREF_TYPE_VOLUME:
1362                 base = &hmp->voldata.sroot_blockset.blockref[0];
1363                 count = HAMMER2_SET_COUNT;
1364                 break;
1365         default:
1366                 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
1367                       parent->bref.type);
1368                 base = NULL;    /* safety */
1369                 count = 0;      /* safety */
1370         }
1371
1372         /*
1373          * If the element and key overlap we use the element.
1374          *
1375          * NOTE!  Deleted elements are effectively invisible.  A deleted
1376          *        element covers (makes invisible) any original media
1377          *        data.
1378          */
1379         bref = NULL;
1380         for (i = 0; i < count; ++i) {
1381                 tmp = hammer2_chain_find(hmp, parent, i);
1382                 if (tmp) {
1383                         if (tmp->flags & HAMMER2_CHAIN_DELETED)
1384                                 continue;
1385                         bref = &tmp->bref;
1386                         KKASSERT(bref->type != 0);
1387                 } else if (base == NULL || base[i].type == 0) {
1388                         continue;
1389                 } else {
1390                         bref = &base[i];
1391                 }
1392                 scan_beg = bref->key;
1393                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1394                 if (key_beg <= scan_end && key_end >= scan_beg)
1395                         break;
1396         }
1397         if (i == count) {
1398                 if (key_beg == key_end)
1399                         return (NULL);
1400                 return (hammer2_chain_next(hmp, parentp, NULL,
1401                                            key_beg, key_end, flags));
1402         }
1403
1404         /*
1405          * Acquire the new chain element.  If the chain element is an
1406          * indirect block we must search recursively.
1407          */
1408         chain = hammer2_chain_get(hmp, parent, i, flags);
1409         if (chain == NULL)
1410                 return (NULL);
1411
1412         /*
1413          * If the chain element is an indirect block it becomes the new
1414          * parent and we loop on it.
1415          *
1416          * The parent always has to be locked with at least RESOLVE_MAYBE,
1417          * so it might need a fixup if the caller passed incompatible flags.
1418          */
1419         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1420                 hammer2_chain_unlock(hmp, parent);
1421                 *parentp = parent = chain;
1422                 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1423                         hammer2_chain_lock(hmp, chain, how_maybe);
1424                         hammer2_chain_drop(hmp, chain); /* excess ref */
1425                 } else if (flags & HAMMER2_LOOKUP_NODATA) {
1426                         hammer2_chain_lock(hmp, chain, how_maybe);
1427                         hammer2_chain_unlock(hmp, chain);
1428                 }
1429                 goto again;
1430         }
1431
1432         /*
1433          * All done, return chain
1434          */
1435         return (chain);
1436 }
1437
1438 /*
1439  * After having issued a lookup we can iterate all matching keys.
1440  *
1441  * If chain is non-NULL we continue the iteration from just after its index.
1442  *
1443  * If chain is NULL we assume the parent was exhausted and continue the
1444  * iteration at the next parent.
1445  *
1446  * parent must be locked on entry and remains locked throughout.  chain's
1447  * lock status must match flags.
1448  */
1449 hammer2_chain_t *
1450 hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
1451                    hammer2_chain_t *chain,
1452                    hammer2_key_t key_beg, hammer2_key_t key_end,
1453                    int flags)
1454 {
1455         hammer2_chain_t *parent;
1456         hammer2_chain_t *tmp;
1457         hammer2_blockref_t *base;
1458         hammer2_blockref_t *bref;
1459         hammer2_key_t scan_beg;
1460         hammer2_key_t scan_end;
1461         int i;
1462         int how_maybe = HAMMER2_RESOLVE_MAYBE;
1463         int count;
1464
1465         if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
1466                 how_maybe |= HAMMER2_RESOLVE_SHARED;
1467
1468         parent = *parentp;
1469
1470 again:
1471         /*
1472          * Calculate the next index and recalculate the parent if necessary.
1473          */
1474         if (chain) {
1475                 /*
1476                  * Continue iteration within current parent.  If not NULL
1477                  * the passed-in chain may or may not be locked, based on
1478                  * the LOOKUP_NOLOCK flag (passed in as returned from lookup
1479                  * or a prior next).
1480                  */
1481                 i = chain->index + 1;
1482                 if (flags & HAMMER2_LOOKUP_NOLOCK)
1483                         hammer2_chain_drop(hmp, chain);
1484                 else
1485                         hammer2_chain_unlock(hmp, chain);
1486
1487                 /*
1488                  * Any scan where the lookup returned degenerate data embedded
1489                  * in the inode has an invalid index and must terminate.
1490                  */
1491                 if (chain == parent)
1492                         return(NULL);
1493                 chain = NULL;
1494         } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT) {
1495                 /*
1496                  * We reached the end of the iteration.
1497                  */
1498                 return (NULL);
1499         } else {
1500                 /*
1501                  * Continue iteration with next parent unless the current
1502                  * parent covers the range.
1503                  */
1504                 hammer2_chain_t *nparent;
1505
1506                 scan_beg = parent->bref.key;
1507                 scan_end = scan_beg +
1508                             ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1509                 if (key_beg >= scan_beg && key_end <= scan_end)
1510                         return (NULL);
1511
1512                 i = parent->index + 1;
1513                 nparent = parent->parent;
1514                 hammer2_chain_ref(hmp, nparent);        /* ref new parent */
1515                 hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
1516                                                         /* lock new parent */
1517                 hammer2_chain_lock(hmp, nparent, how_maybe);
1518                 hammer2_chain_drop(hmp, nparent);       /* drop excess ref */
1519                 *parentp = parent = nparent;
1520         }
1521
1522 again2:
1523         /*
1524          * Locate the blockref array.  Currently we do a fully associative
1525          * search through the array.
1526          */
1527         switch(parent->bref.type) {
1528         case HAMMER2_BREF_TYPE_INODE:
1529                 base = &parent->data->ipdata.u.blockset.blockref[0];
1530                 count = HAMMER2_SET_COUNT;
1531                 break;
1532         case HAMMER2_BREF_TYPE_INDIRECT:
1533                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1534                         base = NULL;
1535                 } else {
1536                         KKASSERT(parent->data != NULL);
1537                         base = &parent->data->npdata.blockref[0];
1538                 }
1539                 count = parent->bytes / sizeof(hammer2_blockref_t);
1540                 break;
1541         case HAMMER2_BREF_TYPE_VOLUME:
1542                 base = &hmp->voldata.sroot_blockset.blockref[0];
1543                 count = HAMMER2_SET_COUNT;
1544                 break;
1545         default:
1546                 panic("hammer2_chain_next: unrecognized blockref type: %d",
1547                       parent->bref.type);
1548                 base = NULL;    /* safety */
1549                 count = 0;      /* safety */
1550                 break;
1551         }
1552         KKASSERT(i <= count);
1553
1554         /*
1555          * Look for the key.  If we are unable to find a match in the
1556          * current parent we back up to the parent's parent and resume
1557          * the scan there (the i == count case below).
1558          *
1559          * NOTE!  Deleted elements are effectively invisible.  A deleted
1560          *        element covers (makes invisible) any original media
1561          *        data.
1562          */
1563         bref = NULL;
1564         while (i < count) {
1565                 tmp = hammer2_chain_find(hmp, parent, i);
1566                 if (tmp) {
1567                         if (tmp->flags & HAMMER2_CHAIN_DELETED) {
1568                                 ++i;
1569                                 continue;
1570                         }
1571                         bref = &tmp->bref;
1572                 } else if (base == NULL || base[i].type == 0) {
1573                         ++i;
1574                         continue;
1575                 } else {
1576                         bref = &base[i];
1577                 }
1578                 scan_beg = bref->key;
1579                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1580                 if (key_beg <= scan_end && key_end >= scan_beg)
1581                         break;
1582                 ++i;
1583         }
1584
1585         /*
1586          * If we couldn't find a match recurse up a parent to continue the
1587          * search.
1588          */
1589         if (i == count)
1590                 goto again;
1591
1592         /*
1593          * Acquire the new chain element.  If the chain element is an
1594          * indirect block we must search recursively.
1595          */
1596         chain = hammer2_chain_get(hmp, parent, i, flags);
1597         if (chain == NULL)
1598                 return (NULL);
1599
1600         /*
1601          * If the chain element is an indirect block it becomes the new
1602          * parent and we loop on it.
1603          *
1604          * The parent always has to be locked with at least RESOLVE_MAYBE,
1605          * so it might need a fixup if the caller passed incompatible flags.
1606          */
1607         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1608                 hammer2_chain_unlock(hmp, parent);
1609                 *parentp = parent = chain;
1610                 chain = NULL;
1611                 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1612                         hammer2_chain_lock(hmp, parent, how_maybe);
1613                         hammer2_chain_drop(hmp, parent);        /* excess ref */
1614                 } else if (flags & HAMMER2_LOOKUP_NODATA) {
1615                         hammer2_chain_lock(hmp, parent, how_maybe);
1616                         hammer2_chain_unlock(hmp, parent);
1617                 }
1618                 i = 0;
1619                 goto again2;
1620         }
1621
1622         /*
1623          * All done, return chain
1624          */
1625         return (chain);
1626 }
1627
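/*
 * Illustrative sketch (not part of the original source): one way a caller
 * might combine hammer2_chain_lookup() and hammer2_chain_next() to visit
 * every chain overlapping a key range.  The helper name and the "visit"
 * placeholder are hypothetical.  (*parentp) must be locked and referenced
 * by the caller, may float to a different chain during the scan, and
 * remains locked on return, exactly as described above.
 */
#if 0
static int
hammer2_example_scan_range(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
                           hammer2_key_t key_beg, hammer2_key_t key_end)
{
        hammer2_chain_t *chain;
        int count = 0;

        chain = hammer2_chain_lookup(hmp, parentp, key_beg, key_end, 0);
        while (chain) {
                ++count;                /* visit chain here (chain is locked) */
                chain = hammer2_chain_next(hmp, parentp, chain,
                                           key_beg, key_end, 0);
        }
        /* chain is NULL here and (*parentp) is still locked for the caller */
        return (count);
}
#endif
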
1628 /*
1629  * Create and return a new hammer2 system memory structure of the specified
1630  * key, type and size and insert it RELATIVE TO (PARENT).
1631  *
1632  * (parent) is typically either an inode or an indirect block, acquired
1633  * as a side effect of issuing a prior failed lookup.  parent
1634  * must be locked and held.  Do not pass the inode chain to this function
1635  * unless that is the chain returned by the failed lookup.
1636  *
1637  * Non-indirect types will automatically allocate indirect blocks as required
1638  * if the new item does not fit in the current (parent).
1639  *
1640  * Indirect types will move a portion of the existing blockref array in
1641  * (parent) into the new indirect type and then use one of the free slots
1642  * to emplace the new indirect type.
1643  *
1644  * A new locked, referenced chain element is returned of the specified type.
1645  * The element may or may not have a data area associated with it:
1646  *
1647  *      VOLUME          not allowed here
1648  *      INODE           embedded data area will be set up
1649  *      INDIRECT        not allowed here
1650  *      DATA            no data area will be set up (caller is expected
1651  *                      to have logical buffers, we don't want to alias
1652  *                      the data onto device buffers!).
1653  *
1654  * Requires an exclusively locked parent.
1655  */
1656 hammer2_chain_t *
1657 hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1658                      hammer2_chain_t *chain,
1659                      hammer2_key_t key, int keybits, int type, size_t bytes,
1660                      int *errorp)
1661 {
1662         hammer2_blockref_t dummy;
1663         hammer2_blockref_t *base;
1664         hammer2_chain_t dummy_chain;
1665         int unlock_parent = 0;
1666         int allocated = 0;
1667         int count;
1668         int i;
1669
1670         KKASSERT(ccms_thread_lock_owned(&parent->cst));
1671         *errorp = 0;
1672
1673         if (chain == NULL) {
1674                 /*
1675                  * First allocate media space and construct the dummy bref,
1676                  * then allocate the in-memory chain structure.
1677                  */
1678                 bzero(&dummy, sizeof(dummy));
1679                 dummy.type = type;
1680                 dummy.key = key;
1681                 dummy.keybits = keybits;
1682                 dummy.data_off = hammer2_bytes_to_radix(bytes);
1683                 chain = hammer2_chain_alloc(hmp, &dummy);
1684                 allocated = 1;
1685
1686                 /*
1687                  * We do NOT set INITIAL here (yet).  INITIAL is only
1688                  * used for indirect blocks.
1689                  *
1690                  * Recalculate bytes to reflect the actual media block
1691                  * allocation.
1692                  */
1693                 bytes = (hammer2_off_t)1 <<
1694                         (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
1695                 chain->bytes = bytes;
1696
1697                 switch(type) {
1698                 case HAMMER2_BREF_TYPE_VOLUME:
1699                         panic("hammer2_chain_create: called with volume type");
1700                         break;
1701                 case HAMMER2_BREF_TYPE_INODE:
1702                         KKASSERT(bytes == HAMMER2_INODE_BYTES);
1703                         chain->data = (void *)&chain->u.ip->ip_data;
1704                         break;
1705                 case HAMMER2_BREF_TYPE_INDIRECT:
1706                         panic("hammer2_chain_create: cannot be used to "
1707                               "create indirect block");
1708                         break;
1709                 case HAMMER2_BREF_TYPE_DATA:
1710                 default:
1711                         /* leave chain->data NULL */
1712                         KKASSERT(chain->data == NULL);
1713                         break;
1714                 }
1715         } else {
1716                 /*
1717                  * Potentially update the chain's key/keybits.
1718                  */
1719                 chain->bref.key = key;
1720                 chain->bref.keybits = keybits;
1721         }
1722
1723 again:
1724         /*
1725          * Locate a free blockref in the parent's array
1726          */
1727         switch(parent->bref.type) {
1728         case HAMMER2_BREF_TYPE_INODE:
1729                 KKASSERT((parent->u.ip->ip_data.op_flags &
1730                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
1731                 KKASSERT(parent->data != NULL);
1732                 base = &parent->data->ipdata.u.blockset.blockref[0];
1733                 count = HAMMER2_SET_COUNT;
1734                 break;
1735         case HAMMER2_BREF_TYPE_INDIRECT:
1736                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1737                         base = NULL;
1738                 } else {
1739                         KKASSERT(parent->data != NULL);
1740                         base = &parent->data->npdata.blockref[0];
1741                 }
1742                 count = parent->bytes / sizeof(hammer2_blockref_t);
1743                 break;
1744         case HAMMER2_BREF_TYPE_VOLUME:
1745                 KKASSERT(parent->data != NULL);
1746                 base = &hmp->voldata.sroot_blockset.blockref[0];
1747                 count = HAMMER2_SET_COUNT;
1748                 break;
1749         default:
1750                 panic("hammer2_chain_create: unrecognized blockref type: %d",
1751                       parent->bref.type);
1752                 count = 0;
1753                 break;
1754         }
1755
1756         /*
1757          * Scan for an unallocated bref, also skipping any slots occupied
1758          * by in-memory chain elements that may not yet have been updated
1759          * in the parent's bref array.
1760          */
1761         bzero(&dummy_chain, sizeof(dummy_chain));
1762         for (i = 0; i < count; ++i) {
1763                 if (base == NULL) {
1764                         dummy_chain.index = i;
1765                         if (RB_FIND(hammer2_chain_tree,
1766                                     &parent->rbhead, &dummy_chain) == NULL) {
1767                                 break;
1768                         }
1769                 } else if (base[i].type == 0) {
1770                         dummy_chain.index = i;
1771                         if (RB_FIND(hammer2_chain_tree,
1772                                     &parent->rbhead, &dummy_chain) == NULL) {
1773                                 break;
1774                         }
1775                 }
1776         }
1777
1778         /*
1779          * If no free blockref could be found we must create an indirect
1780          * block and move a number of blockrefs into it.  With the parent
1781          * locked we can safely lock each child in order to move it without
1782          * causing a deadlock.
1783          *
1784          * This may return the new indirect block or the old parent depending
1785          * on where the key falls.  NULL is returned on error.  The most
1786          * typical error is EAGAIN (flush conflict during chain move).
1787          */
1788         if (i == count) {
1789                 hammer2_chain_t *nparent;
1790
1791                 nparent = hammer2_chain_create_indirect(hmp, parent,
1792                                                         key, keybits,
1793                                                         errorp);
1794                 if (nparent == NULL) {
1795                         if (allocated)
1796                                 hammer2_chain_free(hmp, chain);
1797                         chain = NULL;
1798                         goto done;
1799                 }
1800                 if (parent != nparent) {
1801                         if (unlock_parent)
1802                                 hammer2_chain_unlock(hmp, parent);
1803                         parent = nparent;
1804                         unlock_parent = 1;
1805                 }
1806                 goto again;
1807         }
1808
1809         /*
1810          * Link the chain into its parent.  Later on we will have to set
1811          * the MOVED bit in situations where we don't mark the new chain
1812          * as being modified.
1813          */
1814         if (chain->parent != NULL)
1815                 panic("hammer2: hammer2_chain_create: chain already connected");
1816         KKASSERT(chain->parent == NULL);
1817         chain->parent = parent;
1818         chain->index = i;
1819         if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
1820                 panic("hammer2_chain_create: collision");
1821         atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
1822         KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
1823         KKASSERT(parent->refs > 0);
1824         atomic_add_int(&parent->refs, 1);
1825
1826         /*
1827          * Additional linkage for inodes.  Reuse the parent pointer to
1828          * find the parent directory.
1829          *
1830          * Cumulative adjustments are inherited on [re]attach and will
1831          * propagate up the tree on the next flush.
1832          *
1833          * The ccms_inode is initialized from its parent directory.  The
1834          * chain of ccms_inode's is seeded by the mount code.
1835          */
1836         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1837                 hammer2_chain_t *scan = parent;
1838                 hammer2_inode_t *ip = chain->u.ip;
1839
1840                 while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1841                         scan = scan->parent;
1842                 if (scan->bref.type == HAMMER2_BREF_TYPE_INODE) {
1843                         ip->pip = scan->u.ip;
1844                         ip->pmp = scan->u.ip->pmp;
1845                         ip->pip->delta_icount += ip->ip_data.inode_count;
1846                         ip->pip->delta_dcount += ip->ip_data.data_count;
1847                         ++ip->pip->delta_icount;
1848                         ccms_cst_init(&ip->topo_cst, &ip->chain);
1849                 }
1850         }
1851
1852         /*
1853          * (allocated) indicates that this is a newly-created chain element
1854          * rather than a renamed chain element.  In this situation we want
1855          * to place the chain element in the MODIFIED state.
1856          *
1857          * The data area will be set up as follows:
1858          *
1859          *      VOLUME          not allowed here.
1860          *
1861          *      INODE           embedded data area will be set up.
1862          *
1863          *      INDIRECT        not allowed here.
1864          *
1865          *      DATA            no data area will be set up (caller is expected
1866          *                      to have logical buffers, we don't want to alias
1867          *                      the data onto device buffers!).
1868          */
1869         if (allocated) {
1870                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1871                         hammer2_chain_modify(hmp, chain,
1872                                              HAMMER2_MODIFY_OPTDATA);
1873                 } else if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1874                         /* not supported in this function */
1875                         panic("hammer2_chain_create: bad type");
1876                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1877                         hammer2_chain_modify(hmp, chain,
1878                                              HAMMER2_MODIFY_OPTDATA);
1879                 } else {
1880                         hammer2_chain_modify(hmp, chain, 0);
1881                 }
1882         } else {
1883                 /*
1884                  * When reconnecting inodes we have to call setsubmod()
1885                  * to ensure that its state propagates up the newly
1886                  * connected parent.
1887                  *
1888                  * Make sure MOVED is set but do not update bref_flush.  If
1889                  * the chain is undergoing modification bref_flush will be
1890                  * updated when it gets flushed.  If it is not then the
1891                  * bref may not have been flushed yet and we do not want to
1892                  * set MODIFIED here as this could result in unnecessary
1893                  * reallocations.
1894                  */
1895                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1896                         hammer2_chain_ref(hmp, chain);
1897                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1898                 }
1899                 hammer2_chain_parent_setsubmod(hmp, chain);
1900         }
1901
1902 done:
1903         if (unlock_parent)
1904                 hammer2_chain_unlock(hmp, parent);
1905         return (chain);
1906 }
1907
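/*
 * Illustrative sketch (not part of the original source): the typical calling
 * pattern for hammer2_chain_create() is a failed lookup followed by a create
 * at the same key.  The helper name is hypothetical, error handling is
 * omitted, and the HAMMER2_PBUFRADIX keybits value is an assumption taken
 * from the data-block conventions used elsewhere in the tree.
 */
#if 0
static hammer2_chain_t *
hammer2_example_lookup_or_create(hammer2_mount_t *hmp,
                                 hammer2_chain_t **parentp,
                                 hammer2_key_t lbase, size_t lblksize,
                                 int *errorp)
{
        hammer2_chain_t *chain;

        /* (*parentp) is exclusively locked/referenced and may float */
        chain = hammer2_chain_lookup(hmp, parentp, lbase, lbase,
                                     HAMMER2_LOOKUP_NODATA);
        if (chain == NULL) {
                chain = hammer2_chain_create(hmp, *parentp, NULL,
                                             lbase, HAMMER2_PBUFRADIX,
                                             HAMMER2_BREF_TYPE_DATA,
                                             lblksize, errorp);
        }
        return (chain);
}
#endif
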
1908 /*
1909  * Create an indirect block that covers one or more of the elements in the
1910  * current parent.  Either returns the existing parent with no locking or
1911  * ref changes or returns the new indirect block locked and referenced
1912  * and leaving the original parent lock/ref intact as well.
1913  *
1914  * If an error occurs, NULL is returned and *errorp is set to the error.
1915  * EAGAIN can be returned to indicate a flush collision which requires the
1916  * caller to retry.
1917  *
1918  * The returned chain depends on where the specified key falls.
1919  *
1920  * The key/keybits for the indirect mode need only follow these rules:
1921  *
1922  * (1) That all elements underneath it fit within its key space and
1923  *
1924  * (2) That all elements outside it are outside its key space.
1925  *
1926  * (3) When creating the new indirect block any elements in the current
1927  *     parent that fit within the new indirect block's keyspace must be
1928  *     moved into the new indirect block.
1929  *
1930  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
1931  *     keyspace than the current parent, but lookup/iteration rules will
1932  *     ensure (and must ensure) that rule (2) for all parents leading up
1933  *     to the nearest inode or the root volume header is adhered to.  This
1934  *     is accomplished by always recursing through matching keyspaces in
1935  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
1936  *
1937  * The current implementation calculates the current worst-case keyspace by
1938  * iterating the current parent and then divides it into two halves, choosing
1939  * whichever half has the most elements (not necessarily the half containing
1940  * the requested key).
1941  *
1942  * We can also opt to use the half with the least number of elements.  This
1943  * causes lower-numbered keys (aka logical file offsets) to recurse through
1944  * fewer indirect blocks and higher-numbered keys to recurse through more.
1945  * This also has the risk of not moving enough elements to the new indirect
1946  * block and being forced to create several indirect blocks before the element
1947  * can be inserted.
1948  *
1949  * Must be called with an exclusively locked parent.
1950  */
1951 static
1952 hammer2_chain_t *
1953 hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1954                               hammer2_key_t create_key, int create_bits,
1955                               int *errorp)
1956 {
1957         hammer2_blockref_t *base;
1958         hammer2_blockref_t *bref;
1959         hammer2_chain_t *chain;
1960         hammer2_chain_t *ichain;
1961         hammer2_chain_t dummy;
1962         hammer2_key_t key = create_key;
1963         int keybits = create_bits;
1964         int locount = 0;
1965         int hicount = 0;
1966         int count;
1967         int nbytes;
1968         int i;
1969
1970         /*
1971          * Calculate the base blockref pointer or NULL if the chain
1972          * is known to be empty.  We need to calculate the array count
1973          * for RB lookups either way.
1974          */
1975         KKASSERT(ccms_thread_lock_owned(&parent->cst));
1976         *errorp = 0;
1977
1978         hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
1979         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1980                 base = NULL;
1981
1982                 switch(parent->bref.type) {
1983                 case HAMMER2_BREF_TYPE_INODE:
1984                         count = HAMMER2_SET_COUNT;
1985                         break;
1986                 case HAMMER2_BREF_TYPE_INDIRECT:
1987                         count = parent->bytes / sizeof(hammer2_blockref_t);
1988                         break;
1989                 case HAMMER2_BREF_TYPE_VOLUME:
1990                         count = HAMMER2_SET_COUNT;
1991                         break;
1992                 default:
1993                         panic("hammer2_chain_create_indirect: "
1994                               "unrecognized blockref type: %d",
1995                               parent->bref.type);
1996                         count = 0;
1997                         break;
1998                 }
1999         } else {
2000                 switch(parent->bref.type) {
2001                 case HAMMER2_BREF_TYPE_INODE:
2002                         base = &parent->data->ipdata.u.blockset.blockref[0];
2003                         count = HAMMER2_SET_COUNT;
2004                         break;
2005                 case HAMMER2_BREF_TYPE_INDIRECT:
2006                         base = &parent->data->npdata.blockref[0];
2007                         count = parent->bytes / sizeof(hammer2_blockref_t);
2008                         break;
2009                 case HAMMER2_BREF_TYPE_VOLUME:
2010                         base = &hmp->voldata.sroot_blockset.blockref[0];
2011                         count = HAMMER2_SET_COUNT;
2012                         break;
2013                 default:
2014                         panic("hammer2_chain_create_indirect: "
2015                               "unrecognized blockref type: %d",
2016                               parent->bref.type);
2017                         count = 0;
2018                         break;
2019                 }
2020         }
2021
2022         /*
2023          * Scan for an unallocated bref, also skipping any slots occupied
2024          * by in-memory chain elements which may not yet have been updated
2025          * in the parent's bref array.
2026          */
2027         bzero(&dummy, sizeof(dummy));
2028         for (i = 0; i < count; ++i) {
2029                 int nkeybits;
2030
2031                 dummy.index = i;
2032                 chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
2033                 if (chain) {
2034                         /*
2035                          * NOTE! CHAIN_DELETED elements have to be adjusted
2036                          *       too, they cannot be ignored.
2037                          */
2038                         bref = &chain->bref;
2039                 } else if (base && base[i].type) {
2040                         bref = &base[i];
2041                 } else {
2042                         continue;
2043                 }
2044
2045                 /*
2046                  * Expand our calculated key range (key, keybits) to fit
2047                  * the scanned key.  nkeybits represents the full range
2048                  * that we will later cut in half (two halves @ nkeybits - 1).
2049                  */
2050                 nkeybits = keybits;
2051                 if (nkeybits < bref->keybits)
2052                         nkeybits = bref->keybits;
2053                 while (nkeybits < 64 &&
2054                        (~(((hammer2_key_t)1 << nkeybits) - 1) &
2055                         (key ^ bref->key)) != 0) {
2056                         ++nkeybits;
2057                 }
2058
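                /*
                 * Illustrative example (values chosen for illustration
                 * only): with key = 0x00000 and bref->key = 0x1c000,
                 * (key ^ bref->key) is 0x1c000 (bits 14-16 differ), so the
                 * loop above raises nkeybits to 17 and the combined range
                 * becomes [0x00000, 0x1ffff].
                 */
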
2059                 /*
2060                  * If the new key range is larger we have to determine
2061                  * which side of the new key range the existing keys fall
2062                  * under by checking the high bit, then collapsing the
2063          *        locount into the hicount or vice-versa.
2064                  */
2065                 if (keybits != nkeybits) {
2066                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
2067                                 hicount += locount;
2068                                 locount = 0;
2069                         } else {
2070                                 locount += hicount;
2071                                 hicount = 0;
2072                         }
2073                         keybits = nkeybits;
2074                 }
2075
2076                 /*
2077                  * The newly scanned key will be in the lower half or the
2078                  * higher half of the (new) key range.
2079                  */
2080                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
2081                         ++hicount;
2082                 else
2083                         ++locount;
2084         }
2085
2086         /*
2087          * Adjust keybits to represent half of the full range calculated
2088          * above (radix 63 max)
2089          */
2090         --keybits;
2091
2092         /*
2093          * Select whichever half contains the most elements.  Theoretically
2094          * we can select either side as long as it contains at least one
2095          * element (in order to ensure that a free slot is present to hold
2096          * the indirect block).
2097          */
2098         key &= ~(((hammer2_key_t)1 << keybits) - 1);
2099         if (hammer2_indirect_optimize) {
2100                 /*
2101                  * Insert node for least number of keys, this will arrange
2102                  * the first few blocks of a large file or the first few
2103                  * inodes in a directory with fewer indirect blocks when
2104                  * created linearly.
2105                  */
2106                 if (hicount < locount && hicount != 0)
2107                         key |= (hammer2_key_t)1 << keybits;
2108                 else
2109                         key &= ~(hammer2_key_t)1 << keybits;
2110         } else {
2111                 /*
2112                  * Insert node for most number of keys, best for heavily
2113                  * fragmented files.
2114                  */
2115                 if (hicount > locount)
2116                         key |= (hammer2_key_t)1 << keybits;
2117                 else
2118                         key &= ~(hammer2_key_t)1 << keybits;
2119         }
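
        /*
         * Illustrative example (values chosen for illustration only): if the
         * scan expanded (key, keybits) to cover [0x00000, 0xfffff] (keybits
         * 20) and most elements fall in the upper half, the decrement above
         * leaves keybits = 19 and the default (non-optimized) case sets bit
         * 19, so the new indirect block will cover [0x80000, 0xfffff].
         */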
2120
2121         /*
2122          * How big should our new indirect block be?  It has to be able to
2123          * hold at least as many blockrefs as its parent.
2124          */
2125         if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
2126                 nbytes = HAMMER2_IND_BYTES_MIN;
2127         else
2128                 nbytes = HAMMER2_IND_BYTES_MAX;
2129         if (nbytes < count * sizeof(hammer2_blockref_t))
2130                 nbytes = count * sizeof(hammer2_blockref_t);
2131
2132         /*
2133          * Ok, create our new indirect block
2134          */
2135         dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
2136         dummy.bref.key = key;
2137         dummy.bref.keybits = keybits;
2138         dummy.bref.data_off = hammer2_bytes_to_radix(nbytes);
2139         ichain = hammer2_chain_alloc(hmp, &dummy.bref);
2140         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
2141
2142         /*
2143          * Iterate the original parent and move the matching brefs into
2144          * the new indirect block.
2145          */
2146         for (i = 0; i < count; ++i) {
2147                 /*
2148                  * For keying purposes access the bref from the media or
2149                  * from our in-memory cache.  In cases where the in-memory
2150                  * cache overrides the media the keyrefs will be the same
2151                  * anyway so we can avoid checking the cache when the media
2152                  * has a key.
2153                  */
2154                 dummy.index = i;
2155                 chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
2156                 if (chain) {
2157                         /*
2158                          * NOTE! CHAIN_DELETED elements have to be adjusted
2159                          *       too, they cannot be ignored.
2160                          */
2161                         bref = &chain->bref;
2162                 } else if (base && base[i].type) {
2163                         bref = &base[i];
2164                 } else {
2165                         if (ichain->index < 0)
2166                                 ichain->index = i;
2167                         continue;
2168                 }
2169
2170                 /*
2171                  * Skip keys not in the chosen half (low or high), only bit
2172                  * (keybits - 1) needs to be compared but for safety we
2173                  * will compare all msb bits plus that bit again.
2174                  */
2175                 if ((~(((hammer2_key_t)1 << keybits) - 1) &
2176                     (key ^ bref->key)) != 0) {
2177                         continue;
2178                 }
2179
2180                 /*
2181                  * This element is being moved from the parent, its slot
2182                  * is available for our new indirect block.
2183                  */
2184                 if (ichain->index < 0)
2185                         ichain->index = i;
2186
2187                 /*
2188                  * Load the new indirect block by acquiring or allocating
2189                  * the related chain entries, then simply move them to the
2190                  * new parent (ichain).  We cannot move chains which are
2191                  * undergoing flushing and will break out of the loop in
2192                  * that case.
2193                  *
2194                  * When adjusting the parent/child relationship we must
2195                  * set the MOVED bit but we do NOT update bref_flush
2196                  * because otherwise we might synchronize a bref that has
2197                  * not yet been flushed.  We depend on chain's bref_flush
2198                  * either being correct or the chain being in a MODIFIED
2199                  * state.
2200                  *
2201                  * We do not want to set MODIFIED here as this would result
2202                  * in unnecessary reallocations.
2203                  *
2204                  * We must still set SUBMODIFIED in the parent but we do
2205                  * that after the loop.
2206                  *
2207                  * WARNING! chain->cst.spin must be held when chain->parent is
2208                  *          modified, even though we own the full blown lock,
2209                  *          to deal with setsubmod and rename races.
2210                  */
2211                 chain = hammer2_chain_get(hmp, parent, i,
2212                                           HAMMER2_LOOKUP_NODATA);
2213                 if (chain->flushing) {
2214                         hammer2_chain_unlock(hmp, chain);
2215                         break;
2216                 }
2217
2218                 spin_lock(&chain->cst.spin);
2219                 RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
2220                 if (RB_INSERT(hammer2_chain_tree, &ichain->rbhead, chain))
2221                         panic("hammer2_chain_create_indirect: collision");
2222                 chain->parent = ichain;
2223                 spin_unlock(&chain->cst.spin);
2224
2225                 if (base)
2226                         bzero(&base[i], sizeof(base[i]));
2227                 atomic_add_int(&parent->refs, -1);
2228                 atomic_add_int(&ichain->refs, 1);
2229                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2230                         hammer2_chain_ref(hmp, chain);
2231                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2232                 }
2233                 hammer2_chain_unlock(hmp, chain);
2234                 KKASSERT(parent->refs > 0);
2235                 chain = NULL;
2236         }
2237
2238         /*
2239          * If we hit a chain that is undergoing flushing we're screwed and
2240          * we have to undo the whole mess.  Since ichain has not been linked
2241          * in yet, the moved chains are not reachable and will not have been
2242          * disposed of.
2243          *
2244          * WARNING! This code is pretty hairy because the flusher is sitting
2245          *          on the parent processing one of the children that we
2246          *          haven't yet moved, and will do a RB_NEXT loop on that
2247          *          child.  So the children we're moving back have to be
2248          *          returned to the same place in the iteration that they
2249          *          were removed from.
2250          */
2251         if (i != count) {
2252                 kprintf("hammer2_chain_create_indirect: EAGAIN\n");
2253                 *errorp = EAGAIN;
2254                 while ((chain = RB_ROOT(&ichain->rbhead)) != NULL) {
2255                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_NEVER);
2256                         KKASSERT(chain->flushing == 0);
2257                         RB_REMOVE(hammer2_chain_tree, &ichain->rbhead, chain);
2258                         if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
2259                                 panic("hammer2_chain_create_indirect: collision");
2260                         chain->parent = parent;
2261                         atomic_add_int(&parent->refs, 1);
2262                         atomic_add_int(&ichain->refs, -1);
2263                         /* MOVED bit might have been inherited, cannot undo */
2264                         hammer2_chain_unlock(hmp, chain);
2265                 }
2266                 hammer2_chain_free(hmp, ichain);
2267                 return(NULL);
2268         }
2269
2270         /*
2271          * Insert the new indirect block into the parent now that we've
2272          * cleared out some entries in the parent.  We calculated a good
2273          * insertion index in the loop above (ichain->index).
2274          *
2275          * We don't have to set MOVED here because we mark ichain modified
2276          * down below (so the normal modified -> flush -> set-moved sequence
2277          * applies).
2278          */
2279         KKASSERT(ichain->index >= 0);
2280         if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, ichain))
2281                 panic("hammer2_chain_create_indirect: ichain insertion");
2282         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_ONRBTREE);
2283         ichain->parent = parent;
2284         atomic_add_int(&parent->refs, 1);
2285
2286         /*
2287          * Mark the new indirect block modified after insertion, which
2288          * will propagate up through parent all the way to the root and
2289          * also allocate the physical block in ichain for our caller,
2290          * and assign ichain->data to a pre-zero'd space (because there
2291          * is no prior data to copy into it).
2292          *
2293          * We have to set SUBMODIFIED in ichain's flags manually so the
2294          * flusher knows it has to recurse through it to get to all of
2295          * our moved blocks, then call setsubmod() to set the bit
2296          * recursively.
2297          */
2298         hammer2_chain_modify(hmp, ichain, HAMMER2_MODIFY_OPTDATA);
2299         hammer2_chain_parent_setsubmod(hmp, ichain);
2300         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2301
2302         /*
2303          * Figure out what to return.
2304          */
2305         if (create_bits > keybits) {
2306                 /*
2307                  * Key being created is way outside the key range,
2308                  * return the original parent.
2309                  */
2310                 hammer2_chain_unlock(hmp, ichain);
2311         } else if (~(((hammer2_key_t)1 << keybits) - 1) &
2312                    (create_key ^ key)) {
2313                 /*
2314                  * Key being created is outside the key range,
2315                  * return the original parent.
2316                  */
2317                 hammer2_chain_unlock(hmp, ichain);
2318         } else {
2319                 /*
2320          * Otherwise it is in the range, return the new parent.
2321                  * (leave both the new and old parent locked).
2322                  */
2323                 parent = ichain;
2324         }
2325
2326         return(parent);
2327 }
2328
2329 /*
2330  * Physically delete the specified chain element.  Note that inodes with
2331  * open descriptors should not be deleted (as with other filesystems) until
2332  * the last open descriptor is closed.
2333  *
2334  * This routine will remove the chain element from its parent and potentially
2335  * also recurse upward and delete indirect blocks which become empty as a
2336  * side effect.
2337  *
2338  * The caller must pass a pointer to the chain's parent, also locked and
2339  * referenced.  (*parentp) will be modified in a manner similar to a lookup
2340  * or iteration when indirect blocks are also deleted as a side effect.
2341  *
2342  * XXX This currently does not adhere to the MOVED flag protocol in that
2343  *     the removal is immediately indicated in the parent's blockref[]
2344  *     array.
2345  *
2346  * Must be called with an exclusively locked parent and chain.
2347  */
2348 void
2349 hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
2350                      hammer2_chain_t *chain, int retain)
2351 {
2352         hammer2_blockref_t *base;
2353         hammer2_inode_t *ip;
2354         int count;
2355
2356         if (chain->parent != parent)
2357                 panic("hammer2_chain_delete: parent mismatch");
2358         KKASSERT(ccms_thread_lock_owned(&parent->cst));
2359
2360         /*
2361          * Mark the parent modified so our base[] pointer remains valid
2362          * while we move entries.  For the optimized indirect block
2363          * case mark the parent moved instead.
2364          *
2365          * Calculate the blockref reference in the parent
2366          */
2367         switch(parent->bref.type) {
2368         case HAMMER2_BREF_TYPE_INODE:
2369                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2370                 base = &parent->data->ipdata.u.blockset.blockref[0];
2371                 count = HAMMER2_SET_COUNT;
2372                 break;
2373         case HAMMER2_BREF_TYPE_INDIRECT:
2374                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA |
2375                                                   HAMMER2_MODIFY_NO_MODIFY_TID);
2376                 if (parent->flags & HAMMER2_CHAIN_INITIAL)
2377                         base = NULL;
2378                 else
2379                         base = &parent->data->npdata.blockref[0];
2380                 count = parent->bytes / sizeof(hammer2_blockref_t);
2381                 break;
2382         case HAMMER2_BREF_TYPE_VOLUME:
2383                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2384                 base = &hmp->voldata.sroot_blockset.blockref[0];
2385                 count = HAMMER2_SET_COUNT;
2386                 break;
2387         default:
2388                 panic("hammer2_chain_delete: unrecognized blockref type: %d",
2389                       parent->bref.type);
2390                 count = 0;
2391                 break;
2392         }
2393         KKASSERT(chain->index >= 0 && chain->index < count);
2394
2395         /*
2396          * We may not be able to immediately disconnect the chain if a
2397          * flush is in progress.  If retain is non-zero we MUST disconnect
2398          * the chain now and callers are responsible for making sure that
2399          * flushing is zero.
2400          */
2401         spin_lock(&chain->cst.spin);
2402         if ((retain || chain->flushing == 0) &&
2403             (chain->flags & HAMMER2_CHAIN_ONRBTREE)) {
2404                 if (base)
2405                         bzero(&base[chain->index], sizeof(*base));
2406                 KKASSERT(chain->flushing == 0);
2407                 RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
2408                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
2409                 atomic_add_int(&parent->refs, -1);   /* for red-black entry */
2410                 chain->index = -1;
2411                 chain->parent = NULL;
2412         }
2413         spin_unlock(&chain->cst.spin);
2414
2415         /*
2416          * Cumulative adjustments must be propagated to the parent inode
2417          * when deleting and synchronized to ip.  This occurs even if we
2418          * cannot detach the chain from its parent.
2419          *
2420          * NOTE:  We do not propagate ip->delta_*count to the parent because
2421          *        these represent adjustments that have not yet been
2422          *        propagated upward, so we don't need to remove them from
2423          *        the parent.
2424          *
2425          * Clear the pointer to the parent inode.
2426          */
2427         if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
2428             chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2429                 ip = chain->u.ip;
2430                 if (ip->pip) {
2431                         /* XXX SMP, pip chain not necessarily parent chain */
2432                         ip->pip->delta_icount -= ip->ip_data.inode_count;
2433                         ip->pip->delta_dcount -= ip->ip_data.data_count;
2434                         ip->ip_data.inode_count += ip->delta_icount;
2435                         ip->ip_data.data_count += ip->delta_dcount;
2436                         ip->delta_icount = 0;
2437                         ip->delta_dcount = 0;
2438                         --ip->pip->delta_icount;
2439                         spin_lock(&chain->cst.spin); /* XXX */
2440                         ip->pip = NULL;
2441                         spin_unlock(&chain->cst.spin);
2442                 }
2443         }
2444
2445         /*
2446          * If retain is 0 the deletion is permanent.  Because the chain is
2447          * no longer connected to the topology a flush will have no
2448          * visibility into it.  We must dispose of the references related
2449          * to the MODIFIED and MOVED flags, otherwise the ref count will
2450          * never transition to 0.
2451          *
2452          * If retain is non-zero the deleted element is likely an inode
2453          * which the vnops frontend will mark DESTROYED and flush.  In that
2454          * situation we must retain the flags for any open file descriptors
2455          * on the (removed) inode.  The final close will destroy the
2456          * disconnected chain.
2457          */
2458         if (retain == 0) {
2459                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
2460                 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
2461                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
2462                         hammer2_chain_drop(hmp, chain);
2463                 }
2464                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2465                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2466                         hammer2_chain_drop(hmp, chain);
2467                 }
2468         }
2469
2470         /*
2471          * The chain is still likely referenced, possibly even by a vnode
2472          * (if an inode), so defer further action until the chain gets
2473          * dropped.
2474          */
2475 }
2476
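/*
 * Illustrative sketch (not part of the original source): a typical removal
 * is a lookup followed by a delete and unlock.  The helper name and the key
 * value are hypothetical; the caller holds (*parentp) exclusively locked
 * and referenced, and it may float during the lookup as described above.
 */
#if 0
static void
hammer2_example_remove(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
                       hammer2_key_t lhc)
{
        hammer2_chain_t *chain;

        chain = hammer2_chain_lookup(hmp, parentp, lhc, lhc, 0);
        if (chain) {
                hammer2_chain_delete(hmp, *parentp, chain, 0);
                hammer2_chain_unlock(hmp, chain);
        }
}
#endif
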
2477 /*
2478  * Recursively flush the specified chain.  The chain is locked and
2479  * referenced by the caller and will remain so on return.  The chain
2480  * will remain referenced throughout but can temporarily lose its
2481  * lock during the recursion to avoid unnecessarily stalling user
2482  * processes.
2483  */
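/*
 * Per-flush state passed down the recursion:
 *
 *      flush_list      chains deferred because the recursion depth limit
 *                      was hit; the controller of this structure drains
 *                      the list and re-flushes them.
 *      depth           current recursion depth.
 *      modify_tid      transaction id associated with this flush pass
 *                      (presumably stamped into the brefs being
 *                      synchronized).
 */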
2484 struct hammer2_flush_info {
2485         struct flush_deferral_list flush_list;
2486         int             depth;
2487         hammer2_tid_t   modify_tid;
2488 };
2489
2490 typedef struct hammer2_flush_info hammer2_flush_info_t;
2491
2492 static __inline
2493 void
2494 hammer2_saved_child_cleanup(hammer2_mount_t *hmp,
2495                             hammer2_chain_t *parent, hammer2_chain_t *child)
2496 {
2497         atomic_add_int(&child->flushing, -1);
2498         if (child->flushing == 0 && (child->flags & HAMMER2_CHAIN_DELETED)) {
2499                 kprintf("hammer2: fixup deferred deleted child\n");
2500                 hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_MAYBE);
2501                 hammer2_chain_delete(hmp, parent, child, 0);
2502                 hammer2_chain_unlock(hmp, child);
2503         }
2504 }
2505
2506 static void
2507 hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
2508                           hammer2_flush_info_t *info)
2509 {
2510         hammer2_blockref_t *bref;
2511         hammer2_off_t pbase;
2512         size_t bbytes;
2513         size_t boff;
2514         char *bdata;
2515         struct buf *bp;
2516         int error;
2517         int wasmodified;
2518
2519         /*
2520          * If we hit the stack recursion depth limit defer the operation.
2521          * The controller of the info structure will execute the deferral
2522          * list and then retry.
2523          *
2524          * This is only applicable if SUBMODIFIED is set.  After a reflush
2525          * SUBMODIFIED will probably be cleared and we want to drop through
2526          * to finish processing the current element so our direct parent
2527          * can process the results.
2528          */
2529         if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT &&
2530             (chain->flags & HAMMER2_CHAIN_SUBMODIFIED)) {
2531                 if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
2532                         hammer2_chain_ref(hmp, chain);
2533                         TAILQ_INSERT_TAIL(&info->flush_list,
2534                                           chain, flush_node);
2535                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
2536                 }
2537                 return;
2538         }
2539
2540         if (hammer2_debug & 0x0008)
2541                 kprintf("%*.*sCHAIN type=%d@%08jx %p/%d %04x {\n",
2542                         info->depth, info->depth, "",
2543                         chain->bref.type, chain->bref.data_off,
2544                         chain, chain->refs, chain->flags);
2545
2546         /*
2547          * If SUBMODIFIED is set we recurse the flush and adjust the
2548          * blockrefs accordingly.
2549          *
2550          * NOTE: Looping on SUBMODIFIED can prevent a flush from ever
2551          *       finishing in the face of filesystem activity.
2552          */
2553         if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
2554                 hammer2_chain_t *child;
2555                 hammer2_chain_t *saved;
2556                 hammer2_blockref_t *base;
2557                 int count;
2558
2559                 /*
2560                  * Clear SUBMODIFIED to catch races.  Note that if any
2561                  * child has to be flushed SUBMODIFIED will wind up being
2562                  * set again (for next time), but this does not stop us from
2563                  * synchronizing block updates which occurred.
2564                  *
2565                  * We don't want to set our chain to MODIFIED gratuitously.
2566                  *
2567                  * We need an extra ref on chain because we are going to
2568                  * release its lock temporarily in our child loop.
2569                  */
2570                 /* XXX SUBMODIFIED not interlocked, can race */
2571                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2572                 hammer2_chain_ref(hmp, chain);
2573
2574                 /*
2575                  * Flush the children and update the blockrefs in the chain.
2576                  * Be careful of ripouts during the loop.
2577                  *
2578                  * The flushing counter prevents ripouts on lastdrop and
2579                  * also prevents moves (causes renames to sleep/retry).
2580                  * Be very careful with it.
2581                  */
2582                 RB_FOREACH(child, hammer2_chain_tree, &chain->rbhead) {
2583                         KASSERT(child->parent == chain,
2584                                 ("hammer2_flush: child->parent mismatch %p/%p",
2585                                  child->parent, chain));
2586
2587                         /*
2588                          * We only recurse if SUBMODIFIED (internal node)
2589                          * or MODIFIED (internal node or leaf) is set.
2590                          * However, we must still track whether any MOVED
2591                          * entries are present to determine if the chain's
2592                          * blockrefs need updating or not.
2593                          */
2594                         if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2595                                              HAMMER2_CHAIN_MODIFIED |
2596                                             HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2597                                 continue;
2598                         }
2599
2600                         /*
2601                          * The child's flushing count can only be adjusted
2602                          * while its parent is locked; while non-zero it
2603                          * prevents destruction/removal of the child from
2604                          * the parent's RB tree, allowing us to temporarily
2605                          * unlock the parent.
2606                          *
2607                          * To unwind, we must hold the parent locked before
2608                          * decrementing flushing to prevent child corruption.
2609                          */
2610                         atomic_add_int(&child->flushing, 1);
2611                         hammer2_chain_unlock(hmp, chain);
2612                         hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_MAYBE);
2613                         KASSERT(child->parent == chain,
2614                                 ("hammer2_flush: child->parent mismatch %p/%p",
2615                                  child->parent, chain));
2616                         if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2617                                              HAMMER2_CHAIN_MODIFIED |
2618                                             HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2619                                 hammer2_chain_unlock(hmp, child);
2620                                 hammer2_chain_lock(hmp, chain,
2621                                                    HAMMER2_RESOLVE_ALWAYS);
2622                                 KKASSERT(child->parent == chain);
2623                                 atomic_add_int(&child->flushing, -1);
2624                                 continue;
2625                         }
2626
2627                         /*
2628                          * Propagate the DESTROYED flag if found set, then
2629                          * recurse the flush.
2630                          */
2631                         if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
2632                             (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
2633                                 atomic_set_int(&child->flags,
2634                                                HAMMER2_CHAIN_DESTROYED |
2635                                                HAMMER2_CHAIN_SUBMODIFIED);
2636                         }
2637                         ++info->depth;
2638                         hammer2_chain_flush_pass1(hmp, child, info);
2639                         --info->depth;
2640                         hammer2_chain_unlock(hmp, child);
2641
2642                         /*
2643                          * Always resolve when relocking the parent.
2644                          */
2645                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_ALWAYS);
2646                         KASSERT(child->parent == chain,
2647                                 ("hammer2_flush: child->parent mismatch %p/%p",
2648                                  child->parent, chain));
2649                         atomic_add_int(&child->flushing, -1);
2650                 }
2651
2652                 /*
2653                  * Now synchronize any block updates and handle any
2654                  * chains marked DELETED.
2655                  *
2656                  * The flushing counter prevents ripouts on lastdrop and
2657                  * also prevents moves (causes renames to sleep/retry).
2658                  * Be very careful with it.
2659                  */
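                     /*
                      * 'saved' carries the previous child's flushing count
                      * across iterations so the child cannot be ripped out
                      * from under us; hammer2_saved_child_cleanup() releases
                      * the hold.
                      */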
2660                 saved = NULL;
2661                 RB_FOREACH(child, hammer2_chain_tree, &chain->rbhead) {
2662                         if ((child->flags & (HAMMER2_CHAIN_MOVED |
2663                                              HAMMER2_CHAIN_DELETED)) == 0) {
2664                                 continue;
2665                         }
2666                         atomic_add_int(&child->flushing, 1);
2667                         if (saved) {
2668                                 hammer2_saved_child_cleanup(hmp, chain, saved);
2669                                 saved = NULL;
2670                         }
2671                         saved = child;
2672                         hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_NEVER);
2673                         KKASSERT(child->parent == chain);
2674                         if ((child->flags & (HAMMER2_CHAIN_MOVED |
2675                                              HAMMER2_CHAIN_DELETED)) == 0) {
2676                                 hammer2_chain_unlock(hmp, child);
2677                                 continue;
2678                         }
2679                         if (child->flags & HAMMER2_CHAIN_MOVED) {
2680                                 hammer2_chain_modify(hmp, chain,
2681                                              HAMMER2_MODIFY_NO_MODIFY_TID);
2682                         }
2683
2684                         switch(chain->bref.type) {
2685                         case HAMMER2_BREF_TYPE_INODE:
2686                                 KKASSERT((chain->data->ipdata.op_flags &
2687                                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
2688                                 base = &chain->data->ipdata.u.blockset.
2689                                         blockref[0];
2690                                 count = HAMMER2_SET_COUNT;
2691                                 break;
2692                         case HAMMER2_BREF_TYPE_INDIRECT:
2693                                 if (chain->data) {
2694                                         base = &chain->data->npdata.blockref[0];
2695                                 } else {
2696                                         base = NULL;
2697                                         KKASSERT(child->flags &
2698                                                  HAMMER2_CHAIN_DELETED);
2699                                 }
2700                                 count = chain->bytes /
2701                                         sizeof(hammer2_blockref_t);
2702                                 break;
2703                         case HAMMER2_BREF_TYPE_VOLUME:
2704                                 base = &hmp->voldata.sroot_blockset.blockref[0];
2705                                 count = HAMMER2_SET_COUNT;
2706                                 break;
2707                         default:
2708                                 base = NULL;
2709                                 panic("hammer2_chain_flush_pass1: "
2710                                       "unrecognized blockref type: %d",
2711                                       chain->bref.type);
2712                         }
2713
2714                         KKASSERT(child->index >= 0);
2715
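                             /*
                              * Propagate the highest child mirror_tid into
                              * this chain's bref, and into the volume header
                              * if this chain is the volume root.
                              */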
2716                         if (chain->bref.mirror_tid <
2717                             child->bref_flush.mirror_tid) {
2718                                 chain->bref.mirror_tid =
2719                                         child->bref_flush.mirror_tid;
2720                         }
2721                         if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME &&
2722                             hmp->voldata.mirror_tid <
2723                             child->bref_flush.mirror_tid) {
2724                                 hmp->voldata.mirror_tid =
2725                                         child->bref_flush.mirror_tid;
2726                         }
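                             /*
                              * A DELETED child leaves a zero'd (empty) slot
                              * in the parent's blockref array; otherwise the
                              * child's flushed bref is copied into its slot.
                              */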
2727                         if (child->flags & HAMMER2_CHAIN_DELETED) {
2728                                 bzero(&child->bref_flush,
2729                                       sizeof(child->bref_flush));
2730                         }
2731                         if (base)
2732                                 base[child->index] = child->bref_flush;
2733                         if (child->flags & HAMMER2_CHAIN_MOVED) {
2734                                 atomic_clear_int(&child->flags,
2735                                                  HAMMER2_CHAIN_MOVED);
2736                                 hammer2_chain_drop(hmp, child); /* flag */
2737                         }
2738                         hammer2_chain_unlock(hmp, child);
2739                 }
2740                 if (saved) {
2741                         hammer2_saved_child_cleanup(hmp, chain, saved);
2742                         saved = NULL;
2743                 }
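                     /* drop the extra ref acquired before the child loops */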
2744                 hammer2_chain_drop(hmp, chain);
2745         }
2746
2747         /*
2748          * If destroying the object we unconditionally clear the MODIFIED
2749          * and MOVED bits, and we destroy the buffer without writing it
2750          * out.
2751          *
2752          * We don't bother updating the hash/crc or the chain bref.
2753          *
2754          * NOTE: The destroyed object's bref has already been updated,
2755          *       so we can clear MOVED without propagating mirror_tid
2756          *       or modify_tid upward.
2757          *
2758          * XXX allocations for unflushed data can be returned to the
2759          *     free pool.
2760          */
2761         if (chain->flags & HAMMER2_CHAIN_DESTROYED) {
2762                 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
2763                         if (chain->bp) {
2764                                 chain->bp->b_flags |= B_INVAL|B_RELBUF;
2765                         }
2766                         atomic_clear_int(&chain->flags,
2767                                          HAMMER2_CHAIN_MODIFIED |
2768                                          HAMMER2_CHAIN_MODIFY_TID);
2769                         hammer2_chain_drop(hmp, chain);
2770                 }
2771                 if (chain->flags & HAMMER2_CHAIN_MODIFIED_AUX) {
2772                         atomic_clear_int(&chain->flags,
2773                                          HAMMER2_CHAIN_MODIFIED_AUX);
2774                 }
2775                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2776                         atomic_clear_int(&chain->flags,
2777                                          HAMMER2_CHAIN_MOVED);
2778                         hammer2_chain_drop(hmp, chain);
2779                 }
2780                 return;
2781         }
2782
2783         /*
2784          * Flush this chain entry only if it is marked modified.
2785          */
2786         if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
2787                              HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2788                 goto done;
2789         }
2790
2791         /*
2792          * Synchronize cumulative data and inode count adjustments to
2793          * the inode and propagate the deltas upward to the parent.
2794          */
2795         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2796                 hammer2_inode_t *ip;
2797
2798                 ip = chain->u.ip;
2799                 ip->ip_data.inode_count += ip->delta_icount;
2800                 ip->ip_data.data_count += ip->delta_dcount;
2801                 if (ip->pip) {
2802                         ip->pip->delta_icount += ip->delta_icount;
2803                         ip->pip->delta_dcount += ip->delta_dcount;
2804                 }
2805                 ip->delta_icount = 0;
2806                 ip->delta_dcount = 0;
2807         }
2808
2809         /*
2810          * Flush if MODIFIED or MODIFIED_AUX is set.  MODIFIED_AUX is only
2811          * used by the volume header (&hmp->vchain).
2812          */
2813         if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
2814                              HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2815                 goto done;
2816         }
2817         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED_AUX);
2818
2819         /*
2820          * Clear MODIFIED and set HAMMER2_CHAIN_MOVED.  The caller
2821          * will re-test the MOVED bit.  We must also update the mirror_tid
2822          * and modify_tid fields as appropriate.
2823          *
2824          * The MODIFIED bit owns a single chain ref and the MOVED bit
2825          * owns its own chain ref.
2826          */
2827         chain->bref.mirror_tid = info->modify_tid;
2828         if (chain->flags & HAMMER2_CHAIN_MODIFY_TID)
2829                 chain->bref.modify_tid = info->modify_tid;
2830         wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0;
2831         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
2832                                         HAMMER2_CHAIN_MODIFY_TID);
2833
2834         if (chain->flags & HAMMER2_CHAIN_MOVED) {
2835                 /*
2836                  * Drop the ref from the MODIFIED bit we cleared.
2837                  */
2838                 if (wasmodified)
2839                         hammer2_chain_drop(hmp, chain);
2840         } else {
2841                 /*
2842                  * If we were MODIFIED we inherit the ref from clearing
2843                  * that bit, otherwise we need another ref.
2844                  */
2845                 if (wasmodified == 0)
2846                         hammer2_chain_ref(hmp, chain);
2847                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2848         }
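             /*
              * Snapshot the updated bref.  The parent copies bref_flush into
              * its blockref array when it processes this chain's MOVED bit.
              */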
2849         chain->bref_flush = chain->bref;
2850
2851         /*
2852          * If this is part of a recursive flush we can go ahead and write
2853          * out the buffer cache buffer and pass a new bref back up the chain.
2854          *
2855          * This will never be a volume header.
2856          * A volume header is never written out here; the syncer flushes it.
2857         switch(chain->bref.type) {
2858         case HAMMER2_BREF_TYPE_VOLUME:
2859                 /*
2860                  * The volume header is flushed manually by the syncer, not
2861                  * here.
2862                  */
2863                 KKASSERT(chain->data != NULL);
2864                 KKASSERT(chain->bp == NULL);
2865                 kprintf("volume header mirror_tid %jd\n",
2866                         hmp->voldata.mirror_tid);
2867
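                     /*
                      * Recompute the volume header's section CRCs and the
                      * overall header CRC, then snapshot voldata into
                      * volsync for the syncer to write out.
                      */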
2868                 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
2869                         hammer2_icrc32(
2870                                 (char *)&hmp->voldata +
2871                                  HAMMER2_VOLUME_ICRC1_OFF,
2872                                 HAMMER2_VOLUME_ICRC1_SIZE);
2873                 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
2874                         hammer2_icrc32(
2875                                 (char *)&hmp->voldata +
2876                                  HAMMER2_VOLUME_ICRC0_OFF,
2877                                 HAMMER2_VOLUME_ICRC0_SIZE);
2878                 hmp->voldata.icrc_volheader =
2879                         hammer2_icrc32(
2880                                 (char *)&hmp->voldata +
2881                                  HAMMER2_VOLUME_ICRCVH_OFF,
2882                                 HAMMER2_VOLUME_ICRCVH_SIZE);
2883                 hmp->volsync = hmp->voldata;
2884                 break;
2885         case HAMMER2_BREF_TYPE_DATA:
2886                 /*
2887                  * Data elements have already been flushed via the logical
2888                  * file buffer cache.  Their hash was set in the bref by
2889                  * the vop_write code.
2890                  *
2891                  * Make sure the buffer(s) have been flushed out here.
2892                  */
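                     /*
                      * Locate the device buffer underlying this data block.
                      * If it is still present and dirty issue an asynchronous
                      * clustered write, otherwise just release it.
                      */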
2893                 bbytes = chain->bytes;
2894                 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
2895                 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
2896
2897                 bp = getblk(hmp->devvp, pbase, bbytes, GETBLK_NOWAIT, 0);
2898                 if (bp) {
2899                         if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
2900                             (B_CACHE | B_DIRTY)) {
2901                                 kprintf("x");
2902                                 cluster_awrite(bp);
2903                         } else {
2904                                 bp->b_flags |= B_RELBUF;
2905                                 brelse(bp);
2906                         }
2907                 }
2908                 break;
2909         case HAMMER2_BREF_TYPE_INDIRECT:
2910                 /*
2911                  * Indirect blocks may be in an INITIAL state.  Use the
2912                  * chain_lock() call to ensure that the buffer has been
2913                  * instantiated (even though the chain is already locked,
2914                  * its buffer might not have been instantiated yet).
2915                  *
2916                  * Only write the buffer out if it is dirty; the operating
2917                  * system may have already written it out.
2918                  */
2919                 hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_ALWAYS);
2920                 KKASSERT(chain->bp != NULL);
2921
2922                 bp = chain->bp;
2923                 if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) ||
2924                     (bp->b_flags & B_DIRTY)) {
2925                         bdwrite(chain->bp);
2926                 } else {
2927                         brelse(chain->bp);
2928                 }
2929                 chain->bp = NULL;
2930                 chain->data = NULL;
2931                 hammer2_chain_unlock(hmp, chain);
2932                 break;
2933         default:
2934                 /*
2935                  * Embedded elements have to be flushed out.
2936                  */
2937                 KKASSERT(chain->data != NULL);
2938                 KKASSERT(chain->bp == NULL);
2939                 bref = &chain->bref;
2940
2941                 KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
2942
2943                 if (chain->bp == NULL) {
2944                         /*
2945                          * The data is embedded; we have to acquire the
2946                          * buffer cache buffer and copy the data into it.
2947                          */
2948                         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
2949                                 bbytes = HAMMER2_MINIOSIZE;
2950                         pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
2951                         boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
2952
2953                         /*
2954                          * The getblk() optimization can only be used if the
2955                          * physical block size matches the request.
2956                          */
2957                         if (chain->bytes == bbytes) {
2958                                 bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
2959                                 error = 0;
2960                         } else {
2961                                 error = bread(hmp->devvp, pbase, bbytes, &bp);
2962                                 KKASSERT(error == 0);
2963                         }
2964                         bdata = (char *)bp->b_data + boff;
2965
2966                         /*
2967                          * Copy the data to the buffer, mark the buffer
2968                          * dirty, and convert the chain to unmodified.
2969                          */
2970                         bcopy(chain->data, bdata, chain->bytes);
2971                         bp->b_flags |= B_CLUSTEROK;
2972                         bdwrite(bp);
2973                         bp = NULL;
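                             /*
                              * Update the check code in the bref to cover
                              * the data we just flushed.
                              */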
2974                         chain->bref.check.iscsi32.value =
2975                                 hammer2_icrc32(chain->data, chain->bytes);
2976                         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
2977                                 ++hammer2_iod_meta_write;
2978                         else
2979                                 ++hammer2_iod_indr_write;
2980                 } else {
2981                         chain->bref.check.iscsi32.value =
2982                                 hammer2_icrc32(chain->data, chain->bytes);
2983                 }
2984         }
2985 done:
2986         if (hammer2_debug & 0x0008) {
2987                 kprintf("%*.*s} %p/%d %04x ",
2988                         info->depth, info->depth, "",
2989                         chain, chain->refs, chain->flags);
2990         }
2991 }
2992
2993 #if 0
2994 /*
2995  * PASS2 - not yet implemented (should be called only with the root chain?)
2996  */
2997 static void
2998 hammer2_chain_flush_pass2(hammer2_mount_t *hmp, hammer2_chain_t *chain)
2999 {
3000 }
3001 #endif
3002
3003 /*
3004  * Stand-alone flush.  If the chain is unable to completely flush we have
3005  * to be sure that SUBMODIFIED propagates up the parent chain.  We must not
3006  * clear the MOVED bit after flushing in this situation or our desynchronized
3007  * bref will not properly update in the parent.
3008  *
3009  * This routine can be called from several places but the most important
3010  * is from the hammer2_vop_reclaim() function.  We want to try to completely
3011  * clean out the inode structure to prevent disconnected inodes from
3012  * building up and blowing out the kmalloc pool.
3013  *
3014  * If modify_tid is 0 (usual case), a new modify_tid is allocated and
3015  * applied to the flush.  The depth-limit handling code is the only
3016  * code which passes a non-zero modify_tid to hammer2_chain_flush().
3017  *
3018  * chain is locked on call and will remain locked on return.
3019  */
3020 void
3021 hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
3022                     hammer2_tid_t modify_tid)
3023 {
3024         hammer2_chain_t *parent;
3025         hammer2_chain_t *scan;
3026         hammer2_blockref_t *base;
3027         hammer2_flush_info_t info;
3028         int count;
3029         int reflush;
3030
3031         /*
3032          * Execute the recursive flush and handle deferrals.
3033          *
3034          * Chains can be ridiculously long (thousands deep), so to
3035          * avoid blowing out the kernel stack the recursive flush has a
3036          * depth limit.  Elements at the limit are placed on a list
3037          * for re-execution after the stack has been popped.
3038          */
3039         bzero(&info, sizeof(info));
3040         TAILQ_INIT(&info.flush_list);
3041
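             /*
              * Allocate a fresh transaction id from the volume header under
              * the voldata lock and flag the volume chain MODIFIED_AUX so
              * the flush picks up the voldata change.
              */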
3042         if (modify_tid == 0) {
3043                 hammer2_voldata_lock(hmp);
3044                 info.modify_tid = hmp->voldata.alloc_tid++;
3045                 atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
3046                 hammer2_voldata_unlock(hmp);
3047         } else {
3048                 info.modify_tid = modify_tid;
3049         }
3050         reflush = 1;
3051
3052         while (reflush) {
3053                 /*
3054                  * Primary recursion
3055                  */
3056                 hammer2_chain_flush_pass1(hmp, chain, &info);
3057                 reflush = 0;
3058
3059                 while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
3060                         /*
3061                          * Secondary recursion.  Note that a reference is
3062                          * retained from the element's presence on the
3063                          * deferral list.
3064                          */
3065                         KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
3066                         TAILQ_REMOVE(&info.flush_list, scan, flush_node);
3067                         atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);
3068
3069                         /*
3070                          * Now that we've popped back up we can do a secondary
3071                          * recursion on the deferred elements.
3072                          */
3073                         if (hammer2_debug & 0x0040)
3074                                 kprintf("deferred flush %p\n", scan);
3075                         hammer2_chain_lock(hmp, scan, HAMMER2_RESOLVE_MAYBE);
3076                         hammer2_chain_flush(hmp, scan, info.modify_tid);
3077                         hammer2_chain_unlock(hmp, scan);
3078
3079                         /*
3080                          * Only flag a reflush if SUBMODIFIED is no longer
3081                          * set.  If SUBMODIFIED is set the element will just
3082                          * wind up on our flush_list again.
3083                          */
3084                         if ((scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
3085                                             HAMMER2_CHAIN_MODIFIED |
3086                                             HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
3087                                 reflush = 1;
3088                         }
3089                         hammer2_chain_drop(hmp, scan);
3090                 }
3091                 if ((hammer2_debug & 0x0040) && reflush)
3092                         kprintf("reflush %p\n", chain);
3093         }
3094
3095         /*
3096          * The SUBMODIFIED bit must propagate upward if the chain could not
3097          * be completely flushed.
3098          */
3099         if (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
3100                             HAMMER2_CHAIN_MODIFIED |
3101                             HAMMER2_CHAIN_MODIFIED_AUX |
3102                             HAMMER2_CHAIN_MOVED)) {
3103                 hammer2_chain_parent_setsubmod(hmp, chain);
3104         }
3105
3106         /*
3107          * If the only thing left is a simple bref update, try to
3108          * proactively update the parent; otherwise return early.
3109          */
3110         parent = chain->parent;
3111         if (parent == NULL) {
3112                 return;
3113         }
3114         if (chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
3115             (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
3116                              HAMMER2_CHAIN_MODIFIED |
3117                              HAMMER2_CHAIN_MODIFIED_AUX |
3118                              HAMMER2_CHAIN_MOVED)) != HAMMER2_CHAIN_MOVED) {
3119                 return;
3120         }
3121
3122         /*
3123          * We are locking backwards, so allow the lock to fail.
3124          */
3125         if (ccms_thread_lock_nonblock(&parent->cst, CCMS_STATE_EXCLUSIVE))
3126                 return;
3127
3128         /*
3129          * We are updating brefs but we have to call chain_modify()
3130          * because our caller is not being run from a recursive flush.
3131          *
3132          * This will also chain up the parent list and set the SUBMODIFIED
3133          * flag.
3134          *
3135          * We do not want to set HAMMER2_CHAIN_MODIFY_TID here because the
3136          * modification is only related to updating a bref in the parent.
3137          *
3138          * When updating the blockset embedded in the volume header we must
3139          * also update voldata.mirror_tid.
3140          */
3141         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
3142         hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
3143
3144         switch(parent->bref.type) {
3145         case HAMMER2_BREF_TYPE_INODE:
3146                 base = &parent->data->ipdata.u.blockset.
3147                         blockref[0];
3148                 count = HAMMER2_SET_COUNT;
3149                 break;
3150         case HAMMER2_BREF_TYPE_INDIRECT:
3151                 base = &parent->data->npdata.blockref[0];
3152                 count = parent->bytes /
3153                         sizeof(hammer2_blockref_t);
3154                 break;
3155         case HAMMER2_BREF_TYPE_VOLUME:
3156                 base = &hmp->voldata.sroot_blockset.blockref[0];
3157                 count = HAMMER2_SET_COUNT;
3158                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
3159                         if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
3160                                 hmp->voldata.mirror_tid =
3161                                         chain->bref.mirror_tid;
3162                         }
3163                 }
3164                 break;
3165         default:
3166                 base = NULL;
3167                 panic("hammer2_chain_flush: "
3168                       "unrecognized blockref type: %d",
3169                       parent->bref.type);
3170         }
3171
3172         /*
3173          * Update the blockref in the parent.  We do not have to set
3174          * MOVED in the parent because the parent has been marked modified,
3175          * so the flush sequence will pick up the bref change.
3176          *
3177          * We do have to propagate mirror_tid upward.
3178          */
3179         KKASSERT(chain->index >= 0 &&
3180                  chain->index < count);
3181         KKASSERT(chain->parent == parent);
3182         if (chain->flags & HAMMER2_CHAIN_MOVED) {
3183                 base[chain->index] = chain->bref_flush;
3184                 if (parent->bref.mirror_tid < chain->bref_flush.mirror_tid)
3185                         parent->bref.mirror_tid = chain->bref_flush.mirror_tid;
3186                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
3187                 hammer2_chain_drop(hmp, chain);
3188         } else if (bcmp(&base[chain->index], &chain->bref_flush,
3189                    sizeof(chain->bref)) != 0) {
3190                 panic("hammer2: unflagged bref update(2)");
3191         }
3192         ccms_thread_unlock(&parent->cst);               /* release manual op */
3193         hammer2_chain_unlock(hmp, parent);
3194 }
3195
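     /*
      * Passive delay: sleep for up to one tick on the chain, giving
      * other threads operating on it a chance to make progress before
      * the caller retries.
      */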
3196 void
3197 hammer2_chain_wait(hammer2_mount_t *hmp, hammer2_chain_t *chain)
3198 {
3199         tsleep(chain, 0, "chnflw", 1);
3200 }