sys/vfs/hammer2: Simplify chain allocation in hammer2_chain_alloc()
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2020 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem implements most of the core support functions for
37  * the hammer2_chain structure.
38  *
39  * Chains are the in-memory version on media objects (volume header, inodes,
40  * indirect blocks, data blocks, etc).  Chains represent a portion of the
41  * HAMMER2 topology.
42  *
43  * Chains are no-longer delete-duplicated.  Instead, the original in-memory
44  * chain will be moved along with its block reference (e.g. for things like
45  * renames, hardlink operations, modifications, etc), and will be indexed
46  * on a secondary list for flush handling instead of propagating a flag
47  * upward to the root.
48  *
49  * Concurrent front-end operations can still run against backend flushes
50  * as long as they do not cross the current flush boundary.  An operation
51  * running above the current flush (in areas not yet flushed) can become
52  * part of the current flush while ano peration running below the current
53  * flush can become part of the next flush.
54  */
55 #include <sys/cdefs.h>
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/types.h>
59 #include <sys/lock.h>
60 #include <sys/kern_syscall.h>
61 #include <sys/uuid.h>
62
63 #include <crypto/sha2/sha2.h>
64
65 #include "hammer2.h"
66
67 static hammer2_chain_t *hammer2_chain_create_indirect(
68                 hammer2_chain_t *parent,
69                 hammer2_key_t key, int keybits,
70                 hammer2_tid_t mtid, int for_type, int *errorp);
71 static int hammer2_chain_delete_obref(hammer2_chain_t *parent,
72                 hammer2_chain_t *chain,
73                 hammer2_tid_t mtid, int flags,
74                 hammer2_blockref_t *obref);
75 static hammer2_chain_t *hammer2_combined_find(
76                 hammer2_chain_t *parent,
77                 hammer2_blockref_t *base, int count,
78                 hammer2_key_t *key_nextp,
79                 hammer2_key_t key_beg, hammer2_key_t key_end,
80                 hammer2_blockref_t **bresp);
81 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain,
82                                 int depth);
83 static void hammer2_chain_lru_flush(hammer2_pfs_t *pmp);
84
85 /*
86  * There are many degenerate situations where an extreme rate of console
87  * output can occur from warnings and errors.  Make sure this output does
88  * not impede operations.
89  */
90 static struct krate krate_h2chk = { .freq = 5 };
91 static struct krate krate_h2me = { .freq = 1 };
92 static struct krate krate_h2em = { .freq = 1 };
93
94 /*
95  * Basic RBTree for chains (core.rbtree).
96  */
97 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
98
99 int
100 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
101 {
102         hammer2_key_t c1_beg;
103         hammer2_key_t c1_end;
104         hammer2_key_t c2_beg;
105         hammer2_key_t c2_end;
106
107         /*
108          * Compare chains.  Overlaps are not supposed to happen and catch
109          * any software issues early we count overlaps as a match.
110          */
111         c1_beg = chain1->bref.key;
112         c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1;
113         c2_beg = chain2->bref.key;
114         c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1;
115
116         if (c1_end < c2_beg)    /* fully to the left */
117                 return(-1);
118         if (c1_beg > c2_end)    /* fully to the right */
119                 return(1);
120         return(0);              /* overlap (must not cross edge boundary) */
121 }
122
123 /*
124  * Assert that a chain has no media data associated with it.
125  */
126 static __inline void
127 hammer2_chain_assert_no_data(hammer2_chain_t *chain)
128 {
129         KKASSERT(chain->dio == NULL);
130         if (chain->bref.type != HAMMER2_BREF_TYPE_VOLUME &&
131             chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP &&
132             chain->data) {
133                 panic("hammer2_chain_assert_no_data: chain %p still has data",
134                     chain);
135         }
136 }
137
138 /*
139  * Make a chain visible to the flusher.  The flusher operates using a top-down
140  * recursion based on the ONFLUSH flag.  It locates MODIFIED and UPDATE chains,
141  * flushes them, and updates blocks back to the volume root.
142  *
143  * This routine sets the ONFLUSH flag upward from the triggering chain until
144  * it hits an inode root or the volume root.  Inode chains serve as inflection
145  * points, requiring the flusher to bridge across trees.  Inodes include
146  * regular inodes, PFS roots (pmp->iroot), and the media super root
147  * (spmp->iroot).
148  */
149 void
150 hammer2_chain_setflush(hammer2_chain_t *chain)
151 {
152         hammer2_chain_t *parent;
153
154         if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
155                 hammer2_spin_sh(&chain->core.spin);
156                 while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
157                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
158                         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
159                                 break;
160                         if ((parent = chain->parent) == NULL)
161                                 break;
162                         hammer2_spin_sh(&parent->core.spin);
163                         hammer2_spin_unsh(&chain->core.spin);
164                         chain = parent;
165                 }
166                 hammer2_spin_unsh(&chain->core.spin);
167         }
168 }
169
170 /*
171  * Allocate a new disconnected chain element representing the specified
172  * bref.  chain->refs is set to 1 and the passed bref is copied to
173  * chain->bref.  chain->bytes is derived from the bref.
174  *
175  * chain->pmp inherits pmp unless the chain is an inode (other than the
176  * super-root inode).
177  *
178  * NOTE: Returns a referenced but unlocked (because there is no core) chain.
179  */
180 hammer2_chain_t *
181 hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp,
182                     hammer2_blockref_t *bref)
183 {
184         hammer2_chain_t *chain;
185         u_int bytes;
186
187         if (bref->type == HAMMER2_BREF_TYPE_EMPTY)
188                 panic("hammer2_chain_alloc: empty blockref type\n");
189
190         /*
191          * Special case - radix of 0 indicates a chain that does not
192          * need a data reference (context is completely embedded in the
193          * bref).
194          */
195         if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX))
196                 bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
197         else
198                 bytes = 0;
199
200         atomic_add_long(&hammer2_chain_allocs, 1);
201         chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
202
203         /*
204          * Initialize the new chain structure.  pmp must be set to NULL for
205          * chains belonging to the super-root topology of a device mount.
206          */
207         if (pmp == hmp->spmp)
208                 chain->pmp = NULL;
209         else
210                 chain->pmp = pmp;
211
212         chain->hmp = hmp;
213         chain->bref = *bref;
214         chain->bytes = bytes;
215         chain->refs = 1;
216         chain->flags = HAMMER2_CHAIN_ALLOCATED;
217         lockinit(&chain->diolk, "chdio", 0, 0);
218
219         /*
220          * Set the PFS boundary flag if this chain represents a PFS root.
221          */
222         if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
223                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_PFSBOUNDARY);
224         hammer2_chain_core_init(chain);
225
226         return (chain);
227 }
228
229 /*
230  * Initialize a chain's core structure.  This structure used to be allocated
231  * but is now embedded.
232  *
233  * The core is not locked.  No additional refs on the chain are made.
234  * (trans) must not be NULL if (core) is not NULL.
235  */
236 void
237 hammer2_chain_core_init(hammer2_chain_t *chain)
238 {
239         /*
240          * Fresh core under nchain (no multi-homing of ochain's
241          * sub-tree).
242          */
243         RB_INIT(&chain->core.rbtree);   /* live chains */
244         hammer2_mtx_init(&chain->lock, "h2chain");
245 }
246
247 /*
248  * Add a reference to a chain element, preventing its destruction.
249  *
250  * (can be called with spinlock held)
251  */
252 void
253 hammer2_chain_ref(hammer2_chain_t *chain)
254 {
255         if (atomic_fetchadd_int(&chain->refs, 1) == 0) {
256                 /*
257                  * Just flag that the chain was used and should be recycled
258                  * on the LRU if it encounters it later.
259                  */
260                 if (chain->flags & HAMMER2_CHAIN_ONLRU)
261                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_LRUHINT);
262
263 #if 0
264                 /*
265                  * REMOVED - reduces contention, lru_list is more heuristical
266                  * now.
267                  *
268                  * 0->non-zero transition must ensure that chain is removed
269                  * from the LRU list.
270                  *
271                  * NOTE: Already holding lru_spin here so we cannot call
272                  *       hammer2_chain_ref() to get it off lru_list, do
273                  *       it manually.
274                  */
275                 if (chain->flags & HAMMER2_CHAIN_ONLRU) {
276                         hammer2_pfs_t *pmp = chain->pmp;
277                         hammer2_spin_ex(&pmp->lru_spin);
278                         if (chain->flags & HAMMER2_CHAIN_ONLRU) {
279                                 atomic_add_int(&pmp->lru_count, -1);
280                                 atomic_clear_int(&chain->flags,
281                                                  HAMMER2_CHAIN_ONLRU);
282                                 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
283                         }
284                         hammer2_spin_unex(&pmp->lru_spin);
285                 }
286 #endif
287         }
288 }
289
290 /*
291  * Ref a locked chain and force the data to be held across an unlock.
292  * Chain must be currently locked.  The user of the chain who desires
293  * to release the hold must call hammer2_chain_lock_unhold() to relock
294  * and unhold the chain, then unlock normally, or may simply call
295  * hammer2_chain_drop_unhold() (which is safer against deadlocks).
296  */
297 void
298 hammer2_chain_ref_hold(hammer2_chain_t *chain)
299 {
300         atomic_add_int(&chain->lockcnt, 1);
301         hammer2_chain_ref(chain);
302 }
303
304 /*
305  * Insert the chain in the core rbtree.
306  *
307  * Normal insertions are placed in the live rbtree.  Insertion of a deleted
308  * chain is a special case used by the flush code that is placed on the
309  * unstaged deleted list to avoid confusing the live view.
310  */
311 #define HAMMER2_CHAIN_INSERT_SPIN       0x0001
312 #define HAMMER2_CHAIN_INSERT_LIVE       0x0002
313 #define HAMMER2_CHAIN_INSERT_RACE       0x0004
314
315 static
316 int
317 hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain,
318                      int flags, int generation)
319 {
320         hammer2_chain_t *xchain;
321         int error = 0;
322
323         if (flags & HAMMER2_CHAIN_INSERT_SPIN)
324                 hammer2_spin_ex(&parent->core.spin);
325
326         /*
327          * Interlocked by spinlock, check for race
328          */
329         if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
330             parent->core.generation != generation) {
331                 error = HAMMER2_ERROR_EAGAIN;
332                 goto failed;
333         }
334
335         /*
336          * Insert chain
337          */
338         xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain);
339         KASSERT(xchain == NULL,
340                 ("hammer2_chain_insert: collision %p %p (key=%016jx)",
341                 chain, xchain, chain->bref.key));
342         atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
343         chain->parent = parent;
344         ++parent->core.chain_count;
345         ++parent->core.generation;      /* XXX incs for _get() too, XXX */
346
347         /*
348          * We have to keep track of the effective live-view blockref count
349          * so the create code knows when to push an indirect block.
350          */
351         if (flags & HAMMER2_CHAIN_INSERT_LIVE)
352                 atomic_add_int(&parent->core.live_count, 1);
353 failed:
354         if (flags & HAMMER2_CHAIN_INSERT_SPIN)
355                 hammer2_spin_unex(&parent->core.spin);
356         return error;
357 }
358
359 /*
360  * Drop the caller's reference to the chain.  When the ref count drops to
361  * zero this function will try to disassociate the chain from its parent and
362  * deallocate it, then recursely drop the parent using the implied ref
363  * from the chain's chain->parent.
364  *
365  * Nobody should own chain's mutex on the 1->0 transition, unless this drop
366  * races an acquisition by another cpu.  Therefore we can loop if we are
367  * unable to acquire the mutex, and refs is unlikely to be 1 unless we again
368  * race against another drop.
369  */
370 void
371 hammer2_chain_drop(hammer2_chain_t *chain)
372 {
373         u_int refs;
374
375         KKASSERT(chain->refs > 0);
376
377         while (chain) {
378                 refs = chain->refs;
379                 cpu_ccfence();
380                 KKASSERT(refs > 0);
381
382                 if (refs == 1) {
383                         if (hammer2_mtx_ex_try(&chain->lock) == 0)
384                                 chain = hammer2_chain_lastdrop(chain, 0);
385                         /* retry the same chain, or chain from lastdrop */
386                 } else {
387                         if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
388                                 break;
389                         /* retry the same chain */
390                 }
391                 cpu_pause();
392         }
393 }
394
395 /*
396  * Unhold a held and probably not-locked chain, ensure that the data is
397  * dropped on the 1->0 transition of lockcnt by obtaining an exclusive
398  * lock and then simply unlocking the chain.
399  */
400 void
401 hammer2_chain_unhold(hammer2_chain_t *chain)
402 {
403         u_int lockcnt;
404         int iter = 0;
405
406         for (;;) {
407                 lockcnt = chain->lockcnt;
408                 cpu_ccfence();
409                 if (lockcnt > 1) {
410                         if (atomic_cmpset_int(&chain->lockcnt,
411                                               lockcnt, lockcnt - 1)) {
412                                 break;
413                         }
414                 } else if (hammer2_mtx_ex_try(&chain->lock) == 0) {
415                         hammer2_chain_unlock(chain);
416                         break;
417                 } else {
418                         /*
419                          * This situation can easily occur on SMP due to
420                          * the gap inbetween the 1->0 transition and the
421                          * final unlock.  We cannot safely block on the
422                          * mutex because lockcnt might go above 1.
423                          *
424                          * XXX Sleep for one tick if it takes too long.
425                          */
426                         if (++iter > 1000) {
427                                 if (iter > 1000 + hz) {
428                                         kprintf("hammer2: h2race1 %p\n", chain);
429                                         iter = 1000;
430                                 }
431                                 tsleep(&iter, 0, "h2race1", 1);
432                         }
433                         cpu_pause();
434                 }
435         }
436 }
437
438 void
439 hammer2_chain_drop_unhold(hammer2_chain_t *chain)
440 {
441         hammer2_chain_unhold(chain);
442         hammer2_chain_drop(chain);
443 }
444
445 void
446 hammer2_chain_rehold(hammer2_chain_t *chain)
447 {
448         hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED);
449         atomic_add_int(&chain->lockcnt, 1);
450         hammer2_chain_unlock(chain);
451 }
452
453 /*
454  * Handles the (potential) last drop of chain->refs from 1->0.  Called with
455  * the mutex exclusively locked, refs == 1, and lockcnt 0.  SMP races are
456  * possible against refs and lockcnt.  We must dispose of the mutex on chain.
457  *
458  * This function returns an unlocked chain for recursive drop or NULL.  It
459  * can return the same chain if it determines it has raced another ref.
460  *
461  * --
462  *
463  * When two chains need to be recursively dropped we use the chain we
464  * would otherwise free to placehold the additional chain.  It's a bit
465  * convoluted but we can't just recurse without potentially blowing out
466  * the kernel stack.
467  *
468  * The chain cannot be freed if it has any children.
469  * The chain cannot be freed if flagged MODIFIED unless we can dispose of it.
470  * The chain cannot be freed if flagged UPDATE unless we can dispose of it.
471  * Any dedup registration can remain intact.
472  *
473  * The core spinlock is allowed to nest child-to-parent (not parent-to-child).
474  */
475 static
476 hammer2_chain_t *
477 hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth)
478 {
479         hammer2_pfs_t *pmp;
480         hammer2_dev_t *hmp;
481         hammer2_chain_t *parent;
482         hammer2_chain_t *rdrop;
483
484         /*
485          * We need chain's spinlock to interlock the sub-tree test.
486          * We already have chain's mutex, protecting chain->parent.
487          *
488          * Remember that chain->refs can be in flux.
489          */
490         hammer2_spin_ex(&chain->core.spin);
491
492         if (chain->parent != NULL) {
493                 /*
494                  * If the chain has a parent the UPDATE bit prevents scrapping
495                  * as the chain is needed to properly flush the parent.  Try
496                  * to complete the 1->0 transition and return NULL.  Retry
497                  * (return chain) if we are unable to complete the 1->0
498                  * transition, else return NULL (nothing more to do).
499                  *
500                  * If the chain has a parent the MODIFIED bit prevents
501                  * scrapping.
502                  *
503                  * Chains with UPDATE/MODIFIED are *not* put on the LRU list!
504                  */
505                 if (chain->flags & (HAMMER2_CHAIN_UPDATE |
506                                     HAMMER2_CHAIN_MODIFIED)) {
507                         if (atomic_cmpset_int(&chain->refs, 1, 0)) {
508                                 hammer2_spin_unex(&chain->core.spin);
509                                 hammer2_chain_assert_no_data(chain);
510                                 hammer2_mtx_unlock(&chain->lock);
511                                 chain = NULL;
512                         } else {
513                                 hammer2_spin_unex(&chain->core.spin);
514                                 hammer2_mtx_unlock(&chain->lock);
515                         }
516                         return (chain);
517                 }
518                 /* spinlock still held */
519         } else if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME ||
520                    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP) {
521                 /*
522                  * Retain the static vchain and fchain.  Clear bits that
523                  * are not relevant.  Do not clear the MODIFIED bit,
524                  * and certainly do not put it on the delayed-flush queue.
525                  */
526                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
527         } else {
528                 /*
529                  * The chain has no parent and can be flagged for destruction.
530                  * Since it has no parent, UPDATE can also be cleared.
531                  */
532                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
533                 if (chain->flags & HAMMER2_CHAIN_UPDATE)
534                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
535
536                 /*
537                  * If the chain has children we must propagate the DESTROY
538                  * flag downward and rip the disconnected topology apart.
539                  * This is accomplished by calling hammer2_flush() on the
540                  * chain.
541                  *
542                  * Any dedup is already handled by the underlying DIO, so
543                  * we do not have to specifically flush it here.
544                  */
545                 if (chain->core.chain_count) {
546                         hammer2_spin_unex(&chain->core.spin);
547                         hammer2_flush(chain, HAMMER2_FLUSH_TOP |
548                                              HAMMER2_FLUSH_ALL);
549                         hammer2_mtx_unlock(&chain->lock);
550
551                         return(chain);  /* retry drop */
552                 }
553
554                 /*
555                  * Otherwise we can scrap the MODIFIED bit if it is set,
556                  * and continue along the freeing path.
557                  *
558                  * Be sure to clean-out any dedup bits.  Without a parent
559                  * this chain will no longer be visible to the flush code.
560                  * Easy check data_off to avoid the volume root.
561                  */
562                 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
563                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
564                         atomic_add_long(&hammer2_count_modified_chains, -1);
565                         if (chain->pmp)
566                                 hammer2_pfs_memory_wakeup(chain->pmp, -1);
567                 }
568                 /* spinlock still held */
569         }
570
571         /* spinlock still held */
572
573         /*
574          * If any children exist we must leave the chain intact with refs == 0.
575          * They exist because chains are retained below us which have refs or
576          * may require flushing.
577          *
578          * Retry (return chain) if we fail to transition the refs to 0, else
579          * return NULL indication nothing more to do.
580          *
581          * Chains with children are NOT put on the LRU list.
582          */
583         if (chain->core.chain_count) {
584                 if (atomic_cmpset_int(&chain->refs, 1, 0)) {
585                         hammer2_spin_unex(&chain->core.spin);
586                         hammer2_chain_assert_no_data(chain);
587                         hammer2_mtx_unlock(&chain->lock);
588                         chain = NULL;
589                 } else {
590                         hammer2_spin_unex(&chain->core.spin);
591                         hammer2_mtx_unlock(&chain->lock);
592                 }
593                 return (chain);
594         }
595         /* spinlock still held */
596         /* no chains left under us */
597
598         /*
599          * chain->core has no children left so no accessors can get to our
600          * chain from there.  Now we have to lock the parent core to interlock
601          * remaining possible accessors that might bump chain's refs before
602          * we can safely drop chain's refs with intent to free the chain.
603          */
604         hmp = chain->hmp;
605         pmp = chain->pmp;       /* can be NULL */
606         rdrop = NULL;
607
608         parent = chain->parent;
609
610         /*
611          * WARNING! chain's spin lock is still held here, and other spinlocks
612          *          will be acquired and released in the code below.  We
613          *          cannot be making fancy procedure calls!
614          */
615
616         /*
617          * We can cache the chain if it is associated with a pmp
618          * and not flagged as being destroyed or requesting a full
619          * release.  In this situation the chain is not removed
620          * from its parent, i.e. it can still be looked up.
621          *
622          * We intentionally do not cache DATA chains because these
623          * were likely used to load data into the logical buffer cache
624          * and will not be accessed again for some time.
625          */
626         if ((chain->flags &
627              (HAMMER2_CHAIN_DESTROY | HAMMER2_CHAIN_RELEASE)) == 0 &&
628             chain->pmp &&
629             chain->bref.type != HAMMER2_BREF_TYPE_DATA) {
630                 if (parent)
631                         hammer2_spin_ex(&parent->core.spin);
632                 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
633                         /*
634                          * 1->0 transition failed, retry.  Do not drop
635                          * the chain's data yet!
636                          */
637                         if (parent)
638                                 hammer2_spin_unex(&parent->core.spin);
639                         hammer2_spin_unex(&chain->core.spin);
640                         hammer2_mtx_unlock(&chain->lock);
641
642                         return(chain);
643                 }
644
645                 /*
646                  * Success
647                  */
648                 hammer2_chain_assert_no_data(chain);
649
650                 /*
651                  * Make sure we are on the LRU list, clean up excessive
652                  * LRU entries.  We can only really drop one but there might
653                  * be other entries that we can remove from the lru_list
654                  * without dropping.
655                  *
656                  * NOTE: HAMMER2_CHAIN_ONLRU may only be safely set when
657                  *       chain->core.spin AND pmp->lru_spin are held, but
658                  *       can be safely cleared only holding pmp->lru_spin.
659                  */
660                 if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) {
661                         hammer2_spin_ex(&pmp->lru_spin);
662                         if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) {
663                                 atomic_set_int(&chain->flags,
664                                                HAMMER2_CHAIN_ONLRU);
665                                 TAILQ_INSERT_TAIL(&pmp->lru_list,
666                                                   chain, lru_node);
667                                 atomic_add_int(&pmp->lru_count, 1);
668                         }
669                         if (pmp->lru_count < HAMMER2_LRU_LIMIT)
670                                 depth = 1;      /* disable lru_list flush */
671                         hammer2_spin_unex(&pmp->lru_spin);
672                 } else {
673                         /* disable lru flush */
674                         depth = 1;
675                 }
676
677                 if (parent) {
678                         hammer2_spin_unex(&parent->core.spin);
679                         parent = NULL;  /* safety */
680                 }
681                 hammer2_spin_unex(&chain->core.spin);
682                 hammer2_mtx_unlock(&chain->lock);
683
684                 /*
685                  * lru_list hysteresis (see above for depth overrides).
686                  * Note that depth also prevents excessive lastdrop recursion.
687                  */
688                 if (depth == 0)
689                         hammer2_chain_lru_flush(pmp);
690
691                 return NULL;
692                 /* NOT REACHED */
693         }
694
695         /*
696          * Make sure we are not on the LRU list.
697          */
698         if (chain->flags & HAMMER2_CHAIN_ONLRU) {
699                 hammer2_spin_ex(&pmp->lru_spin);
700                 if (chain->flags & HAMMER2_CHAIN_ONLRU) {
701                         atomic_add_int(&pmp->lru_count, -1);
702                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU);
703                         TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
704                 }
705                 hammer2_spin_unex(&pmp->lru_spin);
706         }
707
708         /*
709          * Spinlock the parent and try to drop the last ref on chain.
710          * On success determine if we should dispose of the chain
711          * (remove the chain from its parent, etc).
712          *
713          * (normal core locks are top-down recursive but we define
714          * core spinlocks as bottom-up recursive, so this is safe).
715          */
716         if (parent) {
717                 hammer2_spin_ex(&parent->core.spin);
718                 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
719                         /*
720                          * 1->0 transition failed, retry.
721                          */
722                         hammer2_spin_unex(&parent->core.spin);
723                         hammer2_spin_unex(&chain->core.spin);
724                         hammer2_mtx_unlock(&chain->lock);
725
726                         return(chain);
727                 }
728
729                 /*
730                  * 1->0 transition successful, parent spin held to prevent
731                  * new lookups, chain spinlock held to protect parent field.
732                  * Remove chain from the parent.
733                  *
734                  * If the chain is being removed from the parent's btree but
735                  * is not bmapped, we have to adjust live_count downward.  If
736                  * it is bmapped then the blockref is retained in the parent
737                  * as is its associated live_count.  This case can occur when
738                  * a chain added to the topology is unable to flush and is
739                  * then later deleted.
740                  */
741                 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
742                         if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) &&
743                             (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
744                                 atomic_add_int(&parent->core.live_count, -1);
745                         }
746                         RB_REMOVE(hammer2_chain_tree,
747                                   &parent->core.rbtree, chain);
748                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
749                         --parent->core.chain_count;
750                         chain->parent = NULL;
751                 }
752
753                 /*
754                  * If our chain was the last chain in the parent's core the
755                  * core is now empty and its parent might have to be
756                  * re-dropped if it has 0 refs.
757                  */
758                 if (parent->core.chain_count == 0) {
759                         rdrop = parent;
760                         atomic_add_int(&rdrop->refs, 1);
761                         /*
762                         if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0)
763                                 rdrop = NULL;
764                         */
765                 }
766                 hammer2_spin_unex(&parent->core.spin);
767                 parent = NULL;  /* safety */
768                 /* FALL THROUGH */
769         } else {
770                 /*
771                  * No-parent case.
772                  */
773                 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
774                         /*
775                          * 1->0 transition failed, retry.
776                          */
777                         hammer2_spin_unex(&parent->core.spin);
778                         hammer2_spin_unex(&chain->core.spin);
779                         hammer2_mtx_unlock(&chain->lock);
780
781                         return(chain);
782                 }
783         }
784
785         /*
786          * Successful 1->0 transition, no parent, no children... no way for
787          * anyone to ref this chain any more.  We can clean-up and free it.
788          *
789          * We still have the core spinlock, and core's chain_count is 0.
790          * Any parent spinlock is gone.
791          */
792         hammer2_spin_unex(&chain->core.spin);
793         hammer2_chain_assert_no_data(chain);
794         hammer2_mtx_unlock(&chain->lock);
795         KKASSERT(RB_EMPTY(&chain->core.rbtree) &&
796                  chain->core.chain_count == 0);
797
798         /*
799          * All locks are gone, no pointers remain to the chain, finish
800          * freeing it.
801          */
802         KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE |
803                                   HAMMER2_CHAIN_MODIFIED)) == 0);
804
805         /*
806          * Once chain resources are gone we can use the now dead chain
807          * structure to placehold what might otherwise require a recursive
808          * drop, because we have potentially two things to drop and can only
809          * return one directly.
810          */
811         if (chain->flags & HAMMER2_CHAIN_ALLOCATED) {
812                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ALLOCATED);
813                 chain->hmp = NULL;
814                 kfree(chain, hmp->mchain);
815         }
816
817         /*
818          * Possible chaining loop when parent re-drop needed.
819          */
820         return(rdrop);
821 }
822
823 /*
824  * Heuristical flush of the LRU, try to reduce the number of entries
825  * on the LRU to (HAMMER2_LRU_LIMIT * 2 / 3).  This procedure is called
826  * only when lru_count exceeds HAMMER2_LRU_LIMIT.
827  */
828 static
829 void
830 hammer2_chain_lru_flush(hammer2_pfs_t *pmp)
831 {
832         hammer2_chain_t *chain;
833
834 again:
835         chain = NULL;
836         hammer2_spin_ex(&pmp->lru_spin);
837         while (pmp->lru_count > HAMMER2_LRU_LIMIT * 2 / 3) {
838                 /*
839                  * Pick a chain off the lru_list, just recycle it quickly
840                  * if LRUHINT is set (the chain was ref'd but left on
841                  * the lru_list, so cycle to the end).
842                  */
843                 chain = TAILQ_FIRST(&pmp->lru_list);
844                 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
845
846                 if (chain->flags & HAMMER2_CHAIN_LRUHINT) {
847                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_LRUHINT);
848                         TAILQ_INSERT_TAIL(&pmp->lru_list, chain, lru_node);
849                         chain = NULL;
850                         continue;
851                 }
852
853                 /*
854                  * Ok, we are off the LRU.  We must adjust refs before we
855                  * can safely clear the ONLRU flag.
856                  */
857                 atomic_add_int(&pmp->lru_count, -1);
858                 if (atomic_cmpset_int(&chain->refs, 0, 1)) {
859                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU);
860                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
861                         break;
862                 }
863                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU);
864                 chain = NULL;
865         }
866         hammer2_spin_unex(&pmp->lru_spin);
867         if (chain == NULL)
868                 return;
869
870         /*
871          * If we picked a chain off the lru list we may be able to lastdrop
872          * it.  Use a depth of 1 to prevent excessive lastdrop recursion.
873          */
874         while (chain) {
875                 u_int refs;
876
877                 refs = chain->refs;
878                 cpu_ccfence();
879                 KKASSERT(refs > 0);
880
881                 if (refs == 1) {
882                         if (hammer2_mtx_ex_try(&chain->lock) == 0)
883                                 chain = hammer2_chain_lastdrop(chain, 1);
884                         /* retry the same chain, or chain from lastdrop */
885                 } else {
886                         if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
887                                 break;
888                         /* retry the same chain */
889                 }
890                 cpu_pause();
891         }
892         goto again;
893 }
894
895 /*
896  * On last lock release.
897  */
898 static hammer2_io_t *
899 hammer2_chain_drop_data(hammer2_chain_t *chain)
900 {
901         hammer2_io_t *dio;
902
903         if ((dio = chain->dio) != NULL) {
904                 chain->dio = NULL;
905                 chain->data = NULL;
906         } else {
907                 switch(chain->bref.type) {
908                 case HAMMER2_BREF_TYPE_VOLUME:
909                 case HAMMER2_BREF_TYPE_FREEMAP:
910                         break;
911                 default:
912                         if (chain->data != NULL) {
913                                 hammer2_spin_unex(&chain->core.spin);
914                                 panic("chain data not null: "
915                                       "chain %p bref %016jx.%02x "
916                                       "refs %d parent %p dio %p data %p",
917                                       chain, chain->bref.data_off,
918                                       chain->bref.type, chain->refs,
919                                       chain->parent,
920                                       chain->dio, chain->data);
921                         }
922                         KKASSERT(chain->data == NULL);
923                         break;
924                 }
925         }
926         return dio;
927 }
928
929 /*
930  * Lock a referenced chain element, acquiring its data with I/O if necessary,
931  * and specify how you would like the data to be resolved.
932  *
933  * If an I/O or other fatal error occurs, chain->error will be set to non-zero.
934  *
935  * The lock is allowed to recurse, multiple locking ops will aggregate
936  * the requested resolve types.  Once data is assigned it will not be
937  * removed until the last unlock.
938  *
939  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
940  *                         (typically used to avoid device/logical buffer
941  *                          aliasing for data)
942  *
943  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
944  *                         the INITIAL-create state (indirect blocks only).
945  *
946  *                         Do not resolve data elements for DATA chains.
947  *                         (typically used to avoid device/logical buffer
948  *                          aliasing for data)
949  *
950  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
951  *
952  * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
953  *                         it will be locked exclusive.
954  *
955  * HAMMER2_RESOLVE_NONBLOCK- (flag) The chain is locked non-blocking.  If
956  *                         the lock fails, EAGAIN is returned.
957  *
958  * NOTE: Embedded elements (volume header, inodes) are always resolved
959  *       regardless.
960  *
961  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
962  *       element will instantiate and zero its buffer, and flush it on
963  *       release.
964  *
965  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
966  *       so as not to instantiate a device buffer, which could alias against
967  *       a logical file buffer.  However, if ALWAYS is specified the
968  *       device buffer will be instantiated anyway.
969  *
970  * NOTE: The return value is always 0 unless NONBLOCK is specified, in which
971  *       case it can be either 0 or EAGAIN.
972  *
973  * WARNING! This function blocks on I/O if data needs to be fetched.  This
974  *          blocking can run concurrent with other compatible lock holders
975  *          who do not need data returning.  The lock is not upgraded to
976  *          exclusive during a data fetch, a separate bit is used to
977  *          interlock I/O.  However, an exclusive lock holder can still count
978  *          on being interlocked against an I/O fetch managed by a shared
979  *          lock holder.
980  */
981 int
982 hammer2_chain_lock(hammer2_chain_t *chain, int how)
983 {
984         KKASSERT(chain->refs > 0);
985
986         if (how & HAMMER2_RESOLVE_NONBLOCK) {
987                 /*
988                  * We still have to bump lockcnt before acquiring the lock,
989                  * even for non-blocking operation, because the unlock code
990                  * live-loops on lockcnt == 1 when dropping the last lock.
991                  *
992                  * If the non-blocking operation fails we have to use an
993                  * unhold sequence to undo the mess.
994                  *
995                  * NOTE: LOCKAGAIN must always succeed without blocking,
996                  *       even if NONBLOCK is specified.
997                  */
998                 atomic_add_int(&chain->lockcnt, 1);
999                 if (how & HAMMER2_RESOLVE_SHARED) {
1000                         if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
1001                                 hammer2_mtx_sh_again(&chain->lock);
1002                         } else {
1003                                 if (hammer2_mtx_sh_try(&chain->lock) != 0) {
1004                                         hammer2_chain_unhold(chain);
1005                                         return EAGAIN;
1006                                 }
1007                         }
1008                 } else {
1009                         if (hammer2_mtx_ex_try(&chain->lock) != 0) {
1010                                 hammer2_chain_unhold(chain);
1011                                 return EAGAIN;
1012                         }
1013                 }
1014         } else {
1015                 /*
1016                  * Get the appropriate lock.  If LOCKAGAIN is flagged with
1017                  * SHARED the caller expects a shared lock to already be
1018                  * present and we are giving it another ref.  This case must
1019                  * importantly not block if there is a pending exclusive lock
1020                  * request.
1021                  */
1022                 atomic_add_int(&chain->lockcnt, 1);
1023                 if (how & HAMMER2_RESOLVE_SHARED) {
1024                         if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
1025                                 hammer2_mtx_sh_again(&chain->lock);
1026                         } else {
1027                                 hammer2_mtx_sh(&chain->lock);
1028                         }
1029                 } else {
1030                         hammer2_mtx_ex(&chain->lock);
1031                 }
1032         }
1033
1034         /*
1035          * If we already have a valid data pointer make sure the data is
1036          * synchronized to the current cpu, and then no further action is
1037          * necessary.
1038          */
1039         if (chain->data) {
1040                 if (chain->dio)
1041                         hammer2_io_bkvasync(chain->dio);
1042                 return 0;
1043         }
1044
1045         /*
1046          * Do we have to resolve the data?  This is generally only
1047          * applicable to HAMMER2_BREF_TYPE_DATA which is special-cased.
1048          * Other BREF types expects the data to be there.
1049          */
1050         switch(how & HAMMER2_RESOLVE_MASK) {
1051         case HAMMER2_RESOLVE_NEVER:
1052                 return 0;
1053         case HAMMER2_RESOLVE_MAYBE:
1054                 if (chain->flags & HAMMER2_CHAIN_INITIAL)
1055                         return 0;
1056                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
1057                         return 0;
1058 #if 0
1059                 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
1060                         return 0;
1061                 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
1062                         return 0;
1063 #endif
1064                 /* fall through */
1065         case HAMMER2_RESOLVE_ALWAYS:
1066         default:
1067                 break;
1068         }
1069
1070         /*
1071          * Caller requires data
1072          */
1073         hammer2_chain_load_data(chain);
1074
1075         return 0;
1076 }
1077
1078 /*
1079  * Lock the chain, retain the hold, and drop the data persistence count.
1080  * The data should remain valid because we never transitioned lockcnt
1081  * through 0.
1082  */
1083 void
1084 hammer2_chain_lock_unhold(hammer2_chain_t *chain, int how)
1085 {
1086         hammer2_chain_lock(chain, how);
1087         atomic_add_int(&chain->lockcnt, -1);
1088 }
1089
1090 #if 0
1091 /*
1092  * Downgrade an exclusive chain lock to a shared chain lock.
1093  *
1094  * NOTE: There is no upgrade equivalent due to the ease of
1095  *       deadlocks in that direction.
1096  */
1097 void
1098 hammer2_chain_lock_downgrade(hammer2_chain_t *chain)
1099 {
1100         hammer2_mtx_downgrade(&chain->lock);
1101 }
1102 #endif
1103
1104 /*
1105  * Issue I/O and install chain->data.  Caller must hold a chain lock, lock
1106  * may be of any type.
1107  *
1108  * Once chain->data is set it cannot be disposed of until all locks are
1109  * released.
1110  *
1111  * Make sure the data is synchronized to the current cpu.
1112  */
1113 void
1114 hammer2_chain_load_data(hammer2_chain_t *chain)
1115 {
1116         hammer2_blockref_t *bref;
1117         hammer2_dev_t *hmp;
1118         hammer2_io_t *dio;
1119         char *bdata;
1120         int error;
1121
1122         /*
1123          * Degenerate case, data already present, or chain has no media
1124          * reference to load.
1125          */
1126         KKASSERT(chain->lock.mtx_lock & MTX_MASK);
1127         if (chain->data) {
1128                 if (chain->dio)
1129                         hammer2_io_bkvasync(chain->dio);
1130                 return;
1131         }
1132         if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
1133                 return;
1134
1135         hmp = chain->hmp;
1136         KKASSERT(hmp != NULL);
1137
1138         /*
1139          * Gain the IOINPROG bit, interlocked block.
1140          */
1141         for (;;) {
1142                 u_int oflags;
1143                 u_int nflags;
1144
1145                 oflags = chain->flags;
1146                 cpu_ccfence();
1147                 if (oflags & HAMMER2_CHAIN_IOINPROG) {
1148                         nflags = oflags | HAMMER2_CHAIN_IOSIGNAL;
1149                         tsleep_interlock(&chain->flags, 0);
1150                         if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
1151                                 tsleep(&chain->flags, PINTERLOCKED,
1152                                         "h2iocw", 0);
1153                         }
1154                         /* retry */
1155                 } else {
1156                         nflags = oflags | HAMMER2_CHAIN_IOINPROG;
1157                         if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
1158                                 break;
1159                         }
1160                         /* retry */
1161                 }
1162         }
1163
1164         /*
1165          * We own CHAIN_IOINPROG
1166          *
1167          * Degenerate case if we raced another load.
1168          */
1169         if (chain->data) {
1170                 if (chain->dio)
1171                         hammer2_io_bkvasync(chain->dio);
1172                 goto done;
1173         }
1174
1175         /*
1176          * We must resolve to a device buffer, either by issuing I/O or
1177          * by creating a zero-fill element.  We do not mark the buffer
1178          * dirty when creating a zero-fill element (the hammer2_chain_modify()
1179          * API must still be used to do that).
1180          *
1181          * The device buffer is variable-sized in powers of 2 down
1182          * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
1183          * chunk always contains buffers of the same size. (XXX)
1184          *
1185          * The minimum physical IO size may be larger than the variable
1186          * block size.
1187          */
1188         bref = &chain->bref;
1189
1190         /*
1191          * The getblk() optimization can only be used on newly created
1192          * elements if the physical block size matches the request.
1193          */
1194         if (chain->flags & HAMMER2_CHAIN_INITIAL) {
1195                 error = hammer2_io_new(hmp, bref->type,
1196                                        bref->data_off, chain->bytes,
1197                                        &chain->dio);
1198         } else {
1199                 error = hammer2_io_bread(hmp, bref->type,
1200                                          bref->data_off, chain->bytes,
1201                                          &chain->dio);
1202                 hammer2_adjreadcounter(chain->bref.type, chain->bytes);
1203         }
1204         if (error) {
1205                 chain->error = HAMMER2_ERROR_EIO;
1206                 kprintf("hammer2_chain_load_data: I/O error %016jx: %d\n",
1207                         (intmax_t)bref->data_off, error);
1208                 hammer2_io_bqrelse(&chain->dio);
1209                 goto done;
1210         }
1211         chain->error = 0;
1212
1213         /*
1214          * This isn't perfect and can be ignored on OSs which do not have
1215          * an indication as to whether a buffer is coming from cache or
1216          * if I/O was actually issued for the read.  TESTEDGOOD will work
1217          * pretty well without the B_IOISSUED logic because chains are
1218          * cached, but in that situation (without B_IOISSUED) it will not
1219          * detect whether a re-read via I/O is corrupted verses the original
1220          * read.
1221          *
1222          * We can't re-run the CRC on every fresh lock.  That would be
1223          * insanely expensive.
1224          *
1225          * If the underlying kernel buffer covers the entire chain we can
1226          * use the B_IOISSUED indication to determine if we have to re-run
1227          * the CRC on chain data for chains that managed to stay cached
1228          * across the kernel disposal of the original buffer.
1229          */
1230         if ((dio = chain->dio) != NULL && dio->bp) {
1231                 struct buf *bp = dio->bp;
1232
1233                 if (dio->psize == chain->bytes &&
1234                     (bp->b_flags & B_IOISSUED)) {
1235                         atomic_clear_int(&chain->flags,
1236                                          HAMMER2_CHAIN_TESTEDGOOD);
1237                         bp->b_flags &= ~B_IOISSUED;
1238                 }
1239         }
1240
1241         /*
1242          * NOTE: A locked chain's data cannot be modified without first
1243          *       calling hammer2_chain_modify().
1244          */
1245
1246         /*
1247          * NOTE: hammer2_io_data() call issues bkvasync()
1248          */
1249         bdata = hammer2_io_data(chain->dio, chain->bref.data_off);
1250
1251         if (chain->flags & HAMMER2_CHAIN_INITIAL) {
1252                 /*
1253                  * Clear INITIAL.  In this case we used io_new() and the
1254                  * buffer has been zero'd and marked dirty.
1255                  *
1256                  * CHAIN_MODIFIED has not been set yet, and we leave it
1257                  * that way for now.  Set a temporary CHAIN_NOTTESTED flag
1258                  * to prevent hammer2_chain_testcheck() from trying to match
1259                  * a check code that has not yet been generated.  This bit
1260                  * should NOT end up on the actual media.
1261                  */
1262                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1263                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED);
1264         } else if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
1265                 /*
1266                  * check data not currently synchronized due to
1267                  * modification.  XXX assumes data stays in the buffer
1268                  * cache, which might not be true (need biodep on flush
1269                  * to calculate crc?  or simple crc?).
1270                  */
1271         } else if ((chain->flags & HAMMER2_CHAIN_TESTEDGOOD) == 0) {
1272                 if (hammer2_chain_testcheck(chain, bdata) == 0) {
1273                         chain->error = HAMMER2_ERROR_CHECK;
1274                 } else {
1275                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_TESTEDGOOD);
1276                 }
1277         }
1278
1279         /*
1280          * Setup the data pointer, either pointing it to an embedded data
1281          * structure and copying the data from the buffer, or pointing it
1282          * into the buffer.
1283          *
1284          * The buffer is not retained when copying to an embedded data
1285          * structure in order to avoid potential deadlocks or recursions
1286          * on the same physical buffer.
1287          *
1288          * WARNING! Other threads can start using the data the instant we
1289          *          set chain->data non-NULL.
1290          */
1291         switch (bref->type) {
1292         case HAMMER2_BREF_TYPE_VOLUME:
1293         case HAMMER2_BREF_TYPE_FREEMAP:
1294                 /*
1295                  * Copy data from bp to embedded buffer
1296                  */
1297                 panic("hammer2_chain_load_data: unresolved volume header");
1298                 break;
1299         case HAMMER2_BREF_TYPE_DIRENT:
1300                 KKASSERT(chain->bytes != 0);
1301                 /* fall through */
1302         case HAMMER2_BREF_TYPE_INODE:
1303         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1304         case HAMMER2_BREF_TYPE_INDIRECT:
1305         case HAMMER2_BREF_TYPE_DATA:
1306         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1307         default:
1308                 /*
1309                  * Point data at the device buffer and leave dio intact.
1310                  */
1311                 chain->data = (void *)bdata;
1312                 break;
1313         }
1314
1315         /*
1316          * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested.
1317          */
1318 done:
1319         for (;;) {
1320                 u_int oflags;
1321                 u_int nflags;
1322
1323                 oflags = chain->flags;
1324                 nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG |
1325                                     HAMMER2_CHAIN_IOSIGNAL);
1326                 KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG);
1327                 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
1328                         if (oflags & HAMMER2_CHAIN_IOSIGNAL)
1329                                 wakeup(&chain->flags);
1330                         break;
1331                 }
1332         }
1333 }
1334
1335 /*
1336  * Unlock and deref a chain element.
1337  *
1338  * Remember that the presence of children under chain prevent the chain's
1339  * destruction but do not add additional references, so the dio will still
1340  * be dropped.
1341  */
1342 void
1343 hammer2_chain_unlock(hammer2_chain_t *chain)
1344 {
1345         hammer2_io_t *dio;
1346         u_int lockcnt;
1347         int iter = 0;
1348
1349         /*
1350          * If multiple locks are present (or being attempted) on this
1351          * particular chain we can just unlock, drop refs, and return.
1352          *
1353          * Otherwise fall-through on the 1->0 transition.
1354          */
1355         for (;;) {
1356                 lockcnt = chain->lockcnt;
1357                 KKASSERT(lockcnt > 0);
1358                 cpu_ccfence();
1359                 if (lockcnt > 1) {
1360                         if (atomic_cmpset_int(&chain->lockcnt,
1361                                               lockcnt, lockcnt - 1)) {
1362                                 hammer2_mtx_unlock(&chain->lock);
1363                                 return;
1364                         }
1365                 } else if (hammer2_mtx_upgrade_try(&chain->lock) == 0) {
1366                         /* while holding the mutex exclusively */
1367                         if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
1368                                 break;
1369                 } else {
1370                         /*
1371                          * This situation can easily occur on SMP due to
1372                          * the gap inbetween the 1->0 transition and the
1373                          * final unlock.  We cannot safely block on the
1374                          * mutex because lockcnt might go above 1.
1375                          *
1376                          * XXX Sleep for one tick if it takes too long.
1377                          */
1378                         if (++iter > 1000) {
1379                                 if (iter > 1000 + hz) {
1380                                         kprintf("hammer2: h2race2 %p\n", chain);
1381                                         iter = 1000;
1382                                 }
1383                                 tsleep(&iter, 0, "h2race2", 1);
1384                         }
1385                         cpu_pause();
1386                 }
1387                 /* retry */
1388         }
1389
1390         /*
1391          * Last unlock / mutex upgraded to exclusive.  Drop the data
1392          * reference.
1393          */
1394         dio = hammer2_chain_drop_data(chain);
1395         if (dio)
1396                 hammer2_io_bqrelse(&dio);
1397         hammer2_mtx_unlock(&chain->lock);
1398 }
1399
1400 /*
1401  * Unlock and hold chain data intact
1402  */
1403 void
1404 hammer2_chain_unlock_hold(hammer2_chain_t *chain)
1405 {
1406         atomic_add_int(&chain->lockcnt, 1);
1407         hammer2_chain_unlock(chain);
1408 }
1409
1410 /*
1411  * Helper to obtain the blockref[] array base and count for a chain.
1412  *
1413  * XXX Not widely used yet, various use cases need to be validated and
1414  *     converted to use this function.
1415  */
1416 static
1417 hammer2_blockref_t *
1418 hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp)
1419 {
1420         hammer2_blockref_t *base;
1421         int count;
1422
1423         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1424                 base = NULL;
1425
1426                 switch(parent->bref.type) {
1427                 case HAMMER2_BREF_TYPE_INODE:
1428                         count = HAMMER2_SET_COUNT;
1429                         break;
1430                 case HAMMER2_BREF_TYPE_INDIRECT:
1431                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1432                         count = parent->bytes / sizeof(hammer2_blockref_t);
1433                         break;
1434                 case HAMMER2_BREF_TYPE_VOLUME:
1435                         count = HAMMER2_SET_COUNT;
1436                         break;
1437                 case HAMMER2_BREF_TYPE_FREEMAP:
1438                         count = HAMMER2_SET_COUNT;
1439                         break;
1440                 default:
1441                         panic("hammer2_chain_base_and_count: "
1442                               "unrecognized blockref type: %d",
1443                               parent->bref.type);
1444                         count = 0;
1445                         break;
1446                 }
1447         } else {
1448                 switch(parent->bref.type) {
1449                 case HAMMER2_BREF_TYPE_INODE:
1450                         base = &parent->data->ipdata.u.blockset.blockref[0];
1451                         count = HAMMER2_SET_COUNT;
1452                         break;
1453                 case HAMMER2_BREF_TYPE_INDIRECT:
1454                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1455                         base = &parent->data->npdata[0];
1456                         count = parent->bytes / sizeof(hammer2_blockref_t);
1457                         break;
1458                 case HAMMER2_BREF_TYPE_VOLUME:
1459                         base = &parent->data->voldata.
1460                                         sroot_blockset.blockref[0];
1461                         count = HAMMER2_SET_COUNT;
1462                         break;
1463                 case HAMMER2_BREF_TYPE_FREEMAP:
1464                         base = &parent->data->blkset.blockref[0];
1465                         count = HAMMER2_SET_COUNT;
1466                         break;
1467                 default:
1468                         panic("hammer2_chain_base_and_count: "
1469                               "unrecognized blockref type: %d",
1470                               parent->bref.type);
1471                         base = NULL;
1472                         count = 0;
1473                         break;
1474                 }
1475         }
1476         *countp = count;
1477
1478         return base;
1479 }
1480
1481 /*
1482  * This counts the number of live blockrefs in a block array and
1483  * also calculates the point at which all remaining blockrefs are empty.
1484  * This routine can only be called on a live chain.
1485  *
1486  * Caller holds the chain locked, but possibly with a shared lock.  We
1487  * must use an exclusive spinlock to prevent corruption.
1488  *
1489  * NOTE: Flag is not set until after the count is complete, allowing
1490  *       callers to test the flag without holding the spinlock.
1491  *
1492  * NOTE: If base is NULL the related chain is still in the INITIAL
1493  *       state and there are no blockrefs to count.
1494  *
1495  * NOTE: live_count may already have some counts accumulated due to
1496  *       creation and deletion and could even be initially negative.
1497  */
1498 void
1499 hammer2_chain_countbrefs(hammer2_chain_t *chain,
1500                          hammer2_blockref_t *base, int count)
1501 {
1502         hammer2_spin_ex(&chain->core.spin);
1503         if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) {
1504                 if (base) {
1505                         while (--count >= 0) {
1506                                 if (base[count].type != HAMMER2_BREF_TYPE_EMPTY)
1507                                         break;
1508                         }
1509                         chain->core.live_zero = count + 1;
1510                         while (count >= 0) {
1511                                 if (base[count].type != HAMMER2_BREF_TYPE_EMPTY)
1512                                         atomic_add_int(&chain->core.live_count,
1513                                                        1);
1514                                 --count;
1515                         }
1516                 } else {
1517                         chain->core.live_zero = 0;
1518                 }
1519                 /* else do not modify live_count */
1520                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS);
1521         }
1522         hammer2_spin_unex(&chain->core.spin);
1523 }
1524
1525 /*
1526  * Resize the chain's physical storage allocation in-place.  This function does
1527  * not usually adjust the data pointer and must be followed by (typically) a
1528  * hammer2_chain_modify() call to copy any old data over and adjust the
1529  * data pointer.
1530  *
1531  * Chains can be resized smaller without reallocating the storage.  Resizing
1532  * larger will reallocate the storage.  Excess or prior storage is reclaimed
1533  * asynchronously at a later time.
1534  *
1535  * An nradix value of 0 is special-cased to mean that the storage should
1536  * be disassociated, that is the chain is being resized to 0 bytes (not 1
1537  * byte).
1538  *
1539  * Must be passed an exclusively locked parent and chain.
1540  *
1541  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
1542  * to avoid instantiating a device buffer that conflicts with the vnode data
1543  * buffer.  However, because H2 can compress or encrypt data, the chain may
1544  * have a dio assigned to it in those situations, and they do not conflict.
1545  *
1546  * XXX return error if cannot resize.
1547  */
1548 int
1549 hammer2_chain_resize(hammer2_chain_t *chain,
1550                      hammer2_tid_t mtid, hammer2_off_t dedup_off,
1551                      int nradix, int flags)
1552 {
1553         hammer2_dev_t *hmp;
1554         size_t obytes;
1555         size_t nbytes;
1556         int error;
1557
1558         hmp = chain->hmp;
1559
1560         /*
1561          * Only data and indirect blocks can be resized for now.
1562          * (The volu root, inodes, and freemap elements use a fixed size).
1563          */
1564         KKASSERT(chain != &hmp->vchain);
1565         KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
1566                  chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
1567                  chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);
1568
1569         /*
1570          * Nothing to do if the element is already the proper size
1571          */
1572         obytes = chain->bytes;
1573         nbytes = (nradix) ? (1U << nradix) : 0;
1574         if (obytes == nbytes)
1575                 return (chain->error);
1576
1577         /*
1578          * Make sure the old data is instantiated so we can copy it.  If this
1579          * is a data block, the device data may be superfluous since the data
1580          * might be in a logical block, but compressed or encrypted data is
1581          * another matter.
1582          *
1583          * NOTE: The modify will set BMAPUPD for us if BMAPPED is set.
1584          */
1585         error = hammer2_chain_modify(chain, mtid, dedup_off, 0);
1586         if (error)
1587                 return error;
1588
1589         /*
1590          * Reallocate the block, even if making it smaller (because different
1591          * block sizes may be in different regions).
1592          *
1593          * NOTE: Operation does not copy the data and may only be used
1594          *       to resize data blocks in-place, or directory entry blocks
1595          *       which are about to be modified in some manner.
1596          */
1597         error = hammer2_freemap_alloc(chain, nbytes);
1598         if (error)
1599                 return error;
1600
1601         chain->bytes = nbytes;
1602
1603         /*
1604          * We don't want the followup chain_modify() to try to copy data
1605          * from the old (wrong-sized) buffer.  It won't know how much to
1606          * copy.  This case should only occur during writes when the
1607          * originator already has the data to write in-hand.
1608          */
1609         if (chain->dio) {
1610                 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
1611                          chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);
1612                 hammer2_io_brelse(&chain->dio);
1613                 chain->data = NULL;
1614         }
1615         return (chain->error);
1616 }
1617
1618 /*
1619  * Set the chain modified so its data can be changed by the caller, or
1620  * install deduplicated data.  The caller must call this routine for each
1621  * set of modifications it makes, even if the chain is already flagged
1622  * MODIFIED.
1623  *
1624  * Sets bref.modify_tid to mtid only if mtid != 0.  Note that bref.modify_tid
1625  * is a CLC (cluster level change) field and is not updated by parent
1626  * propagation during a flush.
1627  *
1628  * Returns an appropriate HAMMER2_ERROR_* code, which will generally reflect
1629  * chain->error except for HAMMER2_ERROR_ENOSPC.  If the allocation fails
1630  * due to no space available, HAMMER2_ERROR_ENOSPC is returned and the chain
1631  * remains unmodified with its old data ref intact and chain->error
1632  * unchanged.
1633  *
1634  *                               Dedup Handling
1635  *
1636  * If the DEDUPABLE flag is set in the chain the storage must be reallocated
1637  * even if the chain is still flagged MODIFIED.  In this case the chain's
1638  * DEDUPABLE flag will be cleared once the new storage has been assigned.
1639  *
1640  * If the caller passes a non-zero dedup_off we will use it to assign the
1641  * new storage.  The MODIFIED flag will be *CLEARED* in this case, and
1642  * DEDUPABLE will be set (NOTE: the UPDATE flag is always set).  The caller
1643  * must not modify the data content upon return.
1644  */
1645 int
1646 hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
1647                      hammer2_off_t dedup_off, int flags)
1648 {
1649         hammer2_blockref_t obref;
1650         hammer2_dev_t *hmp;
1651         hammer2_io_t *dio;
1652         int error;
1653         int wasinitial;
1654         int setmodified;
1655         int setupdate;
1656         int newmod;
1657         char *bdata;
1658
1659         hmp = chain->hmp;
1660         obref = chain->bref;
1661         KKASSERT(chain->lock.mtx_lock & MTX_EXCLUSIVE);
1662
1663         /*
1664          * Data is not optional for freemap chains (we must always be sure
1665          * to copy the data on COW storage allocations).
1666          */
1667         if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
1668             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
1669                 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
1670                          (flags & HAMMER2_MODIFY_OPTDATA) == 0);
1671         }
1672
1673         /*
1674          * Data must be resolved if already assigned, unless explicitly
1675          * flagged otherwise.  If we cannot safety load the data the
1676          * modification fails and we return early.
1677          */
1678         if (chain->data == NULL && chain->bytes != 0 &&
1679             (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
1680             (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
1681                 hammer2_chain_load_data(chain);
1682                 if (chain->error)
1683                         return (chain->error);
1684         }
1685         error = 0;
1686
1687         /*
1688          * Set MODIFIED to indicate that the chain has been modified.  A new
1689          * allocation is required when modifying a chain.
1690          *
1691          * Set UPDATE to ensure that the blockref is updated in the parent.
1692          *
1693          * If MODIFIED is already set determine if we can reuse the assigned
1694          * data block or if we need a new data block.
1695          */
1696         if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
1697                 /*
1698                  * Must set modified bit.
1699                  */
1700                 atomic_add_long(&hammer2_count_modified_chains, 1);
1701                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
1702                 hammer2_pfs_memory_inc(chain->pmp);  /* can be NULL */
1703                 setmodified = 1;
1704
1705                 /*
1706                  * We may be able to avoid a copy-on-write if the chain's
1707                  * check mode is set to NONE and the chain's current
1708                  * modify_tid is beyond the last explicit snapshot tid.
1709                  *
1710                  * This implements HAMMER2's overwrite-in-place feature.
1711                  *
1712                  * NOTE! This data-block cannot be used as a de-duplication
1713                  *       source when the check mode is set to NONE.
1714                  */
1715                 if ((chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
1716                      chain->bref.type == HAMMER2_BREF_TYPE_DIRENT) &&
1717                     (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 &&
1718                     (chain->flags & HAMMER2_CHAIN_DEDUPABLE) == 0 &&
1719                     HAMMER2_DEC_CHECK(chain->bref.methods) ==
1720                      HAMMER2_CHECK_NONE &&
1721                     chain->pmp &&
1722                     chain->bref.modify_tid >
1723                      chain->pmp->iroot->meta.pfs_lsnap_tid) {
1724                         /*
1725                          * Sector overwrite allowed.
1726                          */
1727                         newmod = 0;
1728                 } else if ((hmp->hflags & HMNT2_EMERG) &&
1729                            chain->pmp &&
1730                            chain->bref.modify_tid >
1731                             chain->pmp->iroot->meta.pfs_lsnap_tid) {
1732                         /*
1733                          * If in emergency delete mode then do a modify-in-
1734                          * place on any chain type belonging to the PFS as
1735                          * long as it doesn't mess up a snapshot.  We might
1736                          * be forced to do this anyway a little further down
1737                          * in the code if the allocation fails.
1738                          *
1739                          * Also note that in emergency mode, these modify-in-
1740                          * place operations are NOT SAFE.  A storage failure,
1741                          * power failure, or panic can corrupt the filesystem.
1742                          */
1743                         newmod = 0;
1744                 } else {
1745                         /*
1746                          * Sector overwrite not allowed, must copy-on-write.
1747                          */
1748                         newmod = 1;
1749                 }
1750         } else if (chain->flags & HAMMER2_CHAIN_DEDUPABLE) {
1751                 /*
1752                  * If the modified chain was registered for dedup we need
1753                  * a new allocation.  This only happens for delayed-flush
1754                  * chains (i.e. which run through the front-end buffer
1755                  * cache).
1756                  */
1757                 newmod = 1;
1758                 setmodified = 0;
1759         } else {
1760                 /*
1761                  * Already flagged modified, no new allocation is needed.
1762                  */
1763                 newmod = 0;
1764                 setmodified = 0;
1765         }
1766
1767         /*
1768          * Flag parent update required.
1769          */
1770         if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) {
1771                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
1772                 setupdate = 1;
1773         } else {
1774                 setupdate = 0;
1775         }
1776
1777         /*
1778          * The XOP code returns held but unlocked focus chains.  This
1779          * prevents the chain from being destroyed but does not prevent
1780          * it from being modified.  diolk is used to interlock modifications
1781          * against XOP frontend accesses to the focus.
1782          *
1783          * This allows us to theoretically avoid deadlocking the frontend
1784          * if one of the backends lock up by not formally locking the
1785          * focused chain in the frontend.  In addition, the synchronization
1786          * code relies on this mechanism to avoid deadlocking concurrent
1787          * synchronization threads.
1788          */
1789         lockmgr(&chain->diolk, LK_EXCLUSIVE);
1790
1791         /*
1792          * The modification or re-modification requires an allocation and
1793          * possible COW.  If an error occurs, the previous content and data
1794          * reference is retained and the modification fails.
1795          *
1796          * If dedup_off is non-zero, the caller is requesting a deduplication
1797          * rather than a modification.  The MODIFIED bit is not set and the
1798          * data offset is set to the deduplication offset.  The data cannot
1799          * be modified.
1800          *
1801          * NOTE: The dedup offset is allowed to be in a partially free state
1802          *       and we must be sure to reset it to a fully allocated state
1803          *       to force two bulkfree passes to free it again.
1804          *
1805          * NOTE: Only applicable when chain->bytes != 0.
1806          *
1807          * XXX can a chain already be marked MODIFIED without a data
1808          * assignment?  If not, assert here instead of testing the case.
1809          */
1810         if (chain != &hmp->vchain && chain != &hmp->fchain &&
1811             chain->bytes) {
1812                 if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
1813                     newmod
1814                 ) {
1815                         /*
1816                          * NOTE: We do not have to remove the dedup
1817                          *       registration because the area is still
1818                          *       allocated and the underlying DIO will
1819                          *       still be flushed.
1820                          */
1821                         if (dedup_off) {
1822                                 chain->bref.data_off = dedup_off;
1823                                 chain->bytes = 1 << (dedup_off &
1824                                                      HAMMER2_OFF_MASK_RADIX);
1825                                 chain->error = 0;
1826                                 atomic_clear_int(&chain->flags,
1827                                                  HAMMER2_CHAIN_MODIFIED);
1828                                 atomic_add_long(&hammer2_count_modified_chains,
1829                                                 -1);
1830                                 if (chain->pmp) {
1831                                         hammer2_pfs_memory_wakeup(
1832                                                 chain->pmp, -1);
1833                                 }
1834                                 hammer2_freemap_adjust(hmp, &chain->bref,
1835                                                 HAMMER2_FREEMAP_DORECOVER);
1836                                 atomic_set_int(&chain->flags,
1837                                                 HAMMER2_CHAIN_DEDUPABLE);
1838                         } else {
1839                                 error = hammer2_freemap_alloc(chain,
1840                                                               chain->bytes);
1841                                 atomic_clear_int(&chain->flags,
1842                                                 HAMMER2_CHAIN_DEDUPABLE);
1843
1844                                 /*
1845                                  * If we are unable to allocate a new block
1846                                  * but we are in emergency mode, issue a
1847                                  * warning to the console and reuse the same
1848                                  * block.
1849                                  *
1850                                  * We behave as if the allocation were
1851                                  * successful.
1852                                  *
1853                                  * THIS IS IMPORTANT: These modifications
1854                                  * are virtually guaranteed to corrupt any
1855                                  * snapshots related to this filesystem.
1856                                  */
1857                                 if (error && (hmp->hflags & HMNT2_EMERG)) {
1858                                         error = 0;
1859                                         chain->bref.flags |=
1860                                                 HAMMER2_BREF_FLAG_EMERG_MIP;
1861
1862                                         krateprintf(&krate_h2em,
1863                                             "hammer2: Emergency Mode WARNING: "
1864                                             "Operation will likely corrupt "
1865                                             "related snapshot: "
1866                                             "%016jx.%02x key=%016jx\n",
1867                                             chain->bref.data_off,
1868                                             chain->bref.type,
1869                                             chain->bref.key);
1870                                 } else if (error == 0) {
1871                                         chain->bref.flags &=
1872                                                 ~HAMMER2_BREF_FLAG_EMERG_MIP;
1873                                 }
1874                         }
1875                 }
1876         }
1877
1878         /*
1879          * Stop here if error.  We have to undo any flag bits we might
1880          * have set above.
1881          */
1882         if (error) {
1883                 if (setmodified) {
1884                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
1885                         atomic_add_long(&hammer2_count_modified_chains, -1);
1886                         if (chain->pmp)
1887                                 hammer2_pfs_memory_wakeup(chain->pmp, -1);
1888                 }
1889                 if (setupdate) {
1890                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
1891                 }
1892                 lockmgr(&chain->diolk, LK_RELEASE);
1893
1894                 return error;
1895         }
1896
1897         /*
1898          * Update mirror_tid and modify_tid.  modify_tid is only updated
1899          * if not passed as zero (during flushes, parent propagation passes
1900          * the value 0).
1901          *
1902          * NOTE: chain->pmp could be the device spmp.
1903          */
1904         chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1;
1905         if (mtid)
1906                 chain->bref.modify_tid = mtid;
1907
1908         /*
1909          * Set BMAPUPD to tell the flush code that an existing blockmap entry
1910          * requires updating as well as to tell the delete code that the
1911          * chain's blockref might not exactly match (in terms of physical size
1912          * or block offset) the one in the parent's blocktable.  The base key
1913          * of course will still match.
1914          */
1915         if (chain->flags & HAMMER2_CHAIN_BMAPPED)
1916                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD);
1917
1918         /*
1919          * Short-cut data block handling when the caller does not need an
1920          * actual data reference to (aka OPTDATA), as long as the chain does
1921          * not already have a data pointer to the data and no de-duplication
1922          * occurred.
1923          *
1924          * This generally means that the modifications are being done via the
1925          * logical buffer cache.
1926          *
1927          * NOTE: If deduplication occurred we have to run through the data
1928          *       stuff to clear INITIAL, and the caller will likely want to
1929          *       assign the check code anyway.  Leaving INITIAL set on a
1930          *       dedup can be deadly (it can cause the block to be zero'd!).
1931          *
1932          * This code also handles bytes == 0 (most dirents).
1933          */
1934         if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
1935             (flags & HAMMER2_MODIFY_OPTDATA) &&
1936             chain->data == NULL) {
1937                 if (dedup_off == 0) {
1938                         KKASSERT(chain->dio == NULL);
1939                         goto skip2;
1940                 }
1941         }
1942
1943         /*
1944          * Clearing the INITIAL flag (for indirect blocks) indicates that
1945          * we've processed the uninitialized storage allocation.
1946          *
1947          * If this flag is already clear we are likely in a copy-on-write
1948          * situation but we have to be sure NOT to bzero the storage if
1949          * no data is present.
1950          *
1951          * Clearing of NOTTESTED is allowed if the MODIFIED bit is set,
1952          */
1953         if (chain->flags & HAMMER2_CHAIN_INITIAL) {
1954                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1955                 wasinitial = 1;
1956         } else {
1957                 wasinitial = 0;
1958         }
1959
1960         /*
1961          * Instantiate data buffer and possibly execute COW operation
1962          */
1963         switch(chain->bref.type) {
1964         case HAMMER2_BREF_TYPE_VOLUME:
1965         case HAMMER2_BREF_TYPE_FREEMAP:
1966                 /*
1967                  * The data is embedded, no copy-on-write operation is
1968                  * needed.
1969                  */
1970                 KKASSERT(chain->dio == NULL);
1971                 break;
1972         case HAMMER2_BREF_TYPE_DIRENT:
1973                 /*
1974                  * The data might be fully embedded.
1975                  */
1976                 if (chain->bytes == 0) {
1977                         KKASSERT(chain->dio == NULL);
1978                         break;
1979                 }
1980                 /* fall through */
1981         case HAMMER2_BREF_TYPE_INODE:
1982         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1983         case HAMMER2_BREF_TYPE_DATA:
1984         case HAMMER2_BREF_TYPE_INDIRECT:
1985         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1986                 /*
1987                  * Perform the copy-on-write operation
1988                  *
1989                  * zero-fill or copy-on-write depending on whether
1990                  * chain->data exists or not and set the dirty state for
1991                  * the new buffer.  hammer2_io_new() will handle the
1992                  * zero-fill.
1993                  *
1994                  * If a dedup_off was supplied this is an existing block
1995                  * and no COW, copy, or further modification is required.
1996                  */
1997                 KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);
1998
1999                 if (wasinitial && dedup_off == 0) {
2000                         error = hammer2_io_new(hmp, chain->bref.type,
2001                                                chain->bref.data_off,
2002                                                chain->bytes, &dio);
2003                 } else {
2004                         error = hammer2_io_bread(hmp, chain->bref.type,
2005                                                  chain->bref.data_off,
2006                                                  chain->bytes, &dio);
2007                 }
2008                 hammer2_adjreadcounter(chain->bref.type, chain->bytes);
2009
2010                 /*
2011                  * If an I/O error occurs make sure callers cannot accidently
2012                  * modify the old buffer's contents and corrupt the filesystem.
2013                  *
2014                  * NOTE: hammer2_io_data() call issues bkvasync()
2015                  */
2016                 if (error) {
2017                         kprintf("hammer2_chain_modify: hmp=%p I/O error\n",
2018                                 hmp);
2019                         chain->error = HAMMER2_ERROR_EIO;
2020                         hammer2_io_brelse(&dio);
2021                         hammer2_io_brelse(&chain->dio);
2022                         chain->data = NULL;
2023                         break;
2024                 }
2025                 chain->error = 0;
2026                 bdata = hammer2_io_data(dio, chain->bref.data_off);
2027
2028                 if (chain->data) {
2029                         /*
2030                          * COW (unless a dedup).
2031                          */
2032                         KKASSERT(chain->dio != NULL);
2033                         if (chain->data != (void *)bdata && dedup_off == 0) {
2034                                 bcopy(chain->data, bdata, chain->bytes);
2035                         }
2036                 } else if (wasinitial == 0 && dedup_off == 0) {
2037                         /*
2038                          * We have a problem.  We were asked to COW but
2039                          * we don't have any data to COW with!
2040                          */
2041                         panic("hammer2_chain_modify: having a COW %p\n",
2042                               chain);
2043                 }
2044
2045                 /*
2046                  * Retire the old buffer, replace with the new.  Dirty or
2047                  * redirty the new buffer.
2048                  *
2049                  * WARNING! The system buffer cache may have already flushed
2050                  *          the buffer, so we must be sure to [re]dirty it
2051                  *          for further modification.
2052                  *
2053                  *          If dedup_off was supplied, the caller is not
2054                  *          expected to make any further modification to the
2055                  *          buffer.
2056                  *
2057                  * WARNING! hammer2_get_gdata() assumes dio never transitions
2058                  *          through NULL in order to optimize away unnecessary
2059                  *          diolk operations.
2060                  */
2061                 {
2062                         hammer2_io_t *tio;
2063
2064                         if ((tio = chain->dio) != NULL)
2065                                 hammer2_io_bqrelse(&tio);
2066                         chain->data = (void *)bdata;
2067                         chain->dio = dio;
2068                         if (dedup_off == 0)
2069                                 hammer2_io_setdirty(dio);
2070                 }
2071                 break;
2072         default:
2073                 panic("hammer2_chain_modify: illegal non-embedded type %d",
2074                       chain->bref.type);
2075                 break;
2076
2077         }
2078 skip2:
2079         /*
2080          * setflush on parent indicating that the parent must recurse down
2081          * to us.  Do not call on chain itself which might already have it
2082          * set.
2083          */
2084         if (chain->parent)
2085                 hammer2_chain_setflush(chain->parent);
2086         lockmgr(&chain->diolk, LK_RELEASE);
2087
2088         return (chain->error);
2089 }
2090
2091 /*
2092  * Modify the chain associated with an inode.
2093  */
2094 int
2095 hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain,
2096                         hammer2_tid_t mtid, int flags)
2097 {
2098         int error;
2099
2100         hammer2_inode_modify(ip);
2101         error = hammer2_chain_modify(chain, mtid, 0, flags);
2102
2103         return error;
2104 }
2105
2106 /*
2107  * Volume header data locks
2108  */
2109 void
2110 hammer2_voldata_lock(hammer2_dev_t *hmp)
2111 {
2112         lockmgr(&hmp->vollk, LK_EXCLUSIVE);
2113 }
2114
2115 void
2116 hammer2_voldata_unlock(hammer2_dev_t *hmp)
2117 {
2118         lockmgr(&hmp->vollk, LK_RELEASE);
2119 }
2120
2121 void
2122 hammer2_voldata_modify(hammer2_dev_t *hmp)
2123 {
2124         if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) {
2125                 atomic_add_long(&hammer2_count_modified_chains, 1);
2126                 atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
2127                 hammer2_pfs_memory_inc(hmp->vchain.pmp);
2128         }
2129 }
2130
2131 /*
2132  * This function returns the chain at the nearest key within the specified
2133  * range.  The returned chain will be referenced but not locked.
2134  *
2135  * This function will recurse through chain->rbtree as necessary and will
2136  * return a *key_nextp suitable for iteration.  *key_nextp is only set if
2137  * the iteration value is less than the current value of *key_nextp.
2138  *
2139  * The caller should use (*key_nextp) to calculate the actual range of
2140  * the returned element, which will be (key_beg to *key_nextp - 1), because
2141  * there might be another element which is superior to the returned element
2142  * and overlaps it.
2143  *
2144  * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
2145  * chains continue to be returned.  On EOF (*key_nextp) may overflow since
2146  * it will wind up being (key_end + 1).
2147  *
2148  * WARNING!  Must be called with child's spinlock held.  Spinlock remains
2149  *           held through the operation.
2150  */
2151 struct hammer2_chain_find_info {
2152         hammer2_chain_t         *best;
2153         hammer2_key_t           key_beg;
2154         hammer2_key_t           key_end;
2155         hammer2_key_t           key_next;
2156 };
2157
2158 static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
2159 static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);
2160
2161 static
2162 hammer2_chain_t *
2163 hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
2164                           hammer2_key_t key_beg, hammer2_key_t key_end)
2165 {
2166         struct hammer2_chain_find_info info;
2167
2168         info.best = NULL;
2169         info.key_beg = key_beg;
2170         info.key_end = key_end;
2171         info.key_next = *key_nextp;
2172
2173         RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
2174                 hammer2_chain_find_cmp, hammer2_chain_find_callback,
2175                 &info);
2176         *key_nextp = info.key_next;
2177 #if 0
2178         kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
2179                 parent, key_beg, key_end, *key_nextp);
2180 #endif
2181
2182         return (info.best);
2183 }
2184
2185 static
2186 int
2187 hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
2188 {
2189         struct hammer2_chain_find_info *info = data;
2190         hammer2_key_t child_beg;
2191         hammer2_key_t child_end;
2192
2193         child_beg = child->bref.key;
2194         child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;
2195
2196         if (child_end < info->key_beg)
2197                 return(-1);
2198         if (child_beg > info->key_end)
2199                 return(1);
2200         return(0);
2201 }
2202
2203 static
2204 int
2205 hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
2206 {
2207         struct hammer2_chain_find_info *info = data;
2208         hammer2_chain_t *best;
2209         hammer2_key_t child_end;
2210
2211         /*
2212          * WARNING! Layerq is scanned forwards, exact matches should keep
2213          *          the existing info->best.
2214          */
2215         if ((best = info->best) == NULL) {
2216                 /*
2217                  * No previous best.  Assign best
2218                  */
2219                 info->best = child;
2220         } else if (best->bref.key <= info->key_beg &&
2221                    child->bref.key <= info->key_beg) {
2222                 /*
2223                  * Illegal overlap.
2224                  */
2225                 KKASSERT(0);
2226                 /*info->best = child;*/
2227         } else if (child->bref.key < best->bref.key) {
2228                 /*
2229                  * Child has a nearer key and best is not flush with key_beg.
2230                  * Set best to child.  Truncate key_next to the old best key.
2231                  */
2232                 info->best = child;
2233                 if (info->key_next > best->bref.key || info->key_next == 0)
2234                         info->key_next = best->bref.key;
2235         } else if (child->bref.key == best->bref.key) {
2236                 /*
2237                  * If our current best is flush with the child then this
2238                  * is an illegal overlap.
2239                  *
2240                  * key_next will automatically be limited to the smaller of
2241                  * the two end-points.
2242                  */
2243                 KKASSERT(0);
2244                 info->best = child;
2245         } else {
2246                 /*
2247                  * Keep the current best but truncate key_next to the child's
2248                  * base.
2249                  *
2250                  * key_next will also automatically be limited to the smaller
2251                  * of the two end-points (probably not necessary for this case
2252                  * but we do it anyway).
2253                  */
2254                 if (info->key_next > child->bref.key || info->key_next == 0)
2255                         info->key_next = child->bref.key;
2256         }
2257
2258         /*
2259          * Always truncate key_next based on child's end-of-range.
2260          */
2261         child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
2262         if (child_end && (info->key_next > child_end || info->key_next == 0))
2263                 info->key_next = child_end;
2264
2265         return(0);
2266 }
2267
2268 /*
2269  * Retrieve the specified chain from a media blockref, creating the
2270  * in-memory chain structure which reflects it.  The returned chain is
2271  * held and locked according to (how) (HAMMER2_RESOLVE_*).  The caller must
2272  * handle crc-checks and so forth, and should check chain->error before
2273  * assuming that the data is good.
2274  *
2275  * To handle insertion races pass the INSERT_RACE flag along with the
2276  * generation number of the core.  NULL will be returned if the generation
2277  * number changes before we have a chance to insert the chain.  Insert
2278  * races can occur because the parent might be held shared.
2279  *
2280  * Caller must hold the parent locked shared or exclusive since we may
2281  * need the parent's bref array to find our block.
2282  *
2283  * WARNING! chain->pmp is always set to NULL for any chain representing
2284  *          part of the super-root topology.
2285  */
2286 hammer2_chain_t *
2287 hammer2_chain_get(hammer2_chain_t *parent, int generation,
2288                   hammer2_blockref_t *bref, int how)
2289 {
2290         hammer2_dev_t *hmp = parent->hmp;
2291         hammer2_chain_t *chain;
2292         int error;
2293
2294         /*
2295          * Allocate a chain structure representing the existing media
2296          * entry.  Resulting chain has one ref and is not locked.
2297          */
2298         if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
2299                 chain = hammer2_chain_alloc(hmp, NULL, bref);
2300         else
2301                 chain = hammer2_chain_alloc(hmp, parent->pmp, bref);
2302         /* ref'd chain returned */
2303
2304         /*
2305          * Flag that the chain is in the parent's blockmap so delete/flush
2306          * knows what to do with it.
2307          */
2308         atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);
2309
2310         /*
2311          * chain must be locked to avoid unexpected ripouts
2312          */
2313         hammer2_chain_lock(chain, how);
2314
2315         /*
2316          * Link the chain into its parent.  A spinlock is required to safely
2317          * access the RBTREE, and it is possible to collide with another
2318          * hammer2_chain_get() operation because the caller might only hold
2319          * a shared lock on the parent.
2320          *
2321          * NOTE: Get races can occur quite often when we distribute
2322          *       asynchronous read-aheads across multiple threads.
2323          */
2324         KKASSERT(parent->refs > 0);
2325         error = hammer2_chain_insert(parent, chain,
2326                                      HAMMER2_CHAIN_INSERT_SPIN |
2327                                      HAMMER2_CHAIN_INSERT_RACE,
2328                                      generation);
2329         if (error) {
2330                 KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
2331                 /*kprintf("chain %p get race\n", chain);*/
2332                 hammer2_chain_unlock(chain);
2333                 hammer2_chain_drop(chain);
2334                 chain = NULL;
2335         } else {
2336                 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
2337         }
2338
2339         /*
2340          * Return our new chain referenced but not locked, or NULL if
2341          * a race occurred.
2342          */
2343         return (chain);
2344 }
2345
2346 /*
2347  * Lookup initialization/completion API
2348  */
2349 hammer2_chain_t *
2350 hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
2351 {
2352         hammer2_chain_ref(parent);
2353         if (flags & HAMMER2_LOOKUP_SHARED) {
2354                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
2355                                            HAMMER2_RESOLVE_SHARED);
2356         } else {
2357                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2358         }
2359         return (parent);
2360 }
2361
2362 void
2363 hammer2_chain_lookup_done(hammer2_chain_t *parent)
2364 {
2365         if (parent) {
2366                 hammer2_chain_unlock(parent);
2367                 hammer2_chain_drop(parent);
2368         }
2369 }
2370
2371 /*
2372  * Take the locked chain and return a locked parent.  The chain remains
2373  * locked on return, but may have to be temporarily unlocked to acquire
2374  * the parent.  Because of this, (chain) must be stable and cannot be
2375  * deleted while it was temporarily unlocked (typically means that (chain)
2376  * is an inode).
2377  *
2378  * Pass HAMMER2_RESOLVE_* flags in flags.
2379  *
2380  * This will work even if the chain is errored, and the caller can check
2381  * parent->error on return if desired since the parent will be locked.
2382  *
2383  * This function handles the lock order reversal.
2384  */
2385 hammer2_chain_t *
2386 hammer2_chain_getparent(hammer2_chain_t *chain, int flags)
2387 {
2388         hammer2_chain_t *parent;
2389
2390         /*
2391          * Be careful of order, chain must be unlocked before parent
2392          * is locked below to avoid a deadlock.  Try it trivially first.
2393          */
2394         parent = chain->parent;
2395         if (parent == NULL)
2396                 panic("hammer2_chain_getparent: no parent");
2397         hammer2_chain_ref(parent);
2398         if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0)
2399                 return parent;
2400
2401         for (;;) {
2402                 hammer2_chain_unlock(chain);
2403                 hammer2_chain_lock(parent, flags);
2404                 hammer2_chain_lock(chain, flags);
2405
2406                 /*
2407                  * Parent relinking races are quite common.  We have to get
2408                  * it right or we will blow up the block table.
2409                  */
2410                 if (chain->parent == parent)
2411                         break;
2412                 hammer2_chain_unlock(parent);
2413                 hammer2_chain_drop(parent);
2414                 cpu_ccfence();
2415                 parent = chain->parent;
2416                 if (parent == NULL)
2417                         panic("hammer2_chain_getparent: no parent");
2418                 hammer2_chain_ref(parent);
2419         }
2420         return parent;
2421 }
2422
2423 /*
2424  * Take the locked chain and return a locked parent.  The chain is unlocked
2425  * and dropped.  *chainp is set to the returned parent as a convenience.
2426  * Pass HAMMER2_RESOLVE_* flags in flags.
2427  *
2428  * This will work even if the chain is errored, and the caller can check
2429  * parent->error on return if desired since the parent will be locked.
2430  *
2431  * The chain does NOT need to be stable.  We use a tracking structure
2432  * to track the expected parent if the chain is deleted out from under us.
2433  *
2434  * This function handles the lock order reversal.
2435  */
2436 hammer2_chain_t *
2437 hammer2_chain_repparent(hammer2_chain_t **chainp, int flags)
2438 {
2439         hammer2_chain_t *chain;
2440         hammer2_chain_t *parent;
2441         struct hammer2_reptrack reptrack;
2442         struct hammer2_reptrack **repp;
2443
2444         /*
2445          * Be careful of order, chain must be unlocked before parent
2446          * is locked below to avoid a deadlock.  Try it trivially first.
2447          */
2448         chain = *chainp;
2449         parent = chain->parent;
2450         if (parent == NULL) {
2451                 hammer2_spin_unex(&chain->core.spin);
2452                 panic("hammer2_chain_repparent: no parent");
2453         }
2454         hammer2_chain_ref(parent);
2455         if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0) {
2456                 hammer2_chain_unlock(chain);
2457                 hammer2_chain_drop(chain);
2458                 *chainp = parent;
2459
2460                 return parent;
2461         }
2462
2463         /*
2464          * Ok, now it gets a bit nasty.  There are multiple situations where
2465          * the parent might be in the middle of a deletion, or where the child
2466          * (chain) might be deleted the instant we let go of its lock.
2467          * We can potentially end up in a no-win situation!
2468          *
2469          * In particular, the indirect_maintenance() case can cause these
2470          * situations.
2471          *
2472          * To deal with this we install a reptrack structure in the parent
2473          * This reptrack structure 'owns' the parent ref and will automatically
2474          * migrate to the parent's parent if the parent is deleted permanently.
2475          */
2476         hammer2_spin_init(&reptrack.spin, "h2reptrk");
2477         reptrack.chain = parent;
2478         hammer2_chain_ref(parent);              /* for the reptrack */
2479
2480         hammer2_spin_ex(&parent->core.spin);
2481         reptrack.next = parent->core.reptrack;
2482         parent->core.reptrack = &reptrack;
2483         hammer2_spin_unex(&parent->core.spin);
2484
2485         hammer2_chain_unlock(chain);
2486         hammer2_chain_drop(chain);
2487         chain = NULL;   /* gone */
2488
2489         /*
2490          * At the top of this loop, chain is gone and parent is refd both
2491          * by us explicitly AND via our reptrack.  We are attempting to
2492          * lock parent.
2493          */
2494         for (;;) {
2495                 hammer2_chain_lock(parent, flags);
2496
2497                 if (reptrack.chain == parent)
2498                         break;
2499                 hammer2_chain_unlock(parent);
2500                 hammer2_chain_drop(parent);
2501
2502                 kprintf("hammer2: debug REPTRACK %p->%p\n",
2503                         parent, reptrack.chain);
2504                 hammer2_spin_ex(&reptrack.spin);
2505                 parent = reptrack.chain;
2506                 hammer2_chain_ref(parent);
2507                 hammer2_spin_unex(&reptrack.spin);
2508         }
2509
2510         /*
2511          * Once parent is locked and matches our reptrack, our reptrack
2512          * will be stable and we have our parent.  We can unlink our
2513          * reptrack.
2514          *
2515          * WARNING!  Remember that the chain lock might be shared.  Chains
2516          *           locked shared have stable parent linkages.
2517          */
2518         hammer2_spin_ex(&parent->core.spin);
2519         repp = &parent->core.reptrack;
2520         while (*repp != &reptrack)
2521                 repp = &(*repp)->next;
2522         *repp = reptrack.next;
2523         hammer2_spin_unex(&parent->core.spin);
2524
2525         hammer2_chain_drop(parent);     /* reptrack ref */
2526         *chainp = parent;               /* return parent lock+ref */
2527
2528         return parent;
2529 }
2530
2531 /*
2532  * Dispose of any linked reptrack structures in (chain) by shifting them to
2533  * (parent).  Both (chain) and (parent) must be exclusively locked.
2534  *
2535  * This is interlocked against any children of (chain) on the other side.
2536  * No children so remain as-of when this is called so we can test
2537  * core.reptrack without holding the spin-lock.
2538  *
2539  * Used whenever the caller intends to permanently delete chains related
2540  * to topological recursions (BREF_TYPE_INDIRECT, BREF_TYPE_FREEMAP_NODE),
2541  * where the chains underneath the node being deleted are given a new parent
2542  * above the node being deleted.
2543  */
2544 static
2545 void
2546 hammer2_chain_repchange(hammer2_chain_t *parent, hammer2_chain_t *chain)
2547 {
2548         struct hammer2_reptrack *reptrack;
2549
2550         KKASSERT(chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree));
2551         while (chain->core.reptrack) {
2552                 hammer2_spin_ex(&parent->core.spin);
2553                 hammer2_spin_ex(&chain->core.spin);
2554                 reptrack = chain->core.reptrack;
2555                 if (reptrack == NULL) {
2556                         hammer2_spin_unex(&chain->core.spin);
2557                         hammer2_spin_unex(&parent->core.spin);
2558                         break;
2559                 }
2560                 hammer2_spin_ex(&reptrack->spin);
2561                 chain->core.reptrack = reptrack->next;
2562                 reptrack->chain = parent;
2563                 reptrack->next = parent->core.reptrack;
2564                 parent->core.reptrack = reptrack;
2565                 hammer2_chain_ref(parent);              /* reptrack */
2566
2567                 hammer2_spin_unex(&chain->core.spin);
2568                 hammer2_spin_unex(&parent->core.spin);
2569                 kprintf("hammer2: debug repchange %p %p->%p\n",
2570                         reptrack, chain, parent);
2571                 hammer2_chain_drop(chain);              /* reptrack */
2572         }
2573 }
2574
2575 /*
2576  * Locate the first chain whos key range overlaps (key_beg, key_end) inclusive.
2577  * (*parentp) typically points to an inode but can also point to a related
2578  * indirect block and this function will recurse upwards and find the inode
2579  * or the nearest undeleted indirect block covering the key range.
2580  *
2581  * This function unconditionally sets *errorp, replacing any previous value.
2582  *
2583  * (*parentp) must be exclusive or shared locked (depending on flags) and
2584  * referenced and can be an inode or an existing indirect block within the
2585  * inode.
2586  *
2587  * If (*parent) is errored out, this function will not attempt to recurse
2588  * the radix tree and will return NULL along with an appropriate *errorp.
2589  * If NULL is returned and *errorp is 0, the requested lookup could not be
2590  * located.
2591  *
2592  * On return (*parentp) will be modified to point at the deepest parent chain
2593  * element encountered during the search, as a helper for an insertion or
2594  * deletion.
2595  *
2596  * The new (*parentp) will be locked shared or exclusive (depending on flags),
2597  * and referenced, and the old will be unlocked and dereferenced (no change
2598  * if they are both the same).  This is particularly important if the caller
2599  * wishes to insert a new chain, (*parentp) will be set properly even if NULL
2600  * is returned, as long as no error occurred.
2601  *
2602  * The matching chain will be returned locked according to flags.
2603  *
2604  * --
2605  *
2606  * NULL is returned if no match was found, but (*parentp) will still
2607  * potentially be adjusted.
2608  *
2609  * On return (*key_nextp) will point to an iterative value for key_beg.
2610  * (If NULL is returned (*key_nextp) is set to (key_end + 1)).
2611  *
2612  * This function will also recurse up the chain if the key is not within the
2613  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
2614  * can simply allow (*parentp) to float inside the loop.
2615  *
2616  * NOTE!  chain->data is not always resolved.  By default it will not be
2617  *        resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF.  Use
2618  *        HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/
2619  *        BREF_TYPE_DATA as the device buffer can alias the logical file
2620  *        buffer).
2621  */
2622
2623 hammer2_chain_t *
2624 hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
2625                      hammer2_key_t key_beg, hammer2_key_t key_end,
2626                      int *errorp, int flags)
2627 {
2628         hammer2_dev_t *hmp;
2629         hammer2_chain_t *parent;
2630         hammer2_chain_t *chain;
2631         hammer2_blockref_t *base;
2632         hammer2_blockref_t *bref;
2633         hammer2_blockref_t bsave;
2634         hammer2_key_t scan_beg;
2635         hammer2_key_t scan_end;
2636         int count = 0;
2637         int how_always = HAMMER2_RESOLVE_ALWAYS;
2638         int how_maybe = HAMMER2_RESOLVE_MAYBE;
2639         int how;
2640         int generation;
2641         int maxloops = 300000;
2642         volatile hammer2_mtx_t save_mtx;
2643
2644         if (flags & HAMMER2_LOOKUP_ALWAYS) {
2645                 how_maybe = how_always;
2646                 how = HAMMER2_RESOLVE_ALWAYS;
2647         } else if (flags & HAMMER2_LOOKUP_NODATA) {
2648                 how = HAMMER2_RESOLVE_NEVER;
2649         } else {
2650                 how = HAMMER2_RESOLVE_MAYBE;
2651         }
2652         if (flags & HAMMER2_LOOKUP_SHARED) {
2653                 how_maybe |= HAMMER2_RESOLVE_SHARED;
2654                 how_always |= HAMMER2_RESOLVE_SHARED;
2655                 how |= HAMMER2_RESOLVE_SHARED;
2656         }
2657
2658         /*
2659          * Recurse (*parentp) upward if necessary until the parent completely
2660          * encloses the key range or we hit the inode.
2661          *
2662          * Handle races against the flusher deleting indirect nodes on its
2663          * way back up by continuing to recurse upward past the deletion.
2664          */
2665         parent = *parentp;
2666         hmp = parent->hmp;
2667         *errorp = 0;
2668
2669         while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
2670                parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2671                 scan_beg = parent->bref.key;
2672                 scan_end = scan_beg +
2673                            ((hammer2_key_t)1 << parent->bref.keybits) - 1;
2674                 if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
2675                         if (key_beg >= scan_beg && key_end <= scan_end)
2676                                 break;
2677                 }
2678                 parent = hammer2_chain_repparent(parentp, how_maybe);
2679         }
2680 again:
2681         if (--maxloops == 0)
2682                 panic("hammer2_chain_lookup: maxloops");
2683
2684         /*
2685          * MATCHIND case that does not require parent->data (do prior to
2686          * parent->error check).
2687          */
2688         switch(parent->bref.type) {
2689         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2690         case HAMMER2_BREF_TYPE_INDIRECT:
2691                 if (flags & HAMMER2_LOOKUP_MATCHIND) {
2692                         scan_beg = parent->bref.key;
2693                         scan_end = scan_beg +
2694                                ((hammer2_key_t)1 << parent->bref.keybits) - 1;
2695                         if (key_beg == scan_beg && key_end == scan_end) {
2696                                 chain = parent;
2697                                 hammer2_chain_ref(chain);
2698                                 hammer2_chain_lock(chain, how_maybe);
2699                                 *key_nextp = scan_end + 1;
2700                                 goto done;
2701                         }
2702                 }
2703                 break;
2704         default:
2705                 break;
2706         }
2707
2708         /*
2709          * No lookup is possible if the parent is errored.  We delayed
2710          * this check as long as we could to ensure that the parent backup,
2711          * embedded data, and MATCHIND code could still execute.
2712          */
2713         if (parent->error) {
2714                 *errorp = parent->error;
2715                 return NULL;
2716         }
2717
2718         /*
2719          * Locate the blockref array.  Currently we do a fully associative
2720          * search through the array.
2721          */
2722         switch(parent->bref.type) {
2723         case HAMMER2_BREF_TYPE_INODE:
2724                 /*
2725                  * Special shortcut for embedded data returns the inode
2726                  * itself.  Callers must detect this condition and access
2727                  * the embedded data (the strategy code does this for us).
2728                  *
2729                  * This is only applicable to regular files and softlinks.
2730                  *
2731                  * We need a second lock on parent.  Since we already have
2732                  * a lock we must pass LOCKAGAIN to prevent unexpected
2733                  * blocking (we don't want to block on a second shared
2734                  * ref if an exclusive lock is pending)
2735                  */
2736                 if (parent->data->ipdata.meta.op_flags &
2737                     HAMMER2_OPFLAG_DIRECTDATA) {
2738                         if (flags & HAMMER2_LOOKUP_NODIRECT) {
2739                                 chain = NULL;
2740                                 *key_nextp = key_end + 1;
2741                                 goto done;
2742                         }
2743                         hammer2_chain_ref(parent);
2744                         hammer2_chain_lock(parent, how_always |
2745                                                    HAMMER2_RESOLVE_LOCKAGAIN);
2746                         *key_nextp = key_end + 1;
2747                         return (parent);
2748                 }
2749                 base = &parent->data->ipdata.u.blockset.blockref[0];
2750                 count = HAMMER2_SET_COUNT;
2751                 break;
2752         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2753         case HAMMER2_BREF_TYPE_INDIRECT:
2754                 /*
2755                  * Optimize indirect blocks in the INITIAL state to avoid
2756                  * I/O.
2757                  *
2758                  * Debugging: Enter permanent wait state instead of
2759                  * panicing on unexpectedly NULL data for the moment.
2760                  */
2761                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2762                         base = NULL;
2763                 } else {
2764                         if (parent->data == NULL) {
2765                                 kprintf("hammer2: unexpected NULL data "
2766                                         "on %p\n", parent);
2767                                 while (1)
2768                                         tsleep(parent, 0, "xxx", 0);
2769                         }
2770                         base = &parent->data->npdata[0];
2771                 }
2772                 count = parent->bytes / sizeof(hammer2_blockref_t);
2773                 break;
2774         case HAMMER2_BREF_TYPE_VOLUME:
2775                 base = &parent->data->voldata.sroot_blockset.blockref[0];
2776                 count = HAMMER2_SET_COUNT;
2777                 break;
2778         case HAMMER2_BREF_TYPE_FREEMAP:
2779                 base = &parent->data->blkset.blockref[0];
2780                 count = HAMMER2_SET_COUNT;
2781                 break;
2782         default:
2783                 panic("hammer2_chain_lookup: unrecognized "
2784                       "blockref(B) type: %d",
2785                       parent->bref.type);
2786                 base = NULL;    /* safety */
2787                 count = 0;      /* safety */
2788                 break;
2789         }
2790
2791         /*
2792          * Merged scan to find next candidate.
2793          *
2794          * hammer2_base_*() functions require the parent->core.live_* fields
2795          * to be synchronized.
2796          *
2797          * We need to hold the spinlock to access the block array and RB tree
2798          * and to interlock chain creation.
2799          */
2800         if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
2801                 hammer2_chain_countbrefs(parent, base, count);
2802
2803         /*
2804          * Combined search
2805          */
2806         hammer2_spin_ex(&parent->core.spin);
2807         chain = hammer2_combined_find(parent, base, count,
2808                                       key_nextp,
2809                                       key_beg, key_end,
2810                                       &bref);
2811         generation = parent->core.generation;
2812
2813         /*
2814          * Exhausted parent chain, iterate.
2815          */
2816         if (bref == NULL) {
2817                 KKASSERT(chain == NULL);
2818                 hammer2_spin_unex(&parent->core.spin);
2819                 if (key_beg == key_end) /* short cut single-key case */
2820                         return (NULL);
2821
2822                 /*
2823                  * Stop if we reached the end of the iteration.
2824                  */
2825                 if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
2826                     parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2827                         return (NULL);
2828                 }
2829
2830                 /*
2831                  * Calculate next key, stop if we reached the end of the
2832                  * iteration, otherwise go up one level and loop.
2833                  */
2834                 key_beg = parent->bref.key +
2835                           ((hammer2_key_t)1 << parent->bref.keybits);
2836                 if (key_beg == 0 || key_beg > key_end)
2837                         return (NULL);
2838                 parent = hammer2_chain_repparent(parentp, how_maybe);
2839                 goto again;
2840         }
2841
2842         /*
2843          * Selected from blockref or in-memory chain.
2844          */
2845         bsave = *bref;
2846         if (chain == NULL) {
2847                 hammer2_spin_unex(&parent->core.spin);
2848                 if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
2849                     bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2850                         chain = hammer2_chain_get(parent, generation,
2851                                                   &bsave, how_maybe);
2852                 } else {
2853                         chain = hammer2_chain_get(parent, generation,
2854                                                   &bsave, how);
2855                 }
2856                 if (chain == NULL)
2857                         goto again;
2858         } else {
2859                 hammer2_chain_ref(chain);
2860                 hammer2_spin_unex(&parent->core.spin);
2861
2862                 /*
2863                  * chain is referenced but not locked.  We must lock the
2864                  * chain to obtain definitive state.
2865                  */
2866                 if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
2867                     bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2868                         hammer2_chain_lock(chain, how_maybe);
2869                 } else {
2870                         hammer2_chain_lock(chain, how);
2871                 }
2872                 KKASSERT(chain->parent == parent);
2873         }
2874         if (bcmp(&bsave, &chain->bref, sizeof(bsave)) ||
2875             chain->parent != parent) {
2876                 hammer2_chain_unlock(chain);
2877                 hammer2_chain_drop(chain);
2878                 chain = NULL;   /* SAFETY */
2879                 goto again;
2880         }
2881
2882
2883         /*
2884          * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
2885          *
2886          * NOTE: Chain's key range is not relevant as there might be
2887          *       one-offs within the range that are not deleted.
2888          *
2889          * NOTE: Lookups can race delete-duplicate because
2890          *       delete-duplicate does not lock the parent's core
2891          *       (they just use the spinlock on the core).
2892          */
2893         if (chain->flags & HAMMER2_CHAIN_DELETED) {
2894                 kprintf("skip deleted chain %016jx.%02x key=%016jx\n",
2895                         chain->bref.data_off, chain->bref.type,
2896                         chain->bref.key);
2897                 hammer2_chain_unlock(chain);
2898                 hammer2_chain_drop(chain);
2899                 chain = NULL;   /* SAFETY */
2900                 key_beg = *key_nextp;
2901                 if (key_beg == 0 || key_beg > key_end)
2902                         return(NULL);
2903                 goto again;
2904         }
2905
2906         /*
2907          * If the chain element is an indirect block it becomes the new
2908          * parent and we loop on it.  We must maintain our top-down locks
2909          * to prevent the flusher from interfering (i.e. doing a
2910          * delete-duplicate and leaving us recursing down a deleted chain).
2911          *
2912          * The parent always has to be locked with at least RESOLVE_MAYBE
2913          * so we can access its data.  It might need a fixup if the caller
2914          * passed incompatible flags.  Be careful not to cause a deadlock
2915          * as a data-load requires an exclusive lock.
2916          *
2917          * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
2918          * range is within the requested key range we return the indirect
2919          * block and do NOT loop.  This is usually only used to acquire
2920          * freemap nodes.
2921          */
2922         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
2923             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2924                 save_mtx = parent->lock;
2925                 hammer2_chain_unlock(parent);
2926                 hammer2_chain_drop(parent);
2927                 *parentp = parent = chain;
2928                 chain = NULL;   /* SAFETY */
2929                 goto again;
2930         }
2931 done:
2932         /*
2933          * All done, return the locked chain.
2934          *
2935          * If the caller does not want a locked chain, replace the lock with
2936          * a ref.  Perhaps this can eventually be optimized to not obtain the
2937          * lock in the first place for situations where the data does not
2938          * need to be resolved.
2939          *
2940          * NOTE! A chain->error must be tested by the caller upon return.
2941          *       *errorp is only set based on issues which occur while
2942          *       trying to reach the chain.
2943          */
2944         return (chain);
2945 }
2946
2947 /*
2948  * After having issued a lookup we can iterate all matching keys.
2949  *
2950  * If chain is non-NULL we continue the iteration from just after it's index.
2951  *
2952  * If chain is NULL we assume the parent was exhausted and continue the
2953  * iteration at the next parent.
2954  *
2955  * If a fatal error occurs (typically an I/O error), a dummy chain is
2956  * returned with chain->error and error-identifying information set.  This
2957  * chain will assert if you try to do anything fancy with it.
2958  *
2959  * XXX Depending on where the error occurs we should allow continued iteration.
2960  *
2961  * parent must be locked on entry and remains locked throughout.  chain's
2962  * lock status must match flags.  Chain is always at least referenced.
2963  *
2964  * WARNING!  The MATCHIND flag does not apply to this function.
2965  */
2966 hammer2_chain_t *
2967 hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
2968                    hammer2_key_t *key_nextp,
2969                    hammer2_key_t key_beg, hammer2_key_t key_end,
2970                    int *errorp, int flags)
2971 {
2972         hammer2_chain_t *parent;
2973         int how_maybe;
2974
2975         /*
2976          * Calculate locking flags for upward recursion.
2977          */
2978         how_maybe = HAMMER2_RESOLVE_MAYBE;
2979         if (flags & HAMMER2_LOOKUP_SHARED)
2980                 how_maybe |= HAMMER2_RESOLVE_SHARED;
2981
2982         parent = *parentp;
2983         *errorp = 0;
2984
2985         /*
2986          * Calculate the next index and recalculate the parent if necessary.
2987          */
2988         if (chain) {
2989                 key_beg = chain->bref.key +
2990                           ((hammer2_key_t)1 << chain->bref.keybits);
2991                 hammer2_chain_unlock(chain);
2992                 hammer2_chain_drop(chain);
2993
2994                 /*
2995                  * chain invalid past this point, but we can still do a
2996                  * pointer comparison w/parent.
2997                  *
2998                  * Any scan where the lookup returned degenerate data embedded
2999                  * in the inode has an invalid index and must terminate.
3000                  */
3001                 if (chain == parent)
3002                         return(NULL);
3003                 if (key_beg == 0 || key_beg > key_end)
3004                         return(NULL);
3005                 chain = NULL;
3006         } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
3007                    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
3008                 /*
3009                  * We reached the end of the iteration.
3010                  */
3011                 return (NULL);
3012         } else {
3013                 /*
3014                  * Continue iteration with next parent unless the current
3015                  * parent covers the range.
3016                  *
3017                  * (This also handles the case of a deleted, empty indirect
3018                  * node).
3019                  */
3020                 key_beg = parent->bref.key +
3021                           ((hammer2_key_t)1 << parent->bref.keybits);
3022                 if (key_beg == 0 || key_beg > key_end)
3023                         return (NULL);
3024                 parent = hammer2_chain_repparent(parentp, how_maybe);
3025         }
3026
3027         /*
3028          * And execute
3029          */
3030         return (hammer2_chain_lookup(parentp, key_nextp,
3031                                      key_beg, key_end,
3032                                      errorp, flags));
3033 }
3034
3035 /*
3036  * Caller wishes to iterate chains under parent, loading new chains into
3037  * chainp.  Caller must initialize *chainp to NULL and *firstp to 1, and
3038  * then call hammer2_chain_scan() repeatedly until a non-zero return.
3039  * During the scan, *firstp will be set to 0 and (*chainp) will be replaced
3040  * with the returned chain for the scan.  The returned *chainp will be
3041  * locked and referenced.  Any prior contents will be unlocked and dropped.
3042  *
3043  * Caller should check the return value.  A normal scan EOF will return
3044  * exactly HAMMER2_ERROR_EOF.  Any other non-zero value indicates an
3045  * error trying to access parent data.  Any error in the returned chain
3046  * must be tested separately by the caller.
3047  *
3048  * (*chainp) is dropped on each scan, but will only be set if the returned
3049  * element itself can recurse.  Leaf elements are NOT resolved, loaded, or
3050  * returned via *chainp.  The caller will get their bref only.
3051  *
3052  * The raw scan function is similar to lookup/next but does not seek to a key.
3053  * Blockrefs are iterated via first_bref = (parent, NULL) and
3054  * next_chain = (parent, bref).
3055  *
3056  * The passed-in parent must be locked and its data resolved.  The function
3057  * nominally returns a locked and referenced *chainp != NULL for chains
3058  * the caller might need to recurse on (and will dipose of any *chainp passed
3059  * in).  The caller must check the chain->bref.type either way.
3060  */
3061 int
3062 hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
3063                    hammer2_blockref_t *bref, int *firstp,
3064                    int flags)
3065 {
3066         hammer2_dev_t *hmp;
3067         hammer2_blockref_t *base;
3068         hammer2_blockref_t *bref_ptr;
3069         hammer2_key_t key;
3070         hammer2_key_t next_key;
3071         hammer2_chain_t *chain = NULL;
3072         int count = 0;
3073         int how_always = HAMMER2_RESOLVE_ALWAYS;
3074         int how_maybe = HAMMER2_RESOLVE_MAYBE;
3075         int how;
3076         int generation;
3077         int maxloops = 300000;
3078         int error;
3079
3080         hmp = parent->hmp;
3081         error = 0;
3082
3083         /*
3084          * Scan flags borrowed from lookup.
3085          */
3086         if (flags & HAMMER2_LOOKUP_ALWAYS) {
3087                 how_maybe = how_always;
3088                 how = HAMMER2_RESOLVE_ALWAYS;
3089         } else if (flags & HAMMER2_LOOKUP_NODATA) {
3090                 how = HAMMER2_RESOLVE_NEVER;
3091         } else {
3092                 how = HAMMER2_RESOLVE_MAYBE;
3093         }
3094         if (flags & HAMMER2_LOOKUP_SHARED) {
3095                 how_maybe |= HAMMER2_RESOLVE_SHARED;
3096                 how_always |= HAMMER2_RESOLVE_SHARED;
3097                 how |= HAMMER2_RESOLVE_SHARED;
3098         }
3099
3100         /*
3101          * Calculate key to locate first/next element, unlocking the previous
3102          * element as we go.  Be careful, the key calculation can overflow.
3103          *
3104          * (also reset bref to NULL)
3105          */
3106         if (*firstp) {
3107                 key = 0;
3108                 *firstp = 0;
3109         } else {
3110                 key = bref->key + ((hammer2_key_t)1 << bref->keybits);
3111                 if ((chain = *chainp) != NULL) {
3112                         *chainp = NULL;
3113                         hammer2_chain_unlock(chain);
3114                         hammer2_chain_drop(chain);
3115                         chain = NULL;
3116                 }
3117                 if (key == 0) {
3118                         error |= HAMMER2_ERROR_EOF;
3119                         goto done;
3120                 }
3121         }
3122
3123 again:
3124         if (parent->error) {
3125                 error = parent->error;
3126                 goto done;
3127         }
3128         if (--maxloops == 0)
3129                 panic("hammer2_chain_scan: maxloops");
3130
3131         /*
3132          * Locate the blockref array.  Currently we do a fully associative
3133          * search through the array.
3134          */
3135         switch(parent->bref.type) {
3136         case HAMMER2_BREF_TYPE_INODE:
3137                 /*
3138                  * An inode with embedded data has no sub-chains.
3139                  *
3140                  * WARNING! Bulk scan code may pass a static chain marked
3141                  *          as BREF_TYPE_INODE with a copy of the volume
3142                  *          root blockset to snapshot the volume.
3143                  */
3144                 if (parent->data->ipdata.meta.op_flags &
3145                     HAMMER2_OPFLAG_DIRECTDATA) {
3146                         error |= HAMMER2_ERROR_EOF;
3147                         goto done;
3148                 }
3149                 base = &parent->data->ipdata.u.blockset.blockref[0];
3150                 count = HAMMER2_SET_COUNT;
3151                 break;
3152         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3153         case HAMMER2_BREF_TYPE_INDIRECT:
3154                 /*
3155                  * Optimize indirect blocks in the INITIAL state to avoid
3156                  * I/O.
3157                  */
3158                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
3159                         base = NULL;
3160                 } else {
3161                         if (parent->data == NULL)
3162                                 panic("parent->data is NULL");
3163                         base = &parent->data->npdata[0];
3164                 }
3165                 count = parent->bytes / sizeof(hammer2_blockref_t);
3166                 break;
3167         case HAMMER2_BREF_TYPE_VOLUME:
3168                 base = &parent->data->voldata.sroot_blockset.blockref[0];
3169                 count = HAMMER2_SET_COUNT;
3170                 break;
3171         case HAMMER2_BREF_TYPE_FREEMAP:
3172                 base = &parent->data->blkset.blockref[0];
3173                 count = HAMMER2_SET_COUNT;
3174                 break;
3175         default:
3176                 panic("hammer2_chain_scan: unrecognized blockref type: %d",
3177                       parent->bref.type);
3178                 base = NULL;    /* safety */
3179                 count = 0;      /* safety */
3180                 break;
3181         }
3182
3183         /*
3184          * Merged scan to find next candidate.
3185          *
3186          * hammer2_base_*() functions require the parent->core.live_* fields
3187          * to be synchronized.
3188          *
3189          * We need to hold the spinlock to access the block array and RB tree
3190          * and to interlock chain creation.
3191          */
3192         if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
3193                 hammer2_chain_countbrefs(parent, base, count);
3194
3195         next_key = 0;
3196         bref_ptr = NULL;
3197         hammer2_spin_ex(&parent->core.spin);
3198         chain = hammer2_combined_find(parent, base, count,
3199                                       &next_key,
3200                                       key, HAMMER2_KEY_MAX,
3201                                       &bref_ptr);
3202         generation = parent->core.generation;
3203
3204         /*
3205          * Exhausted parent chain, we're done.
3206          */
3207         if (bref_ptr == NULL) {
3208                 hammer2_spin_unex(&parent->core.spin);
3209                 KKASSERT(chain == NULL);
3210                 error |= HAMMER2_ERROR_EOF;
3211                 goto done;
3212         }
3213
3214         /*
3215          * Copy into the supplied stack-based blockref.
3216          */
3217         *bref = *bref_ptr;
3218
3219         /*
3220          * Selected from blockref or in-memory chain.
3221          */
3222         if (chain == NULL) {
3223                 switch(bref->type) {
3224                 case HAMMER2_BREF_TYPE_INODE:
3225                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3226                 case HAMMER2_BREF_TYPE_INDIRECT:
3227                 case HAMMER2_BREF_TYPE_VOLUME:
3228                 case HAMMER2_BREF_TYPE_FREEMAP:
3229                         /*
3230                          * Recursion, always get the chain
3231                          */
3232                         hammer2_spin_unex(&parent->core.spin);
3233                         chain = hammer2_chain_get(parent, generation,
3234                                                   bref, how);
3235                         if (chain == NULL)
3236                                 goto again;
3237                         break;
3238                 default:
3239                         /*
3240                          * No recursion, do not waste time instantiating
3241                          * a chain, just iterate using the bref.
3242                          */
3243                         hammer2_spin_unex(&parent->core.spin);
3244                         break;
3245                 }
3246         } else {
3247                 /*
3248                  * Recursion or not we need the chain in order to supply
3249                  * the bref.
3250                  */
3251                 hammer2_chain_ref(chain);
3252                 hammer2_spin_unex(&parent->core.spin);
3253                 hammer2_chain_lock(chain, how);
3254         }
3255         if (chain &&
3256             (bcmp(bref, &chain->bref, sizeof(*bref)) ||
3257              chain->parent != parent)) {
3258                 hammer2_chain_unlock(chain);
3259                 hammer2_chain_drop(chain);
3260                 chain = NULL;
3261                 goto again;
3262         }
3263
3264         /*
3265          * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
3266          *
3267          * NOTE: chain's key range is not relevant as there might be
3268          *       one-offs within the range that are not deleted.
3269          *
3270          * NOTE: XXX this could create problems with scans used in
3271          *       situations other than mount-time recovery.
3272          *
3273          * NOTE: Lookups can race delete-duplicate because
3274          *       delete-duplicate does not lock the parent's core
3275          *       (they just use the spinlock on the core).
3276          */
3277         if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
3278                 hammer2_chain_unlock(chain);
3279                 hammer2_chain_drop(chain);
3280                 chain = NULL;
3281
3282                 key = next_key;
3283                 if (key == 0) {
3284                         error |= HAMMER2_ERROR_EOF;
3285                         goto done;
3286                 }
3287                 goto again;
3288         }
3289
3290 done:
3291         /*
3292          * All done, return the bref or NULL, supply chain if necessary.
3293          */
3294         if (chain)
3295                 *chainp = chain;
3296         return (error);
3297 }
3298
3299 /*
3300  * Create and return a new hammer2 system memory structure of the specified
3301  * key, type and size and insert it under (*parentp).  This is a full
3302  * insertion, based on the supplied key/keybits, and may involve creating
3303  * indirect blocks and moving other chains around via delete/duplicate.
3304  *
3305  * This call can be made with parent == NULL as long as a non -1 methods
3306  * is supplied.  hmp must also be supplied in this situation (otherwise
3307  * hmp is extracted from the supplied parent).  The chain will be detached
3308  * from the topology.  A later call with both parent and chain can be made
3309  * to attach it.
3310  *
3311  * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
3312  * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
3313  * FULL.  This typically means that the caller is creating the chain after
3314  * doing a hammer2_chain_lookup().
3315  *
3316  * (*parentp) must be exclusive locked and may be replaced on return
3317  * depending on how much work the function had to do.
3318  *
3319  * (*parentp) must not be errored or this function will assert.
3320  *
3321  * (*chainp) usually starts out NULL and returns the newly created chain,
3322  * but if the caller desires the caller may allocate a disconnected chain
3323  * and pass it in instead.
3324  *
3325  * This function should NOT be used to insert INDIRECT blocks.  It is
3326  * typically used to create/insert inodes and data blocks.
3327  *
3328  * Caller must pass-in an exclusively locked parent the new chain is to
3329  * be inserted under, and optionally pass-in a disconnected, exclusively
3330  * locked chain to insert (else we create a new chain).  The function will
3331  * adjust (*parentp) as necessary, create or connect the chain, and
3332  * return an exclusively locked chain in *chainp.
3333  *
3334  * When creating a PFSROOT inode under the super-root, pmp is typically NULL
3335  * and will be reassigned.
3336  *
3337  * NOTE: returns HAMMER_ERROR_* flags
3338  */
3339 int
3340 hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp,
3341                      hammer2_dev_t *hmp, hammer2_pfs_t *pmp, int methods,
3342                      hammer2_key_t key, int keybits, int type, size_t bytes,
3343                      hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags)
3344 {
3345         hammer2_chain_t *chain;
3346         hammer2_chain_t *parent;
3347         hammer2_blockref_t *base;
3348         hammer2_blockref_t dummy;
3349         int allocated = 0;
3350         int error = 0;
3351         int count;
3352         int maxloops = 300000;
3353
3354         /*
3355          * Topology may be crossing a PFS boundary.
3356          */
3357         parent = *parentp;
3358         if (parent) {
3359                 KKASSERT(hammer2_mtx_owned(&parent->lock));
3360                 KKASSERT(parent->error == 0);
3361                 hmp = parent->hmp;
3362         }
3363         chain = *chainp;
3364
3365         if (chain == NULL) {
3366                 /*
3367                  * First allocate media space and construct the dummy bref,
3368                  * then allocate the in-memory chain structure.  Set the
3369                  * INITIAL flag for fresh chains which do not have embedded
3370                  * data.
3371                  */
3372                 bzero(&dummy, sizeof(dummy));
3373                 dummy.type = type;
3374                 dummy.key = key;
3375                 dummy.keybits = keybits;
3376                 dummy.data_off = hammer2_getradix(bytes);
3377
3378                 /*
3379                  * Inherit methods from parent by default.  Primarily used
3380                  * for BREF_TYPE_DATA.  Non-data types *must* be set to
3381                  * a non-NONE check algorithm.
3382                  */
3383                 if (methods == -1)
3384                         dummy.methods = parent->bref.methods;
3385                 else
3386                         dummy.methods = (uint8_t)methods;
3387
3388                 if (type != HAMMER2_BREF_TYPE_DATA &&
3389                     HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) {
3390                         dummy.methods |=
3391                                 HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT);
3392                 }
3393
3394                 chain = hammer2_chain_alloc(hmp, pmp, &dummy);
3395
3396                 /*
3397                  * Lock the chain manually, chain_lock will load the chain
3398                  * which we do NOT want to do.  (note: chain->refs is set
3399                  * to 1 by chain_alloc() for us, but lockcnt is not).
3400                  */
3401                 chain->lockcnt = 1;
3402                 hammer2_mtx_ex(&chain->lock);
3403                 allocated = 1;
3404
3405                 /*
3406                  * Set INITIAL to optimize I/O.  The flag will generally be
3407                  * processed when we call hammer2_chain_modify().
3408                  */
3409                 switch(type) {
3410                 case HAMMER2_BREF_TYPE_VOLUME:
3411                 case HAMMER2_BREF_TYPE_FREEMAP:
3412                         panic("hammer2_chain_create: called with volume type");
3413                         break;
3414                 case HAMMER2_BREF_TYPE_INDIRECT:
3415                         panic("hammer2_chain_create: cannot be used to"
3416                               "create indirect block");
3417                         break;
3418                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3419                         panic("hammer2_chain_create: cannot be used to"
3420                               "create freemap root or node");
3421                         break;
3422                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
3423                         KKASSERT(bytes == sizeof(chain->data->bmdata));
3424                         /* fall through */
3425                 case HAMMER2_BREF_TYPE_DIRENT:
3426                 case HAMMER2_BREF_TYPE_INODE:
3427                 case HAMMER2_BREF_TYPE_DATA:
3428                 default:
3429                         /*
3430                          * leave chain->data NULL, set INITIAL
3431                          */
3432                         KKASSERT(chain->data == NULL);
3433                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
3434                         break;
3435                 }
3436         } else {
3437                 /*
3438                  * We are reattaching a previously deleted chain, possibly
3439                  * under a new parent and possibly with a new key/keybits.
3440                  * The chain does not have to be in a modified state.  The
3441                  * UPDATE flag will be set later on in this routine.
3442                  *
3443                  * Do NOT mess with the current state of the INITIAL flag.
3444                  */
3445                 chain->bref.key = key;
3446                 chain->bref.keybits = keybits;
3447                 if (chain->flags & HAMMER2_CHAIN_DELETED)
3448                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
3449                 KKASSERT(chain->parent == NULL);
3450         }
3451
3452         /*
3453          * Set the appropriate bref flag if requested.
3454          *
3455          * NOTE! Callers can call this function to move chains without
3456          *       knowing about special flags, so don't clear bref flags
3457          *       here!
3458          */
3459         if (flags & HAMMER2_INSERT_PFSROOT)
3460                 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
3461
3462         if (parent == NULL)
3463                 goto skip;
3464
3465         /*
3466          * Calculate how many entries we have in the blockref array and
3467          * determine if an indirect block is required when inserting into
3468          * the parent.
3469          */
3470 again:
3471         if (--maxloops == 0)
3472                 panic("hammer2_chain_create: maxloops");
3473
3474         switch(parent->bref.type) {
3475         case HAMMER2_BREF_TYPE_INODE:
3476                 if ((parent->data->ipdata.meta.op_flags &
3477                      HAMMER2_OPFLAG_DIRECTDATA) != 0) {
3478                         kprintf("hammer2: parent set for direct-data! "
3479                                 "pkey=%016jx ckey=%016jx\n",
3480                                 parent->bref.key,
3481                                 chain->bref.key);
3482                 }
3483                 KKASSERT((parent->data->ipdata.meta.op_flags &
3484                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
3485                 KKASSERT(parent->data != NULL);
3486                 base = &parent->data->ipdata.u.blockset.blockref[0];
3487                 count = HAMMER2_SET_COUNT;
3488                 break;
3489         case HAMMER2_BREF_TYPE_INDIRECT:
3490         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3491                 if (parent->flags & HAMMER2_CHAIN_INITIAL)
3492                         base = NULL;
3493                 else
3494                         base = &parent->data->npdata[0];
3495                 count = parent->bytes / sizeof(hammer2_blockref_t);
3496                 break;
3497         case HAMMER2_BREF_TYPE_VOLUME:
3498                 KKASSERT(parent->data != NULL);
3499                 base = &parent->data->voldata.sroot_blockset.blockref[0];
3500                 count = HAMMER2_SET_COUNT;
3501                 break;
3502         case HAMMER2_BREF_TYPE_FREEMAP:
3503                 KKASSERT(parent->data != NULL);
3504                 base = &parent->data->blkset.blockref[0];
3505                 count = HAMMER2_SET_COUNT;
3506                 break;
3507         default:
3508                 panic("hammer2_chain_create: unrecognized blockref type: %d",
3509                       parent->bref.type);
3510                 base = NULL;
3511                 count = 0;
3512                 break;
3513         }
3514
3515         /*
3516          * Make sure we've counted the brefs
3517          */
3518         if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
3519                 hammer2_chain_countbrefs(parent, base, count);
3520
3521         KASSERT(parent->core.live_count >= 0 &&
3522                 parent->core.live_count <= count,
3523                 ("bad live_count %d/%d (%02x, %d)",
3524                         parent->core.live_count, count,
3525                         parent->bref.type, parent->bytes));
3526
3527         /*
3528          * If no free blockref could be found we must create an indirect
3529          * block and move a number of blockrefs into it.  With the parent
3530          * locked we can safely lock each child in order to delete+duplicate
3531          * it without causing a deadlock.
3532          *
3533          * This may return the new indirect block or the old parent depending
3534          * on where the key falls.  NULL is returned on error.
3535          */
3536         if (parent->core.live_count == count) {
3537                 hammer2_chain_t *nparent;
3538
3539                 KKASSERT((flags & HAMMER2_INSERT_SAMEPARENT) == 0);
3540
3541                 nparent = hammer2_chain_create_indirect(parent, key, keybits,
3542                                                         mtid, type, &error);
3543                 if (nparent == NULL) {
3544                         if (allocated)
3545                                 hammer2_chain_drop(chain);
3546                         chain = NULL;
3547                         goto done;
3548                 }
3549                 if (parent != nparent) {
3550                         hammer2_chain_unlock(parent);
3551                         hammer2_chain_drop(parent);
3552                         parent = *parentp = nparent;
3553                 }
3554                 goto again;
3555         }
3556
3557         /*
3558          * fall through if parent, or skip to here if no parent.
3559          */
3560 skip:
3561         if (chain->flags & HAMMER2_CHAIN_DELETED)
3562                 kprintf("Inserting deleted chain @%016jx\n",
3563                         chain->bref.key);
3564
3565         /*
3566          * Link the chain into its parent.
3567          */
3568         if (chain->parent != NULL)
3569                 panic("hammer2: hammer2_chain_create: chain already connected");
3570         KKASSERT(chain->parent == NULL);
3571         if (parent) {
3572                 KKASSERT(parent->core.live_count < count);
3573                 hammer2_chain_insert(parent, chain,
3574                                      HAMMER2_CHAIN_INSERT_SPIN |
3575                                      HAMMER2_CHAIN_INSERT_LIVE,
3576                                      0);
3577         }
3578
3579         if (allocated) {
3580                 /*
3581                  * Mark the newly created chain modified.  This will cause
3582                  * UPDATE to be set and process the INITIAL flag.
3583                  *
3584                  * Device buffers are not instantiated for DATA elements
3585                  * as these are handled by logical buffers.
3586                  *
3587                  * Indirect and freemap node indirect blocks are handled
3588                  * by hammer2_chain_create_indirect() and not by this
3589                  * function.
3590                  *
3591                  * Data for all other bref types is expected to be
3592                  * instantiated (INODE, LEAF).
3593                  */
3594                 switch(chain->bref.type) {
3595                 case HAMMER2_BREF_TYPE_DATA:
3596                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
3597                 case HAMMER2_BREF_TYPE_DIRENT:
3598                 case HAMMER2_BREF_TYPE_INODE:
3599                         error = hammer2_chain_modify(chain, mtid, dedup_off,
3600                                                      HAMMER2_MODIFY_OPTDATA);
3601                         break;
3602                 default:
3603                         /*
3604                          * Remaining types are not supported by this function.
3605                          * In particular, INDIRECT and LEAF_NODE types are
3606                          * handled by create_indirect().
3607                          */
3608                         panic("hammer2_chain_create: bad type: %d",
3609                               chain->bref.type);
3610                         /* NOT REACHED */
3611                         break;
3612                 }
3613         } else {
3614                 /*
3615                  * When reconnecting a chain we must set UPDATE and
3616                  * setflush so the flush recognizes that it must update
3617                  * the bref in the parent.
3618                  */
3619                 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0)
3620                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
3621         }
3622
3623         /*
3624          * We must setflush(parent) to ensure that it recurses through to
3625          * chain.  setflush(chain) might not work because ONFLUSH is possibly
3626          * already set in the chain (so it won't recurse up to set it in the
3627          * parent).
3628          */
3629         if (parent)
3630                 hammer2_chain_setflush(parent);
3631
3632 done:
3633         *chainp = chain;
3634
3635         return (error);
3636 }
3637
3638 /*
3639  * Move the chain from its old parent to a new parent.  The chain must have
3640  * already been deleted or already disconnected (or never associated) with
3641  * a parent.  The chain is reassociated with the new parent and the deleted
3642  * flag will be cleared (no longer deleted).  The chain's modification state
3643  * is not altered.
3644  *
3645  * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION
3646  * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
3647  * FULL.  This typically means that the caller is creating the chain after
3648  * doing a hammer2_chain_lookup().
3649  *
3650  * Neither (parent) or (chain) can be errored.
3651  *
3652  * If (parent) is non-NULL then the chain is inserted under the parent.
3653  *
3654  * If (parent) is NULL then the newly duplicated chain is not inserted
3655  * anywhere, similar to if it had just been chain_alloc()'d (suitable for
3656  * passing into hammer2_chain_create() after this function returns).
3657  *
3658  * WARNING! This function calls create which means it can insert indirect
3659  *          blocks.  This can cause other unrelated chains in the parent to
3660  *          be moved to a newly inserted indirect block in addition to the
3661  *          specific chain.
3662  */
3663 void
3664 hammer2_chain_rename(hammer2_chain_t **parentp, hammer2_chain_t *chain,
3665                      hammer2_tid_t mtid, int flags)
3666 {
3667         hammer2_blockref_t *bref;
3668         hammer2_dev_t *hmp;
3669         hammer2_chain_t *parent;
3670
3671         /*
3672          * WARNING!  We should never resolve DATA to device buffers
3673          *           (XXX allow it if the caller did?), and since
3674          *           we currently do not have the logical buffer cache
3675          *           buffer in-hand to fix its cached physical offset
3676          *           we also force the modify code to not COW it. XXX
3677          *
3678          * NOTE!     We allow error'd chains to be renamed.  The bref itself
3679          *           is good and can be renamed.  The content, however, may
3680          *           be inaccessible.
3681          */
3682         hmp = chain->hmp;
3683         KKASSERT(chain->parent == NULL);
3684         /*KKASSERT(chain->error == 0); allow */
3685         bref = &chain->bref;
3686
3687         /*
3688          * If parent is not NULL the duplicated chain will be entered under
3689          * the parent and the UPDATE bit set to tell flush to update
3690          * the blockref.
3691          *
3692          * We must setflush(parent) to ensure that it recurses through to
3693          * chain.  setflush(chain) might not work because ONFLUSH is possibly
3694          * already set in the chain (so it won't recurse up to set it in the
3695          * parent).
3696          *
3697          * Having both chains locked is extremely important for atomicy.
3698          */
3699         if (parentp && (parent = *parentp) != NULL) {
3700                 KKASSERT(hammer2_mtx_owned(&parent->lock));
3701                 KKASSERT(parent->refs > 0);
3702                 KKASSERT(parent->error == 0);
3703
3704                 hammer2_chain_create(parentp, &chain, NULL, chain->pmp,
3705                                      HAMMER2_METH_DEFAULT,
3706                                      bref->key, bref->keybits, bref->type,
3707                                      chain->bytes, mtid, 0, flags);
3708                 KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
3709                 hammer2_chain_setflush(*parentp);
3710         }
3711 }
3712
3713 /*
3714  * This works in tandem with delete_obref() to install a blockref in
3715  * (typically) an indirect block that is associated with the chain being
3716  * moved to *parentp.
3717  *
3718  * The reason we need this function is that the caller needs to maintain
3719  * the blockref as it was, and not generate a new blockref for what might
3720  * be a modified chain.  Otherwise stuff will leak into the flush that
3721  * the flush code's FLUSH_INODE_STOP flag is unable to catch.
3722  *
3723  * It is EXTREMELY important that we properly set CHAIN_BMAPUPD and
3724  * CHAIN_UPDATE.  We must set BMAPUPD if the bref does not match, and
3725  * we must clear CHAIN_UPDATE (that was likely set by the chain_rename) if
3726  * it does.  Otherwise we can end up in a situation where H2 is unable to
3727  * clean up the in-memory chain topology.
3728  *
3729  * The reason for this is that flushes do not generally flush through
3730  * BREF_TYPE_INODE chains and depend on a hammer2_inode_t queued to syncq
3731  * or sideq to properly flush and dispose of the related inode chain's flags.
3732  * Situations where the inode is not actually modified by the frontend,
3733  * but where we have to move the related chains around as we insert or cleanup
3734  * indirect blocks, can leave us with a 'dirty' (non-disposable) in-memory
3735  * inode chain that does not have a hammer2_inode_t associated with it.
3736  */
3737 static void
3738 hammer2_chain_rename_obref(hammer2_chain_t **parentp, hammer2_chain_t *chain,
3739                            hammer2_tid_t mtid, int flags,
3740                            hammer2_blockref_t *obref)
3741 {
3742         hammer2_chain_rename(parentp, chain, mtid, flags);
3743
3744         if (obref->type != HAMMER2_BREF_TYPE_EMPTY) {
3745                 hammer2_blockref_t *tbase;
3746                 int tcount;
3747
3748                 KKASSERT((chain->flags & HAMMER2_CHAIN_BMAPPED) == 0);
3749                 hammer2_chain_modify(*parentp, mtid, 0, 0);
3750                 tbase = hammer2_chain_base_and_count(*parentp, &tcount);
3751                 hammer2_base_insert(*parentp, tbase, tcount, chain, obref);
3752                 if (bcmp(obref, &chain->bref, sizeof(chain->bref))) {
3753                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD |
3754                                                       HAMMER2_CHAIN_UPDATE);
3755                 } else {
3756                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
3757                 }
3758         }
3759 }
3760
3761 /*
3762  * Helper function for deleting chains.
3763  *
3764  * The chain is removed from the live view (the RBTREE) as well as the parent's
3765  * blockmap.  Both chain and its parent must be locked.
3766  *
3767  * parent may not be errored.  chain can be errored.
3768  */
3769 static int
3770 _hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain,
3771                              hammer2_tid_t mtid, int flags,
3772                              hammer2_blockref_t *obref)
3773 {
3774         hammer2_dev_t *hmp;
3775         int error = 0;
3776
3777         KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
3778         KKASSERT(chain->parent == parent);
3779         hmp = chain->hmp;
3780
3781         if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
3782                 /*
3783                  * Chain is blockmapped, so there must be a parent.
3784                  * Atomically remove the chain from the parent and remove
3785                  * the blockmap entry.  The parent must be set modified
3786                  * to remove the blockmap entry.
3787                  */
3788                 hammer2_blockref_t *base;
3789                 int count;
3790
3791                 KKASSERT(parent != NULL);
3792                 KKASSERT(parent->error == 0);
3793                 KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
3794                 error = hammer2_chain_modify(parent, mtid, 0, 0);
3795                 if (error)
3796                         goto done;
3797
3798                 /*
3799                  * Calculate blockmap pointer
3800                  */
3801                 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
3802                 hammer2_spin_ex(&chain->core.spin);
3803                 hammer2_spin_ex(&parent->core.spin);
3804
3805                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
3806                 atomic_add_int(&parent->core.live_count, -1);
3807                 ++parent->core.generation;
3808                 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
3809                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
3810                 --parent->core.chain_count;
3811                 chain->parent = NULL;
3812
3813                 switch(parent->bref.type) {
3814                 case HAMMER2_BREF_TYPE_INODE:
3815                         /*
3816                          * Access the inode's block array.  However, there
3817                          * is no block array if the inode is flagged
3818                          * DIRECTDATA.
3819                          */
3820                         if (parent->data &&
3821                             (parent->data->ipdata.meta.op_flags &
3822                              HAMMER2_OPFLAG_DIRECTDATA) == 0) {
3823                                 base =
3824                                    &parent->data->ipdata.u.blockset.blockref[0];
3825                         } else {
3826                                 base = NULL;
3827                         }
3828                         count = HAMMER2_SET_COUNT;
3829                         break;
3830                 case HAMMER2_BREF_TYPE_INDIRECT:
3831                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3832                         if (parent->data)
3833                                 base = &parent->data->npdata[0];
3834                         else
3835                                 base = NULL;
3836                         count = parent->bytes / sizeof(hammer2_blockref_t);
3837                         break;
3838                 case HAMMER2_BREF_TYPE_VOLUME:
3839                         base = &parent->data->voldata.
3840                                         sroot_blockset.blockref[0];
3841                         count = HAMMER2_SET_COUNT;
3842                         break;
3843                 case HAMMER2_BREF_TYPE_FREEMAP:
3844                         base = &parent->data->blkset.blockref[0];
3845                         count = HAMMER2_SET_COUNT;
3846                         break;
3847                 default:
3848                         base = NULL;
3849                         count = 0;
3850                         panic("_hammer2_chain_delete_helper: "
3851                               "unrecognized blockref type: %d",
3852                               parent->bref.type);
3853                         break;
3854                 }
3855
3856                 /*
3857                  * delete blockmapped chain from its parent.
3858                  *
3859                  * The parent is not affected by any statistics in chain
3860                  * which are pending synchronization.  That is, there is
3861                  * nothing to undo in the parent since they have not yet
3862                  * been incorporated into the parent.
3863                  *
3864                  * The parent is affected by statistics stored in inodes.
3865                  * Those have already been synchronized, so they must be
3866                  * undone.  XXX split update possible w/delete in middle?
3867                  */
3868                 if (base) {
3869                         hammer2_base_delete(parent, base, count, chain, obref);
3870                 }
3871                 hammer2_spin_unex(&parent->core.spin);
3872                 hammer2_spin_unex(&chain->core.spin);
3873         } else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
3874                 /*
3875                  * Chain is not blockmapped but a parent is present.
3876                  * Atomically remove the chain from the parent.  There is
3877                  * no blockmap entry to remove.
3878                  *
3879                  * Because chain was associated with a parent but not
3880                  * synchronized, the chain's *_count_up fields contain
3881                  * inode adjustment statistics which must be undone.
3882                  */
3883                 hammer2_spin_ex(&chain->core.spin);
3884                 hammer2_spin_ex(&parent->core.spin);
3885                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
3886                 atomic_add_int(&parent->core.live_count, -1);
3887                 ++parent->core.generation;
3888                 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
3889                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
3890                 --parent->core.chain_count;
3891                 chain->parent = NULL;
3892                 hammer2_spin_unex(&parent->core.spin);
3893                 hammer2_spin_unex(&chain->core.spin);
3894         } else {
3895                 /*
3896                  * Chain is not blockmapped and has no parent.  This
3897                  * is a degenerate case.
3898                  */
3899                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
3900         }
3901 done:
3902         return error;
3903 }
3904
3905 /*
3906  * Create an indirect block that covers one or more of the elements in the
3907  * current parent.  Either returns the existing parent with no locking or
3908  * ref changes or returns the new indirect block locked and referenced
3909  * and leaving the original parent lock/ref intact as well.
3910  *
3911  * If an error occurs, NULL is returned and *errorp is set to the H2 error.
3912  *
3913  * The returned chain depends on where the specified key falls.
3914  *
3915  * The key/keybits for the indirect mode only needs to follow three rules:
3916  *
3917  * (1) That all elements underneath it fit within its key space and
3918  *
3919  * (2) That all elements outside it are outside its key space.
3920  *
3921  * (3) When creating the new indirect block any elements in the current
3922  *     parent that fit within the new indirect block's keyspace must be
3923  *     moved into the new indirect block.
3924  *
3925  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
3926  *     keyspace the the current parent, but lookup/iteration rules will
3927  *     ensure (and must ensure) that rule (2) for all parents leading up
3928  *     to the nearest inode or the root volume header is adhered to.  This
3929  *     is accomplished by always recursing through matching keyspaces in
3930  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
3931  *
3932  * The current implementation calculates the current worst-case keyspace by
3933  * iterating the current parent and then divides it into two halves, choosing
3934  * whichever half has the most elements (not necessarily the half containing
3935  * the requested key).
3936  *
3937  * We can also opt to use the half with the least number of elements.  This
3938  * causes lower-numbered keys (aka logical file offsets) to recurse through
3939  * fewer indirect blocks and higher-numbered keys to recurse through more.
3940  * This also has the risk of not moving enough elements to the new indirect
3941  * block and being forced to create several indirect blocks before the element
3942  * can be inserted.
3943  *
3944  * Must be called with an exclusively locked parent.
3945  *
3946  * NOTE: *errorp set to HAMMER_ERROR_* flags
3947  */
3948 static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
3949                                 hammer2_key_t *keyp, int keybits,
3950                                 hammer2_blockref_t *base, int count);
3951 static int hammer2_chain_indkey_file(hammer2_chain_t *parent,
3952                                 hammer2_key_t *keyp, int keybits,
3953                                 hammer2_blockref_t *base, int count,
3954                                 int ncount);
3955 static int hammer2_chain_indkey_dir(hammer2_chain_t *parent,
3956                                 hammer2_key_t *keyp, int keybits,
3957                                 hammer2_blockref_t *base, int count,
3958                                 int ncount);
3959 static
3960 hammer2_chain_t *
3961 hammer2_chain_create_indirect(hammer2_chain_t *parent,
3962                               hammer2_key_t create_key, int create_bits,
3963                               hammer2_tid_t mtid, int for_type, int *errorp)
3964 {
3965         hammer2_dev_t *hmp;
3966         hammer2_blockref_t *base;
3967         hammer2_blockref_t *bref;
3968         hammer2_blockref_t bsave;
3969         hammer2_blockref_t dummy;
3970         hammer2_chain_t *chain;
3971         hammer2_chain_t *ichain;
3972         hammer2_key_t key = create_key;
3973         hammer2_key_t key_beg;
3974         hammer2_key_t key_end;
3975         hammer2_key_t key_next;
3976         int keybits = create_bits;
3977         int count;
3978         int ncount;
3979         int nbytes;
3980         int loops;
3981         int error;
3982         int reason;
3983         int generation;
3984         int maxloops = 300000;
3985
3986         /*
3987          * Calculate the base blockref pointer or NULL if the chain
3988          * is known to be empty.  We need to calculate the array count
3989          * for RB lookups either way.
3990          */
3991         hmp = parent->hmp;
3992         KKASSERT(hammer2_mtx_owned(&parent->lock));
3993
3994         /*
3995          * Pre-modify the parent now to avoid having to deal with error
3996          * processing if we tried to later (in the middle of our loop).
3997          *
3998          * We are going to be moving bref's around, the indirect blocks
3999          * cannot be in an initial state.  Do not pass MODIFY_OPTDATA.
4000          */
4001         *errorp = hammer2_chain_modify(parent, mtid, 0, 0);
4002         if (*errorp) {
4003                 kprintf("hammer2_chain_create_indirect: error %08x %s\n",
4004                         *errorp, hammer2_error_str(*errorp));
4005                 return NULL;
4006         }
4007         KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
4008
4009         /*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/
4010         base = hammer2_chain_base_and_count(parent, &count);
4011
4012         /*
4013          * How big should our new indirect block be?  It has to be at least
4014          * as large as its parent for splits to work properly.
4015          *
4016          * The freemap uses a specific indirect block size.  The number of
4017          * levels are built dynamically and ultimately depend on the size
4018          * volume.  Because freemap blocks are taken from the reserved areas
4019          * of the volume our goal is efficiency (fewer levels) and not so
4020          * much to save disk space.
4021          *
4022          * The first indirect block level for a directory usually uses
4023          * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries).  Due to
4024          * the hash mechanism, this typically gives us a nominal
4025          * 32 * 4 entries with one level of indirection.
4026          *
4027          * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE
4028          * indirect blocks.  The initial 4 entries in the inode gives us
4029          * 256KB.  Up to 4 indirect blocks gives us 32MB.  Three levels
4030          * of indirection gives us 137GB, and so forth.  H2 can support
4031          * huge file sizes but they are not typical, so we try to stick
4032          * with compactness and do not use a larger indirect block size.
4033          *
4034          * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but
4035          * due to the way indirect blocks are created this usually winds
4036          * up being extremely inefficient for small files.  Even though
4037          * 16KB requires more levels of indirection for very large files,
4038          * the 16KB records can be ganged together into 64KB DIOs.
4039          */
4040         if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
4041             for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
4042                 nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE;
4043         } else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
4044                 if (parent->data->ipdata.meta.type ==
4045                     HAMMER2_OBJTYPE_DIRECTORY)
4046                         nbytes = HAMMER2_IND_BYTES_MIN; /* 4KB = 32 entries */
4047                 else
4048                         nbytes = HAMMER2_IND_BYTES_NOM; /* 16KB = ~8MB file */
4049
4050         } else {
4051                 nbytes = HAMMER2_IND_BYTES_NOM;
4052         }
4053         if (nbytes < count * sizeof(hammer2_blockref_t)) {
4054                 KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
4055                          for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF);
4056                 nbytes = count * sizeof(hammer2_blockref_t);
4057         }
4058         ncount = nbytes / sizeof(hammer2_blockref_t);
4059
4060         /*
4061          * When creating an indirect block for a freemap node or leaf
4062          * the key/keybits must be fitted to static radix levels because
4063          * particular radix levels use particular reserved blocks in the
4064          * related zone.
4065          *
4066          * This routine calculates the key/radix of the indirect block
4067          * we need to create, and whether it is on the high-side or the
4068          * low-side.
4069          */
4070         switch(for_type) {
4071         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
4072         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
4073                 keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
4074                                                        base, count);
4075                 break;
4076         case HAMMER2_BREF_TYPE_DATA:
4077                 keybits = hammer2_chain_indkey_file(parent, &key, keybits,
4078                                                     base, count, ncount);
4079                 break;
4080         case HAMMER2_BREF_TYPE_DIRENT:
4081         case HAMMER2_BREF_TYPE_INODE:
4082                 keybits = hammer2_chain_indkey_dir(parent, &key, keybits,
4083                                                    base, count, ncount);
4084                 break;
4085         default:
4086                 panic("illegal indirect block for bref type %d", for_type);
4087                 break;
4088         }
4089
4090         /*
4091          * Normalize the key for the radix being represented, keeping the
4092          * high bits and throwing away the low bits.
4093          */
4094         key &= ~(((hammer2_key_t)1 << keybits) - 1);
4095
4096         /*
4097          * Ok, create our new indirect block
4098          */
4099         bzero(&dummy, sizeof(dummy));
4100         if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
4101             for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
4102                 dummy.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
4103         } else {
4104                 dummy.type = HAMMER2_BREF_TYPE_INDIRECT;
4105         }
4106         dummy.key = key;
4107         dummy.keybits = keybits;
4108         dummy.data_off = hammer2_getradix(nbytes);
4109         dummy.methods =
4110                 HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) |
4111                 HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);
4112
4113         ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy);
4114         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
4115         hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
4116         /* ichain has one ref at this point */
4117
4118         /*
4119          * We have to mark it modified to allocate its block, but use
4120          * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
4121          * it won't be acted upon by the flush code.
4122          *
4123          * XXX remove OPTDATA, we need a fully initialized indirect block to
4124          * be able to move the original blockref.
4125          */
4126         *errorp = hammer2_chain_modify(ichain, mtid, 0, 0);
4127         if (*errorp) {
4128                 kprintf("hammer2_chain_create_indirect: error %08x %s\n",
4129                         *errorp, hammer2_error_str(*errorp));
4130                 hammer2_chain_unlock(ichain);
4131                 hammer2_chain_drop(ichain);
4132                 return NULL;
4133         }
4134         KKASSERT((ichain->flags & HAMMER2_CHAIN_INITIAL) == 0);
4135
4136         /*
4137          * Iterate the original parent and move the matching brefs into
4138          * the new indirect block.
4139          *
4140          * XXX handle flushes.
4141          */
4142         key_beg = 0;
4143         key_end = HAMMER2_KEY_MAX;
4144         key_next = 0;   /* avoid gcc warnings */
4145         hammer2_spin_ex(&parent->core.spin);
4146         loops = 0;
4147         reason = 0;
4148
4149         for (;;) {
4150                 /*
4151                  * Parent may have been modified, relocating its block array.
4152                  * Reload the base pointer.
4153                  */
4154                 base = hammer2_chain_base_and_count(parent, &count);
4155
4156                 if (++loops > 100000) {
4157                     hammer2_spin_unex(&parent->core.spin);
4158                     panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
4159                           reason, parent, base, count, key_next);
4160                 }
4161
4162                 /*
4163                  * NOTE: spinlock stays intact, returned chain (if not NULL)
4164                  *       is not referenced or locked which means that we
4165                  *       cannot safely check its flagged / deletion status
4166                  *       until we lock it.
4167                  */
4168                 chain = hammer2_combined_find(parent, base, count,
4169                                               &key_next,
4170                                               key_beg, key_end,
4171                                               &bref);
4172                 generation = parent->core.generation;
4173                 if (bref == NULL)
4174                         break;
4175                 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
4176
4177                 /*
4178                  * Skip keys that are not within the key/radix of the new
4179                  * indirect block.  They stay in the parent.
4180                  */
4181                 if (rounddown2(key ^ bref->key, (hammer2_key_t)1 << keybits) != 0) {
4182                         goto next_key_spinlocked;
4183                 }
4184
4185                 /*
4186                  * Load the new indirect block by acquiring the related
4187                  * chains (potentially from media as it might not be
4188                  * in-memory).  Then move it to the new parent (ichain).
4189                  *
4190                  * chain is referenced but not locked.  We must lock the
4191                  * chain to obtain definitive state.
4192                  */
4193                 bsave = *bref;
4194                 if (chain) {
4195                         /*
4196                          * Use chain already present in the RBTREE
4197                          */
4198                         hammer2_chain_ref(chain);
4199                         hammer2_spin_unex(&parent->core.spin);
4200                         hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
4201                 } else {
4202                         /*
4203                          * Get chain for blockref element.  _get returns NULL
4204                          * on insertion race.
4205                          */
4206                         hammer2_spin_unex(&parent->core.spin);
4207                         chain = hammer2_chain_get(parent, generation, &bsave,
4208                                                   HAMMER2_RESOLVE_NEVER);
4209                         if (chain == NULL) {
4210                                 reason = 1;
4211                                 hammer2_spin_ex(&parent->core.spin);
4212                                 continue;
4213                         }
4214                 }
4215
4216                 /*
4217                  * This is always live so if the chain has been deleted
4218                  * we raced someone and we have to retry.
4219                  *
4220                  * NOTE: Lookups can race delete-duplicate because
4221                  *       delete-duplicate does not lock the parent's core
4222                  *       (they just use the spinlock on the core).
4223                  *
4224                  *       (note reversed logic for this one)
4225                  */
4226                 if (bcmp(&bsave, &chain->bref, sizeof(bsave)) ||
4227                     chain->parent != parent ||
4228                     (chain->flags & HAMMER2_CHAIN_DELETED)) {
4229                         hammer2_chain_unlock(chain);
4230                         hammer2_chain_drop(chain);
4231                         if (hammer2_debug & 0x0040) {
4232                                 kprintf("LOST PARENT RETRY "
4233                                 "RETRY (%p,%p)->%p %08x\n",
4234                                 parent, chain->parent, chain, chain->flags);
4235                         }
4236                         hammer2_spin_ex(&parent->core.spin);
4237                         continue;
4238                 }
4239
4240                 /*
4241                  * Shift the chain to the indirect block.
4242                  *
4243                  * WARNING! No reason for us to load chain data, pass NOSTATS
4244                  *          to prevent delete/insert from trying to access
4245                  *          inode stats (and thus asserting if there is no
4246                  *          chain->data loaded).
4247                  *
4248                  * WARNING! The (parent, chain) deletion may modify the parent
4249                  *          and invalidate the base pointer.
4250                  *
4251                  * WARNING! Parent must already be marked modified, so we
4252                  *          can assume that chain_delete always suceeds.
4253                  *
4254                  * WARNING! hammer2_chain_repchange() does not have to be
4255                  *          called (and doesn't work anyway because we are
4256                  *          only doing a partial shift).  A recursion that is
4257                  *          in-progress can continue at the current parent
4258                  *          and will be able to properly find its next key.
4259                  */
4260                 error = hammer2_chain_delete_obref(parent, chain, mtid, 0,
4261                                                    &bsave);
4262                 KKASSERT(error == 0);
4263                 hammer2_chain_rename_obref(&ichain, chain, mtid, 0, &bsave);
4264                 hammer2_chain_unlock(chain);
4265                 hammer2_chain_drop(chain);
4266                 KKASSERT(parent->refs > 0);
4267                 chain = NULL;
4268                 base = NULL;    /* safety */
4269                 hammer2_spin_ex(&parent->core.spin);
4270 next_key_spinlocked:
4271                 if (--maxloops == 0)
4272                         panic("hammer2_chain_create_indirect: maxloops");
4273                 reason = 4;
4274                 if (key_next == 0 || key_next > key_end)
4275                         break;
4276                 key_beg = key_next;
4277                 /* loop */
4278         }
4279         hammer2_spin_unex(&parent->core.spin);
4280
4281         /*
4282          * Insert the new indirect block into the parent now that we've
4283          * cleared out some entries in the parent.  We calculated a good
4284          * insertion index in the loop above (ichain->index).
4285          *
4286          * We don't have to set UPDATE here because we mark ichain
4287          * modified down below (so the normal modified -> flush -> set-moved
4288          * sequence applies).
4289          *
4290          * The insertion shouldn't race as this is a completely new block
4291          * and the parent is locked.
4292          */
4293         base = NULL;    /* safety, parent modify may change address */
4294         KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
4295         KKASSERT(parent->core.live_count < count);
4296         hammer2_chain_insert(parent, ichain,
4297                              HAMMER2_CHAIN_INSERT_SPIN |
4298                              HAMMER2_CHAIN_INSERT_LIVE,
4299                              0);
4300
4301         /*
4302          * Make sure flushes propogate after our manual insertion.
4303          */
4304         hammer2_chain_setflush(ichain);
4305         hammer2_chain_setflush(parent);
4306
4307         /*
4308          * Figure out what to return.
4309          */
4310         if (rounddown2(create_key ^ key, (hammer2_key_t)1 << keybits)) {
4311                 /*
4312                  * Key being created is outside the key range,
4313                  * return the original parent.
4314                  */
4315                 hammer2_chain_unlock(ichain);
4316                 hammer2_chain_drop(ichain);
4317         } else {
4318                 /*
4319                  * Otherwise its in the range, return the new parent.
4320                  * (leave both the new and old parent locked).
4321                  */
4322                 parent = ichain;
4323         }
4324
4325         return(parent);
4326 }
4327
4328 /*
4329  * Do maintenance on an indirect chain.  Both parent and chain are locked.
4330  *
4331  * Returns non-zero if (chain) is deleted, either due to being empty or
4332  * because its children were safely moved into the parent.
4333  */
4334 int
4335 hammer2_chain_indirect_maintenance(hammer2_chain_t *parent,
4336                                    hammer2_chain_t *chain)
4337 {
4338         hammer2_blockref_t *chain_base;
4339         hammer2_blockref_t *base;
4340         hammer2_blockref_t *bref;
4341         hammer2_blockref_t bsave;
4342         hammer2_key_t key_next;
4343         hammer2_key_t key_beg;
4344         hammer2_key_t key_end;
4345         hammer2_chain_t *sub;
4346         int chain_count;
4347         int count;
4348         int error;
4349         int generation;
4350
4351         /*
4352          * Make sure we have an accurate live_count
4353          */
4354         if ((chain->flags & (HAMMER2_CHAIN_INITIAL |
4355                              HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
4356                 base = &chain->data->npdata[0];
4357                 count = chain->bytes / sizeof(hammer2_blockref_t);
4358                 hammer2_chain_countbrefs(chain, base, count);
4359         }
4360
4361         /*
4362          * If the indirect block is empty we can delete it.
4363          * (ignore deletion error)
4364          */
4365         if (chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)) {
4366                 hammer2_chain_delete(parent, chain,
4367                                      chain->bref.modify_tid,
4368                                      HAMMER2_DELETE_PERMANENT);
4369                 hammer2_chain_repchange(parent, chain);
4370                 return 1;
4371         }
4372
4373         base = hammer2_chain_base_and_count(parent, &count);
4374
4375         if ((parent->flags & (HAMMER2_CHAIN_INITIAL |
4376                              HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
4377                 hammer2_chain_countbrefs(parent, base, count);
4378         }
4379
4380         /*
4381          * Determine if we can collapse chain into parent, calculate
4382          * hysteresis for chain emptiness.
4383          */
4384         if (parent->core.live_count + chain->core.live_count - 1 > count)
4385                 return 0;
4386         chain_count = chain->bytes / sizeof(hammer2_blockref_t);
4387         if (chain->core.live_count > chain_count * 3 / 4)
4388                 return 0;
4389
4390         /*
4391          * Ok, theoretically we can collapse chain's contents into
4392          * parent.  chain is locked, but any in-memory children of chain
4393          * are not.  For this to work, we must be able to dispose of any
4394          * in-memory children of chain.
4395          *
4396          * For now require that there are no in-memory children of chain.
4397          *
4398          * WARNING! Both chain and parent must remain locked across this
4399          *          entire operation.
4400          */
4401
4402         /*
4403          * Parent must be marked modified.  Don't try to collapse it if we
4404          * can't mark it modified.  Once modified, destroy chain to make room
4405          * and to get rid of what will be a conflicting key (this is included
4406          * in the calculation above).  Finally, move the children of chain
4407          * into chain's parent.
4408          *
4409          * This order creates an accounting problem for bref.embed.stats
4410          * because we destroy chain before we remove its children.  Any
4411          * elements whos blockref is already synchronized will be counted
4412          * twice.  To deal with the problem we clean out chain's stats prior
4413          * to deleting it.
4414          */
4415         error = hammer2_chain_modify(parent, 0, 0, 0);
4416         if (error) {
4417                 krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
4418                             hammer2_error_str(error));
4419                 return 0;
4420         }
4421         error = hammer2_chain_modify(chain, chain->bref.modify_tid, 0, 0);
4422         if (error) {
4423                 krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
4424                             hammer2_error_str(error));
4425                 return 0;
4426         }
4427
4428         chain->bref.embed.stats.inode_count = 0;
4429         chain->bref.embed.stats.data_count = 0;
4430         error = hammer2_chain_delete(parent, chain,
4431                                      chain->bref.modify_tid,
4432                                      HAMMER2_DELETE_PERMANENT);
4433         KKASSERT(error == 0);
4434
4435         /*
4436          * The combined_find call requires core.spin to be held.  One would
4437          * think there wouldn't be any conflicts since we hold chain
4438          * exclusively locked, but the caching mechanism for 0-ref children
4439          * does not require a chain lock.
4440          */
4441         hammer2_spin_ex(&chain->core.spin);
4442
4443         key_next = 0;
4444         key_beg = 0;
4445         key_end = HAMMER2_KEY_MAX;
4446         for (;;) {
4447                 chain_base = &chain->data->npdata[0];
4448                 chain_count = chain->bytes / sizeof(hammer2_blockref_t);
4449                 sub = hammer2_combined_find(chain, chain_base, chain_count,
4450                                             &key_next,
4451                                             key_beg, key_end,
4452                                             &bref);
4453                 generation = chain->core.generation;
4454                 if (bref == NULL)
4455                         break;
4456                 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
4457
4458                 bsave = *bref;
4459                 if (sub) {
4460                         hammer2_chain_ref(sub);
4461                         hammer2_spin_unex(&chain->core.spin);
4462                         hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER);
4463                 } else {
4464                         hammer2_spin_unex(&chain->core.spin);
4465                         sub = hammer2_chain_get(chain, generation, &bsave,
4466                                                 HAMMER2_RESOLVE_NEVER);
4467                         if (sub == NULL) {
4468                                 hammer2_spin_ex(&chain->core.spin);
4469                                 continue;
4470                         }
4471                 }
4472                 if (bcmp(&bsave, &sub->bref, sizeof(bsave)) ||
4473                     sub->parent != chain ||
4474                     (sub->flags & HAMMER2_CHAIN_DELETED)) {
4475                         hammer2_chain_unlock(sub);
4476                         hammer2_chain_drop(sub);
4477                         hammer2_spin_ex(&chain->core.spin);
4478                         sub = NULL;     /* safety */
4479                         continue;
4480                 }
4481                 error = hammer2_chain_delete_obref(chain, sub,
4482                                                    sub->bref.modify_tid, 0,
4483                                                    &bsave);
4484                 KKASSERT(error == 0);
4485                 hammer2_chain_rename_obref(&parent, sub,
4486                                      sub->bref.modify_tid,
4487                                      HAMMER2_INSERT_SAMEPARENT, &bsave);
4488                 hammer2_chain_unlock(sub);
4489                 hammer2_chain_drop(sub);
4490                 hammer2_spin_ex(&chain->core.spin);
4491
4492                 if (key_next == 0)
4493                         break;
4494                 key_beg = key_next;
4495         }
4496         hammer2_spin_unex(&chain->core.spin);
4497
4498         hammer2_chain_repchange(parent, chain);
4499
4500         return 1;
4501 }
4502
4503 /*
4504  * Freemap indirect blocks
4505  *
4506  * Calculate the keybits and highside/lowside of the freemap node the
4507  * caller is creating.
4508  *
4509  * This routine will specify the next higher-level freemap key/radix
4510  * representing the lowest-ordered set.  By doing so, eventually all
4511  * low-ordered sets will be moved one level down.
4512  *
4513  * We have to be careful here because the freemap reserves a limited
4514  * number of blocks for a limited number of levels.  So we can't just
4515  * push indiscriminately.
4516  */
4517 int
4518 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
4519                              int keybits, hammer2_blockref_t *base, int count)
4520 {
4521         hammer2_chain_t *chain;
4522         hammer2_blockref_t *bref;
4523         hammer2_key_t key;
4524         hammer2_key_t key_beg;
4525         hammer2_key_t key_end;
4526         hammer2_key_t key_next;
4527         int locount;
4528         int hicount;
4529         int maxloops = 300000;
4530
4531         key = *keyp;
4532         locount = 0;
4533         hicount = 0;
4534         keybits = 64;
4535
4536         /*
4537          * Calculate the range of keys in the array being careful to skip
4538          * slots which are overridden with a deletion.
4539          */
4540         key_beg = 0;
4541         key_end = HAMMER2_KEY_MAX;
4542         hammer2_spin_ex(&parent->core.spin);
4543
4544         for (;;) {
4545                 if (--maxloops == 0) {
4546                         panic("indkey_freemap shit %p %p:%d\n",
4547                               parent, base, count);
4548                 }
4549                 chain = hammer2_combined_find(parent, base, count,
4550                                               &key_next,
4551                                               key_beg, key_end,
4552                                               &bref);
4553
4554                 /*
4555                  * Exhausted search
4556                  */
4557                 if (bref == NULL)
4558                         break;
4559
4560                 /*
4561                  * Skip deleted chains.
4562                  */
4563                 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
4564                         if (key_next == 0 || key_next > key_end)
4565                                 break;
4566                         key_beg = key_next;
4567                         continue;
4568                 }
4569
4570                 /*
4571                  * Use the full live (not deleted) element for the scan
4572                  * iteration.  HAMMER2 does not allow partial replacements.
4573                  *
4574                  * XXX should be built into hammer2_combined_find().
4575                  */
4576                 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
4577
4578                 if (keybits > bref->keybits) {
4579                         key = bref->key;
4580                         keybits = bref->keybits;
4581                 } else if (keybits == bref->keybits && bref->key < key) {
4582                         key = bref->key;
4583                 }
4584                 if (key_next == 0)
4585                         break;
4586                 key_beg = key_next;
4587         }
4588         hammer2_spin_unex(&parent->core.spin);
4589
4590         /*
4591          * Return the keybits for a higher-level FREEMAP_NODE covering
4592          * this node.
4593          */
4594         switch(keybits) {
4595         case HAMMER2_FREEMAP_LEVEL0_RADIX:
4596                 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
4597                 break;
4598         case HAMMER2_FREEMAP_LEVEL1_RADIX:
4599                 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
4600                 break;
4601         case HAMMER2_FREEMAP_LEVEL2_RADIX:
4602                 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
4603                 break;
4604         case HAMMER2_FREEMAP_LEVEL3_RADIX:
4605                 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
4606                 break;
4607         case HAMMER2_FREEMAP_LEVEL4_RADIX:
4608                 keybits = HAMMER2_FREEMAP_LEVEL5_RADIX;
4609                 break;
4610         case HAMMER2_FREEMAP_LEVEL5_RADIX:
4611                 panic("hammer2_chain_indkey_freemap: level too high");
4612                 break;
4613         default:
4614                 panic("hammer2_chain_indkey_freemap: bad radix");
4615                 break;
4616         }
4617         *keyp = key;
4618
4619         return (keybits);
4620 }
4621
4622 /*
4623  * File indirect blocks
4624  *
4625  * Calculate the key/keybits for the indirect block to create by scanning
4626  * existing keys.  The key being created is also passed in *keyp and can be
4627  * inside or outside the indirect block.  Regardless, the indirect block
4628  * must hold at least two keys in order to guarantee sufficient space.
4629  *
4630  * We use a modified version of the freemap's fixed radix tree, but taylored
4631  * for file data.  Basically we configure an indirect block encompassing the
4632  * smallest key.
4633  */
4634 static int
4635 hammer2_chain_indkey_file(hammer2_chain_t *parent, hammer2_key_t *keyp,
4636                             int keybits, hammer2_blockref_t *base, int count,
4637                             int ncount)
4638 {
4639         hammer2_chain_t *chain;
4640         hammer2_blockref_t *bref;
4641         hammer2_key_t key;
4642         hammer2_key_t key_beg;
4643         hammer2_key_t key_end;
4644         hammer2_key_t key_next;
4645         int nradix;
4646         int locount;
4647         int hicount;
4648         int maxloops = 300000;
4649
4650         key = *keyp;
4651         locount = 0;
4652         hicount = 0;
4653         keybits = 64;
4654
4655         /*
4656          * Calculate the range of keys in the array being careful to skip
4657          * slots which are overridden with a deletion.
4658          *
4659          * Locate the smallest key.
4660          */
4661         key_beg = 0;
4662         key_end = HAMMER2_KEY_MAX;
4663         hammer2_spin_ex(&parent->core.spin);
4664
4665         for (;;) {
4666                 if (--maxloops == 0) {
4667                         panic("indkey_freemap shit %p %p:%d\n",
4668                               parent, base, count);
4669                 }
4670                 chain = hammer2_combined_find(parent, base, count,
4671                                               &key_next,
4672                                               key_beg, key_end,
4673                                               &bref);
4674
4675                 /*
4676                  * Exhausted search
4677                  */
4678                 if (bref == NULL)
4679                         break;
4680
4681                 /*
4682                  * Skip deleted chains.
4683                  */
4684                 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
4685                         if (key_next == 0 || key_next > key_end)
4686                                 break;
4687                         key_beg = key_next;
4688                         continue;
4689                 }
4690
4691                 /*
4692                  * Use the full live (not deleted) element for the scan
4693                  * iteration.  HAMMER2 does not allow partial replacements.
4694                  *
4695                  * XXX should be built into hammer2_combined_find().
4696                  */
4697                 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
4698
4699                 if (keybits > bref->keybits) {
4700                         key = bref->key;
4701                         keybits = bref->keybits;
4702                 } else if (keybits == bref->keybits && bref->key < key) {
4703                         key = bref->key;
4704                 }
4705                 if (key_next == 0)
4706                         break;
4707                 key_beg = key_next;
4708         }
4709         hammer2_spin_unex(&parent->core.spin);
4710
4711         /*
4712          * Calculate the static keybits for a higher-level indirect block
4713          * that contains the key.
4714          */
4715         *keyp = key;
4716
4717         switch(ncount) {
4718         case HAMMER2_IND_BYTES_MIN / sizeof(hammer2_blockref_t):
4719                 nradix = HAMMER2_IND_RADIX_MIN - HAMMER2_BLOCKREF_RADIX;
4720                 break;
4721         case HAMMER2_IND_BYTES_NOM / sizeof(hammer2_blockref_t):
4722                 nradix = HAMMER2_IND_RADIX_NOM - HAMMER2_BLOCKREF_RADIX;
4723                 break;
4724         case HAMMER2_IND_BYTES_MAX / sizeof(hammer2_blockref_t):
4725                 nradix = HAMMER2_IND_RADIX_MAX - HAMMER2_BLOCKREF_RADIX;
4726                 break;
4727         default:
4728                 panic("bad ncount %d\n", ncount);
4729                 nradix = 0;
4730                 break;
4731         }
4732
4733         /*
4734          * The largest radix that can be returned for an indirect block is
4735          * 63 bits.  (The largest practical indirect block radix is actually
4736          * 62 bits because the top-level inode or volume root contains four
4737          * entries, but allow 63 to be returned).
4738          */
4739         if (nradix >= 64)
4740                 nradix = 63;
4741
4742         return keybits + nradix;
4743 }
4744
4745 #if 1
4746
4747 /*
4748  * Directory indirect blocks.
4749  *
4750  * Covers both the inode index (directory of inodes), and directory contents
4751  * (filenames hardlinked to inodes).
4752  *
4753  * Because directory keys are hashed we generally try to cut the space in
4754  * half.  We accomodate the inode index (which tends to have linearly
4755  * increasing inode numbers) by ensuring that the keyspace is at least large
4756  * enough to fill up the indirect block being created.
4757  */
4758 static int
4759 hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
4760                          int keybits, hammer2_blockref_t *base, int count,
4761                          int ncount)
4762 {
4763         hammer2_blockref_t *bref;
4764         hammer2_chain_t *chain;
4765         hammer2_key_t key_beg;
4766         hammer2_key_t key_end;
4767         hammer2_key_t key_next;
4768         hammer2_key_t key;
4769         int nkeybits;
4770         int locount;
4771         int hicount;
4772         int maxloops = 300000;
4773
4774         /*
4775          * NOTE: We can't take a shortcut here anymore for inodes because
4776          *       the root directory can contain a mix of inodes and directory
4777          *       entries (we used to just return 63 if parent->bref.type was
4778          *       HAMMER2_BREF_TYPE_INODE.
4779          */
4780         key = *keyp;
4781         locount = 0;
4782         hicount = 0;
4783
4784         /*
4785          * Calculate the range of keys in the array being careful to skip
4786          * slots which are overridden with a deletion.
4787          */
4788         key_beg = 0;
4789         key_end = HAMMER2_KEY_MAX;
4790         hammer2_spin_ex(&parent->core.spin);
4791
4792         for (;;) {
4793                 if (--maxloops == 0) {
4794                         panic("indkey_freemap shit %p %p:%d\n",
4795                               parent, base, count);
4796                 }
4797                 chain = hammer2_combined_find(parent, base, count,
4798                                               &key_next,
4799                                               key_beg, key_end,
4800                                               &bref);
4801
4802                 /*
4803                  * Exhausted search
4804                  */
4805                 if (bref == NULL)
4806                         break;
4807
4808                 /*
4809                  * Deleted object
4810                  */
4811                 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
4812                         if (key_next == 0 || key_next > key_end)
4813                                 break;
4814                         key_beg = key_next;
4815                         continue;
4816                 }
4817
4818                 /*
4819                  * Use the full live (not deleted) element for the scan
4820                  * iteration.  HAMMER2 does not allow partial replacements.
4821                  *
4822                  * XXX should be built into hammer2_combined_find().
4823                  */
4824                 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
4825
4826                 /*
4827                  * Expand our calculated key range (key, keybits) to fit
4828                  * the scanned key.  nkeybits represents the full range
4829                  * that we will later cut in half (two halves @ nkeybits - 1).
4830                  */
4831                 nkeybits = keybits;
4832                 if (nkeybits < bref->keybits) {
4833                         if (bref->keybits > 64) {
4834                                 kprintf("bad bref chain %p bref %p\n",
4835                                         chain, bref);
4836                                 Debugger("fubar");
4837                         }
4838                         nkeybits = bref->keybits;
4839                 }
4840                 while (nkeybits < 64 &&
4841                        rounddown2(key ^ bref->key, (hammer2_key_t)1 << nkeybits) != 0) {
4842                         ++nkeybits;
4843                 }
4844
4845                 /*
4846                  * If the new key range is larger we have to determine
4847                  * which side of the new key range the existing keys fall
4848                  * under by checking the high bit, then collapsing the
4849                  * locount into the hicount or vise-versa.
4850                  */
4851                 if (keybits != nkeybits) {
4852                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
4853                                 hicount += locount;
4854                                 locount = 0;
4855                         } else {
4856                                 locount += hicount;
4857                                 hicount = 0;
4858                         }
4859                         keybits = nkeybits;
4860                 }
4861
4862                 /*
4863                  * The newly scanned key will be in the lower half or the
4864                  * upper half of the (new) key range.
4865                  */
4866                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
4867                         ++hicount;
4868                 else
4869                         ++locount;
4870
4871                 if (key_next == 0)
4872                         break;
4873                 key_beg = key_next;
4874         }
4875         hammer2_spin_unex(&parent->core.spin);
4876         bref = NULL;    /* now invalid (safety) */
4877
4878         /*
4879          * Adjust keybits to represent half of the full range calculated
4880          * above (radix 63 max) for our new indirect block.
4881          */
4882         --keybits;
4883
4884         /*
4885          * Expand keybits to hold at least ncount elements.  ncount will be
4886          * a power of 2.  This is to try to completely fill leaf nodes (at
4887          * least for keys which are not hashes).
4888          *
4889          * We aren't counting 'in' or 'out', we are counting 'high side'
4890          * and 'low side' based on the bit at (1LL << keybits).  We want
4891          * everything to be inside in these cases so shift it all to
4892          * the low or high side depending on the new high bit.
4893          */
4894         while (((hammer2_key_t)1 << keybits) < ncount) {
4895                 ++keybits;
4896                 if (key & ((hammer2_key_t)1 << keybits)) {
4897                         hicount += locount;
4898                         locount = 0;
4899                 } else {
4900                         locount += hicount;
4901                         hicount = 0;
4902                 }
4903         }
4904
4905         if (hicount > locount)
4906                 key |= (hammer2_key_t)1 << keybits;
4907         else
4908                 key &= ~(hammer2_key_t)1 << keybits;
4909
4910         *keyp = key;
4911
4912         return (keybits);
4913 }
4914
4915 #else
4916
4917 /*
4918  * Directory indirect blocks.
4919  *
4920  * Covers both the inode index (directory of inodes), and directory contents
4921  * (filenames hardlinked to inodes).
4922  *
4923  * Because directory keys are hashed we generally try to cut the space in
4924  * half.  We accomodate the inode index (which tends to have linearly
4925  * increasing inode numbers) by ensuring that the keyspace is at least large
4926  * enough to fill up the indirect block being created.
4927  */
4928 static int
4929 hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
4930                          int keybits, hammer2_blockref_t *base, int count,
4931                          int ncount)
4932 {
4933         hammer2_blockref_t *bref;
4934         hammer2_chain_t *chain;
4935         hammer2_key_t key_beg;
4936         hammer2_key_t key_end;
4937         hammer2_key_t key_next;
4938         hammer2_key_t key;
4939         int nkeybits;
4940         int locount;
4941         int hicount;
4942         int maxloops = 300000;
4943
4944         /*
4945          * Shortcut if the parent is the inode.  In this situation the
4946          * parent has 4+1 directory entries and we are creating an indirect
4947          * block capable of holding many more.
4948          */
4949         if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
4950                 return 63;
4951         }
4952
4953         key = *keyp;
4954         locount = 0;
4955         hicount = 0;
4956
4957         /*
4958          * Calculate the range of keys in the array being careful to skip
4959          * slots which are overridden with a deletion.
4960          */
4961         key_beg = 0;
4962         key_end = HAMMER2_KEY_MAX;
4963         hammer2_spin_ex(&parent->core.spin);
4964
4965         for (;;) {
4966                 if (--maxloops == 0) {
4967                         panic("indkey_freemap shit %p %p:%d\n",
4968                               parent, base, count);
4969                 }
4970                 chain = hammer2_combined_find(parent, base, count,
4971                                               &key_next,
4972                                               key_beg, key_end,
4973                                               &bref);
4974
4975                 /*
4976                  * Exhausted search
4977                  */
4978                 if (bref == NULL)
4979                         break;
4980
4981                 /*
4982                  * Deleted object
4983                  */
4984                 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
4985                         if (key_next == 0 || key_next > key_end)
4986                                 break;
4987                         key_beg = key_next;
4988                         continue;
4989                 }
4990
4991                 /*
4992                  * Use the full live (not deleted) element for the scan
4993                  * iteration.  HAMMER2 does not allow partial replacements.
4994                  *
4995                  * XXX should be built into hammer2_combined_find().
4996                  */
4997                 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
4998
4999                 /*
5000                  * Expand our calculated key range (key, keybits) to fit
5001                  * the scanned key.  nkeybits represents the full range
5002                  * that we will later cut in half (two halves @ nkeybits - 1).
5003                  */
5004                 nkeybits = keybits;
5005                 if (nkeybits < bref->keybits) {
5006                         if (bref->keybits > 64) {
5007                                 kprintf("bad bref chain %p bref %p\n",
5008                                         chain, bref);
5009                                 Debugger("fubar");
5010                         }
5011                         nkeybits = bref->keybits;
5012                 }
5013                 while (nkeybits < 64 &&
5014                        (~(((hammer2_key_t)1 << nkeybits) - 1) &
5015                         (key ^ bref->key)) != 0) {
5016                         ++nkeybits;
5017                 }
5018
5019                 /*
5020                  * If the new key range is larger we have to determine
5021                  * which side of the new key range the existing keys fall
5022                  * under by checking the high bit, then collapsing the
5023                  * locount into the hicount or vise-versa.
5024                  */
5025                 if (keybits != nkeybits) {
5026                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
5027                                 hicount += locount;
5028                                 locount = 0;
5029                         } else {
5030                                 locount += hicount;
5031                                 hicount = 0;
5032                         }
5033                         keybits = nkeybits;
5034                 }
5035
5036                 /*
5037                  * The newly scanned key will be in the lower half or the
5038                  * upper half of the (new) key range.
5039                  */
5040                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
5041                         ++hicount;
5042                 else
5043                         ++locount;
5044
5045                 if (key_next == 0)
5046                         break;
5047                 key_beg = key_next;
5048         }
5049         hammer2_spin_unex(&parent->core.spin);
5050         bref = NULL;    /* now invalid (safety) */
5051
5052         /*
5053          * Adjust keybits to represent half of the full range calculated
5054          * above (radix 63 max) for our new indirect block.
5055          */
5056         --keybits;
5057
5058         /*
5059          * Expand keybits to hold at least ncount elements.  ncount will be
5060          * a power of 2.  This is to try to completely fill leaf nodes (at
5061          * least for keys which are not hashes).
5062          *
5063          * We aren't counting 'in' or 'out', we are counting 'high side'
5064          * and 'low side' based on the bit at (1LL << keybits).  We want
5065          * everything to be inside in these cases so shift it all to
5066          * the low or high side depending on the new high bit.
5067          */
5068         while (((hammer2_key_t)1 << keybits) < ncount) {
5069                 ++keybits;
5070                 if (key & ((hammer2_key_t)1 << keybits)) {
5071                         hicount += locount;
5072                         locount = 0;
5073                 } else {
5074                         locount += hicount;
5075                         hicount = 0;
5076                 }
5077         }
5078
5079         if (hicount > locount)
5080                 key |= (hammer2_key_t)1 << keybits;
5081         else
5082                 key &= ~(hammer2_key_t)1 << keybits;
5083
5084         *keyp = key;
5085
5086         return (keybits);
5087 }
5088
5089 #endif
5090
5091 /*
5092  * Sets CHAIN_DELETED and remove the chain's blockref from the parent if
5093  * it exists.
5094  *
5095  * Both parent and chain must be locked exclusively.
5096  *
5097  * This function will modify the parent if the blockref requires removal
5098  * from the parent's block table.
5099  *
5100  * This function is NOT recursive.  Any entity already pushed into the
5101  * chain (such as an inode) may still need visibility into its contents,
5102  * as well as the ability to read and modify the contents.  For example,
5103  * for an unlinked file which is still open.
5104  *
5105  * Also note that the flusher is responsible for cleaning up empty
5106  * indirect blocks.
5107  */
5108 int
5109 hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain,
5110                      hammer2_tid_t mtid, int flags)
5111 {
5112         int error = 0;
5113
5114         KKASSERT(hammer2_mtx_owned(&chain->lock));
5115
5116         /*
5117          * Nothing to do if already marked.
5118          *
5119          * We need the spinlock on the core whos RBTREE contains chain
5120          * to protect against races.
5121          */
5122         if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
5123                 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
5124                          chain->parent == parent);
5125                 error = _hammer2_chain_delete_helper(parent, chain,
5126                                                      mtid, flags, NULL);
5127         }
5128
5129         /*
5130          * Permanent deletions mark the chain as destroyed.
5131          *
5132          * NOTE: We do not setflush the chain unless the deletion is
5133          *       permanent, since the deletion of a chain does not actually
5134          *       require it to be flushed.
5135          */
5136         if (error == 0) {
5137                 if (flags & HAMMER2_DELETE_PERMANENT) {
5138                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
5139                         hammer2_chain_setflush(chain);
5140                 }
5141         }
5142
5143         return error;
5144 }
5145
5146 static int
5147 hammer2_chain_delete_obref(hammer2_chain_t *parent, hammer2_chain_t *chain,
5148                      hammer2_tid_t mtid, int flags,
5149                      hammer2_blockref_t *obref)
5150 {
5151         int error = 0;
5152
5153         KKASSERT(hammer2_mtx_owned(&chain->lock));
5154
5155         /*
5156          * Nothing to do if already marked.
5157          *
5158          * We need the spinlock on the core whos RBTREE contains chain
5159          * to protect against races.
5160          */
5161         obref->type = HAMMER2_BREF_TYPE_EMPTY;
5162         if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
5163                 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
5164                          chain->parent == parent);
5165                 error = _hammer2_chain_delete_helper(parent, chain,
5166                                                      mtid, flags, obref);
5167         }
5168
5169         /*
5170          * Permanent deletions mark the chain as destroyed.
5171          *
5172          * NOTE: We do not setflush the chain unless the deletion is
5173          *       permanent, since the deletion of a chain does not actually
5174          *       require it to be flushed.
5175          */
5176         if (error == 0) {
5177                 if (flags & HAMMER2_DELETE_PERMANENT) {
5178                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
5179                         hammer2_chain_setflush(chain);
5180                 }
5181         }
5182
5183         return error;
5184 }
5185
5186 /*
5187  * Returns the index of the nearest element in the blockref array >= elm.
5188  * Returns (count) if no element could be found.
5189  *
5190  * Sets *key_nextp to the next key for loop purposes but does not modify
5191  * it if the next key would be higher than the current value of *key_nextp.
5192  * Note that *key_nexp can overflow to 0, which should be tested by the
5193  * caller.
5194  *
5195  * WARNING!  Must be called with parent's spinlock held.  Spinlock remains
5196  *           held through the operation.
5197  */
5198 static int
5199 hammer2_base_find(hammer2_chain_t *parent,
5200                   hammer2_blockref_t *base, int count,
5201                   hammer2_key_t *key_nextp,
5202                   hammer2_key_t key_beg, hammer2_key_t key_end)
5203 {
5204         hammer2_blockref_t *scan;
5205         hammer2_key_t scan_end;
5206         int i;
5207         int limit;
5208
5209         /*
5210          * Require the live chain's already have their core's counted
5211          * so we can optimize operations.
5212          */
5213         KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS);
5214
5215         /*
5216          * Degenerate case
5217          */
5218         if (count == 0 || base == NULL)
5219                 return(count);
5220
5221         /*
5222          * Sequential optimization using parent->cache_index.  This is
5223          * the most likely scenario.
5224          *
5225          * We can avoid trailing empty entries on live chains, otherwise
5226          * we might have to check the whole block array.
5227          */
5228         i = parent->cache_index;        /* SMP RACE OK */
5229         cpu_ccfence();
5230         limit = parent->core.live_zero;
5231         if (i >= limit)
5232                 i = limit - 1;
5233         if (i < 0)
5234                 i = 0;
5235         KKASSERT(i < count);
5236
5237         /*
5238          * Search backwards
5239          */
5240         scan = &base[i];
5241         while (i > 0 && (scan->type == HAMMER2_BREF_TYPE_EMPTY ||
5242             scan->key > key_beg)) {
5243                 --scan;
5244                 --i;
5245         }
5246         parent->cache_index = i;
5247
5248         /*
5249          * Search forwards, stop when we find a scan element which
5250          * encloses the key or until we know that there are no further
5251          * elements.
5252          */
5253         while (i < count) {
5254                 if (scan->type != HAMMER2_BREF_TYPE_EMPTY) {
5255                         scan_end = scan->key +
5256                                    ((hammer2_key_t)1 << scan->keybits) - 1;
5257                         if (scan->key > key_beg || scan_end >= key_beg)
5258                                 break;
5259                 }
5260                 if (i >= limit)
5261                         return (count);
5262                 ++scan;
5263                 ++i;
5264         }
5265         if (i != count) {
5266                 parent->cache_index = i;
5267                 if (i >= limit) {
5268                         i = count;
5269                 } else {
5270                         scan_end = scan->key +
5271                                    ((hammer2_key_t)1 << scan->keybits);
5272                         if (scan_end && (*key_nextp > scan_end ||
5273                                          *key_nextp == 0)) {
5274                                 *key_nextp = scan_end;
5275                         }
5276                 }
5277         }
5278         return (i);
5279 }
5280
5281 /*
5282  * Do a combined search and return the next match either from the blockref
5283  * array or from the in-memory chain.  Sets *bresp to the returned bref in
5284  * both cases, or sets it to NULL if the search exhausted.  Only returns
5285  * a non-NULL chain if the search matched from the in-memory chain.
5286  *
5287  * When no in-memory chain has been found and a non-NULL bref is returned
5288  * in *bresp.
5289  *
5290  *
5291  * The returned chain is not locked or referenced.  Use the returned bref
5292  * to determine if the search exhausted or not.  Iterate if the base find
5293  * is chosen but matches a deleted chain.
5294  *
5295  * WARNING!  Must be called with parent's spinlock held.  Spinlock remains
5296  *           held through the operation.
5297  */
5298 hammer2_chain_t *
5299 hammer2_combined_find(hammer2_chain_t *parent,
5300                       hammer2_blockref_t *base, int count,
5301                       hammer2_key_t *key_nextp,
5302                       hammer2_key_t key_beg, hammer2_key_t key_end,
5303                       hammer2_blockref_t **bresp)
5304 {
5305         hammer2_blockref_t *bref;
5306         hammer2_chain_t *chain;
5307         int i;
5308
5309         /*
5310          * Lookup in block array and in rbtree.
5311          */
5312         *key_nextp = key_end + 1;
5313         i = hammer2_base_find(parent, base, count, key_nextp,
5314                               key_beg, key_end);
5315         chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);
5316
5317         /*
5318          * Neither matched
5319          */
5320         if (i == count && chain == NULL) {
5321                 *bresp = NULL;
5322                 return(NULL);
5323         }
5324
5325         /*
5326          * Only chain matched.
5327          */
5328         if (i == count) {
5329                 bref = &chain->bref;
5330                 goto found;
5331         }
5332
5333         /*
5334          * Only blockref matched.
5335          */
5336         if (chain == NULL) {
5337                 bref = &base[i];
5338                 goto found;
5339         }
5340
5341         /*
5342          * Both in-memory and blockref matched, select the nearer element.
5343          *
5344          * If both are flush with the left-hand side or both are the
5345          * same distance away, select the chain.  In this situation the
5346          * chain must have been loaded from the matching blockmap.
5347          */
5348         if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
5349             chain->bref.key == base[i].key) {
5350                 KKASSERT(chain->bref.key == base[i].key);
5351                 bref = &chain->bref;
5352                 goto found;
5353         }
5354
5355         /*
5356          * Select the nearer key
5357          */
5358         if (chain->bref.key < base[i].key) {
5359                 bref = &chain->bref;
5360         } else {
5361                 bref = &base[i];
5362                 chain = NULL;
5363         }
5364
5365         /*
5366          * If the bref is out of bounds we've exhausted our search.
5367          */
5368 found:
5369         if (bref->key > key_end) {
5370                 *bresp = NULL;
5371                 chain = NULL;
5372         } else {
5373                 *bresp = bref;
5374         }
5375         return(chain);
5376 }
5377
5378 /*
5379  * Locate the specified block array element and delete it.  The element
5380  * must exist.
5381  *
5382  * The spin lock on the related chain must be held.
5383  *
5384  * NOTE: live_count was adjusted when the chain was deleted, so it does not
5385  *       need to be adjusted when we commit the media change.
5386  */
5387 void
5388 hammer2_base_delete(hammer2_chain_t *parent,
5389                     hammer2_blockref_t *base, int count,
5390                     hammer2_chain_t *chain,
5391                     hammer2_blockref_t *obref)
5392 {
5393         hammer2_blockref_t *elm = &chain->bref;
5394         hammer2_blockref_t *scan;
5395         hammer2_key_t key_next;
5396         int i;
5397
5398         /*
5399          * Delete element.  Expect the element to exist.
5400          *
5401          * XXX see caller, flush code not yet sophisticated enough to prevent
5402          *     re-flushed in some cases.
5403          */
5404         key_next = 0; /* max range */
5405         i = hammer2_base_find(parent, base, count, &key_next,
5406                               elm->key, elm->key);
5407         scan = &base[i];
5408         if (i == count || scan->type == HAMMER2_BREF_TYPE_EMPTY ||
5409             scan->key != elm->key ||
5410             ((chain->flags & HAMMER2_CHAIN_BMAPUPD) == 0 &&
5411              scan->keybits != elm->keybits)) {
5412                 hammer2_spin_unex(&parent->core.spin);
5413                 panic("delete base %p element not found at %d/%d elm %p\n",
5414                       base, i, count, elm);
5415                 return;
5416         }
5417
5418         /*
5419          * Update stats and zero the entry.
5420          *
5421          * NOTE: Handle radix == 0 (0 bytes) case.
5422          */
5423         if ((int)(scan->data_off & HAMMER2_OFF_MASK_RADIX)) {
5424                 parent->bref.embed.stats.data_count -= (hammer2_off_t)1 <<
5425                                 (int)(scan->data_off & HAMMER2_OFF_MASK_RADIX);
5426         }
5427         switch(scan->type) {
5428         case HAMMER2_BREF_TYPE_INODE:
5429                 --parent->bref.embed.stats.inode_count;
5430                 /* fall through */
5431         case HAMMER2_BREF_TYPE_DATA:
5432                 if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
5433                         atomic_set_int(&chain->flags,
5434                                        HAMMER2_CHAIN_HINT_LEAF_COUNT);
5435                 } else {
5436                         if (parent->bref.leaf_count)
5437                                 --parent->bref.leaf_count;
5438                 }
5439                 /* fall through */
5440         case HAMMER2_BREF_TYPE_INDIRECT:
5441                 if (scan->type != HAMMER2_BREF_TYPE_DATA) {
5442                         parent->bref.embed.stats.data_count -=
5443                                 scan->embed.stats.data_count;
5444                         parent->bref.embed.stats.inode_count -=
5445                                 scan->embed.stats.inode_count;
5446                 }
5447                 if (scan->type == HAMMER2_BREF_TYPE_INODE)
5448                         break;
5449                 if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
5450                         atomic_set_int(&chain->flags,
5451                                        HAMMER2_CHAIN_HINT_LEAF_COUNT);
5452                 } else {
5453                         if (parent->bref.leaf_count <= scan->leaf_count)
5454                                 parent->bref.leaf_count = 0;
5455                         else
5456                                 parent->bref.leaf_count -= scan->leaf_count;
5457                 }
5458                 break;
5459         case HAMMER2_BREF_TYPE_DIRENT:
5460                 if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
5461                         atomic_set_int(&chain->flags,
5462                                        HAMMER2_CHAIN_HINT_LEAF_COUNT);
5463                 } else {
5464                         if (parent->bref.leaf_count)
5465                                 --parent->bref.leaf_count;
5466                 }
5467         default:
5468                 break;
5469         }
5470
5471         if (obref)
5472                 *obref = *scan;
5473         bzero(scan, sizeof(*scan));
5474
5475         /*
5476          * We can only optimize parent->core.live_zero for live chains.
5477          */
5478         if (parent->core.live_zero == i + 1) {
5479                 while (--i >= 0 && base[i].type == HAMMER2_BREF_TYPE_EMPTY)
5480                         ;
5481                 parent->core.live_zero = i + 1;
5482         }
5483
5484         /*
5485          * Clear appropriate blockmap flags in chain.
5486          */
5487         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED |
5488                                         HAMMER2_CHAIN_BMAPUPD);
5489 }
5490
5491 /*
5492  * Insert the specified element.  The block array must not already have the
5493  * element and must have space available for the insertion.
5494  *
5495  * The spin lock on the related chain must be held.
5496  *
5497  * NOTE: live_count was adjusted when the chain was deleted, so it does not
5498  *       need to be adjusted when we commit the media change.
5499  */
5500 void
5501 hammer2_base_insert(hammer2_chain_t *parent,
5502                     hammer2_blockref_t *base, int count,
5503                     hammer2_chain_t *chain, hammer2_blockref_t *elm)
5504 {
5505         hammer2_key_t key_next;
5506         hammer2_key_t xkey;
5507         int i;
5508         int j;
5509         int k;
5510         int l;
5511         int u = 1;
5512
5513         /*
5514          * Insert new element.  Expect the element to not already exist
5515          * unless we are replacing it.
5516          *
5517          * XXX see caller, flush code not yet sophisticated enough to prevent
5518          *     re-flushed in some cases.
5519          */
5520         key_next = 0; /* max range */
5521         i = hammer2_base_find(parent, base, count, &key_next,
5522                               elm->key, elm->key);
5523
5524         /*
5525          * Shortcut fill optimization, typical ordered insertion(s) may not
5526          * require a search.
5527          */
5528         KKASSERT(i >= 0 && i <= count);
5529
5530         /*
5531          * Set appropriate blockmap flags in chain (if not NULL)
5532          */
5533         if (chain)
5534                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);
5535
5536         /*
5537          * Update stats and zero the entry
5538          */
5539         if ((int)(elm->data_off & HAMMER2_OFF_MASK_RADIX)) {
5540                 parent->bref.embed.stats.data_count += (hammer2_off_t)1 <<
5541                                 (int)(elm->data_off & HAMMER2_OFF_MASK_RADIX);
5542         }
5543         switch(elm->type) {
5544         case HAMMER2_BREF_TYPE_INODE:
5545                 ++parent->bref.embed.stats.inode_count;
5546                 /* fall through */
5547         case HAMMER2_BREF_TYPE_DATA:
5548                 if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
5549                         ++parent->bref.leaf_count;
5550                 /* fall through */
5551         case HAMMER2_BREF_TYPE_INDIRECT:
5552                 if (elm->type != HAMMER2_BREF_TYPE_DATA) {
5553                         parent->bref.embed.stats.data_count +=
5554                                 elm->embed.stats.data_count;
5555                         parent->bref.embed.stats.inode_count +=
5556                                 elm->embed.stats.inode_count;
5557                 }
5558                 if (elm->type == HAMMER2_BREF_TYPE_INODE)
5559                         break;
5560                 if (parent->bref.leaf_count + elm->leaf_count <
5561                     HAMMER2_BLOCKREF_LEAF_MAX) {
5562                         parent->bref.leaf_count += elm->leaf_count;
5563                 } else {
5564                         parent->bref.leaf_count = HAMMER2_BLOCKREF_LEAF_MAX;
5565                 }
5566                 break;
5567         case HAMMER2_BREF_TYPE_DIRENT:
5568                 if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
5569                         ++parent->bref.leaf_count;
5570                 break;
5571         default:
5572                 break;
5573         }
5574
5575
5576         /*
5577          * We can only optimize parent->core.live_zero for live chains.
5578          */
5579         if (i == count && parent->core.live_zero < count) {
5580                 i = parent->core.live_zero++;
5581                 base[i] = *elm;
5582                 return;
5583         }
5584
5585         xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
5586         if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
5587                 hammer2_spin_unex(&parent->core.spin);
5588                 panic("insert base %p overlapping elements at %d elm %p\n",
5589                       base, i, elm);
5590         }
5591
5592         /*
5593          * Try to find an empty slot before or after.
5594          */
5595         j = i;
5596         k = i;
5597         while (j > 0 || k < count) {
5598                 --j;
5599                 if (j >= 0 && base[j].type == HAMMER2_BREF_TYPE_EMPTY) {
5600                         if (j == i - 1) {
5601                                 base[j] = *elm;
5602                         } else {
5603                                 bcopy(&base[j+1], &base[j],
5604                                       (i - j - 1) * sizeof(*base));
5605                                 base[i - 1] = *elm;
5606                         }
5607                         goto validate;
5608                 }
5609                 ++k;
5610                 if (k < count && base[k].type == HAMMER2_BREF_TYPE_EMPTY) {
5611                         bcopy(&base[i], &base[i+1],
5612                               (k - i) * sizeof(hammer2_blockref_t));
5613                         base[i] = *elm;
5614
5615                         /*
5616                          * We can only update parent->core.live_zero for live
5617                          * chains.
5618                          */
5619                         if (parent->core.live_zero <= k)
5620                                 parent->core.live_zero = k + 1;
5621                         u = 2;
5622                         goto validate;
5623                 }
5624         }
5625         panic("hammer2_base_insert: no room!");
5626
5627         /*
5628          * Debugging
5629          */
5630 validate:
5631         key_next = 0;
5632         for (l = 0; l < count; ++l) {
5633                 if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
5634                         key_next = base[l].key +
5635                                    ((hammer2_key_t)1 << base[l].keybits) - 1;
5636                         break;
5637                 }
5638         }
5639         while (++l < count) {
5640                 if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
5641                         if (base[l].key <= key_next)
5642                                 panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l);
5643                         key_next = base[l].key +
5644                                    ((hammer2_key_t)1 << base[l].keybits) - 1;
5645
5646                 }
5647         }
5648
5649 }
5650
5651 #if 0
5652
5653 /*
5654  * Sort the blockref array for the chain.  Used by the flush code to
5655  * sort the blockref[] array.
5656  *
5657  * The chain must be exclusively locked AND spin-locked.
5658  */
5659 typedef hammer2_blockref_t *hammer2_blockref_p;
5660
5661 static
5662 int
5663 hammer2_base_sort_callback(const void *v1, const void *v2)
5664 {
5665         hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1;
5666         hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2;
5667
5668         /*
5669          * Make sure empty elements are placed at the end of the array
5670          */
5671         if (bref1->type == HAMMER2_BREF_TYPE_EMPTY) {
5672                 if (bref2->type == HAMMER2_BREF_TYPE_EMPTY)
5673                         return(0);
5674                 return(1);
5675         } else if (bref2->type == HAMMER2_BREF_TYPE_EMPTY) {
5676                 return(-1);
5677         }
5678
5679         /*
5680          * Sort by key
5681          */
5682         if (bref1->key < bref2->key)
5683                 return(-1);
5684         if (bref1->key > bref2->key)
5685                 return(1);
5686         return(0);
5687 }
5688
5689 void
5690 hammer2_base_sort(hammer2_chain_t *chain)
5691 {
5692         hammer2_blockref_t *base;
5693         int count;
5694
5695         switch(chain->bref.type) {
5696         case HAMMER2_BREF_TYPE_INODE:
5697                 /*
5698                  * Special shortcut for embedded data returns the inode
5699                  * itself.  Callers must detect this condition and access
5700                  * the embedded data (the strategy code does this for us).
5701                  *
5702                  * This is only applicable to regular files and softlinks.
5703                  */
5704                 if (chain->data->ipdata.meta.op_flags &
5705                     HAMMER2_OPFLAG_DIRECTDATA) {
5706                         return;
5707                 }
5708                 base = &chain->data->ipdata.u.blockset.blockref[0];
5709                 count = HAMMER2_SET_COUNT;
5710                 break;
5711         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
5712         case HAMMER2_BREF_TYPE_INDIRECT:
5713                 /*
5714                  * Optimize indirect blocks in the INITIAL state to avoid
5715                  * I/O.
5716                  */
5717                 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0);
5718                 base = &chain->data->npdata[0];
5719                 count = chain->bytes / sizeof(hammer2_blockref_t);
5720                 break;
5721         case HAMMER2_BREF_TYPE_VOLUME:
5722                 base = &chain->data->voldata.sroot_blockset.blockref[0];
5723                 count = HAMMER2_SET_COUNT;
5724                 break;
5725         case HAMMER2_BREF_TYPE_FREEMAP:
5726                 base = &chain->data->blkset.blockref[0];
5727                 count = HAMMER2_SET_COUNT;
5728                 break;
5729         default:
5730                 panic("hammer2_base_sort: unrecognized "
5731                       "blockref(A) type: %d",
5732                       chain->bref.type);
5733                 base = NULL;    /* safety */
5734                 count = 0;      /* safety */
5735                 break;
5736         }
5737         kqsort(base, count, sizeof(*base), hammer2_base_sort_callback);
5738 }
5739
5740 #endif
5741
5742 /*
5743  * Chain memory management
5744  */
5745 void
5746 hammer2_chain_wait(hammer2_chain_t *chain)
5747 {
5748         tsleep(chain, 0, "chnflw", 1);
5749 }
5750
5751 const hammer2_media_data_t *
5752 hammer2_chain_rdata(hammer2_chain_t *chain)
5753 {
5754         KKASSERT(chain->data != NULL);
5755         return (chain->data);
5756 }
5757
5758 hammer2_media_data_t *
5759 hammer2_chain_wdata(hammer2_chain_t *chain)
5760 {
5761         KKASSERT(chain->data != NULL);
5762         return (chain->data);
5763 }
5764
5765 /*
5766  * Set the check data for a chain.  This can be a heavy-weight operation
5767  * and typically only runs on-flush.  For file data check data is calculated
5768  * when the logical buffers are flushed.
5769  */
5770 void
5771 hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata)
5772 {
5773         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED);
5774
5775         switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
5776         case HAMMER2_CHECK_NONE:
5777                 break;
5778         case HAMMER2_CHECK_DISABLED:
5779                 break;
5780         case HAMMER2_CHECK_ISCSI32:
5781                 chain->bref.check.iscsi32.value =
5782                         hammer2_icrc32(bdata, chain->bytes);
5783                 break;
5784         case HAMMER2_CHECK_XXHASH64:
5785                 chain->bref.check.xxhash64.value =
5786                         XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED);
5787                 break;
5788         case HAMMER2_CHECK_SHA192:
5789                 {
5790                         SHA256_CTX hash_ctx;
5791                         union {
5792                                 uint8_t digest[SHA256_DIGEST_LENGTH];
5793                                 uint64_t digest64[SHA256_DIGEST_LENGTH/8];
5794                         } u;
5795
5796                         SHA256_Init(&hash_ctx);
5797                         SHA256_Update(&hash_ctx, bdata, chain->bytes);
5798                         SHA256_Final(u.digest, &hash_ctx);
5799                         u.digest64[2] ^= u.digest64[3];
5800                         bcopy(u.digest,
5801                               chain->bref.check.sha192.data,
5802                               sizeof(chain->bref.check.sha192.data));
5803                 }
5804                 break;
5805         case HAMMER2_CHECK_FREEMAP:
5806                 chain->bref.check.freemap.icrc32 =
5807                         hammer2_icrc32(bdata, chain->bytes);
5808                 break;
5809         default:
5810                 kprintf("hammer2_chain_setcheck: unknown check type %02x\n",
5811                         chain->bref.methods);
5812                 break;
5813         }
5814 }
5815
5816 /*
5817  * Characterize a failed check code and try to trace back to the inode.
5818  */
5819 static void
5820 hammer2_characterize_failed_chain(hammer2_chain_t *chain, uint64_t check,
5821                                   int bits)
5822 {
5823         hammer2_chain_t *lchain;
5824         hammer2_chain_t *ochain;
5825         int did;
5826
5827         did = krateprintf(&krate_h2chk,
5828                 "chain %016jx.%02x (%s) meth=%02x CHECK FAIL "
5829                 "(flags=%08x, bref/data ",
5830                 chain->bref.data_off,
5831                 chain->bref.type,
5832                 hammer2_bref_type_str(chain->bref.type),
5833                 chain->bref.methods,
5834                 chain->flags);
5835         if (did == 0)
5836                 return;
5837
5838         if (bits == 32) {
5839                 kprintf("%08x/%08x)\n",
5840                         chain->bref.check.iscsi32.value,
5841                         (uint32_t)check);
5842         } else {
5843                 kprintf("%016jx/%016jx)\n",
5844                         chain->bref.check.xxhash64.value,
5845                         check);
5846         }
5847
5848         /*
5849          * Run up the chains to try to find the governing inode so we
5850          * can report it.
5851          *
5852          * XXX This error reporting is not really MPSAFE
5853          */
5854         ochain = chain;
5855         lchain = chain;
5856         while (chain && chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
5857                 lchain = chain;
5858                 chain = chain->parent;
5859         }
5860
5861         if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
5862             ((chain->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) == 0 ||
5863              (lchain->bref.key & HAMMER2_DIRHASH_VISIBLE))) {
5864                 kprintf("   Resides at/in inode %ld\n",
5865                         chain->bref.key);
5866         } else if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
5867                 kprintf("   Resides in inode index - CRITICAL!!!\n");
5868         } else {
5869                 kprintf("   Resides in root index - CRITICAL!!!\n");
5870         }
5871         if (ochain->hmp) {
5872                 const char *pfsname = "UNKNOWN";
5873                 int i;
5874
5875                 if (ochain->pmp) {
5876                         for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
5877                                 if (ochain->pmp->pfs_hmps[i] == ochain->hmp &&
5878                                     ochain->pmp->pfs_names[i]) {
5879                                         pfsname = ochain->pmp->pfs_names[i];
5880                                         break;
5881                                 }
5882                         }
5883                 }
5884                 kprintf("   In pfs %s on device %s\n",
5885                         pfsname, ochain->hmp->devrepname);
5886         }
5887 }
5888
5889 /*
5890  * Returns non-zero on success, 0 on failure.
5891  */
5892 int
5893 hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata)
5894 {
5895         uint32_t check32;
5896         uint64_t check64;
5897         int r;
5898
5899         if (chain->flags & HAMMER2_CHAIN_NOTTESTED)
5900                 return 1;
5901
5902         switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
5903         case HAMMER2_CHECK_NONE:
5904                 r = 1;
5905                 break;
5906         case HAMMER2_CHECK_DISABLED:
5907                 r = 1;
5908                 break;
5909         case HAMMER2_CHECK_ISCSI32:
5910                 check32 = hammer2_icrc32(bdata, chain->bytes);
5911                 r = (chain->bref.check.iscsi32.value == check32);
5912                 if (r == 0) {
5913                         hammer2_characterize_failed_chain(chain, check32, 32);
5914                 }
5915                 hammer2_process_icrc32 += chain->bytes;
5916                 break;
5917         case HAMMER2_CHECK_XXHASH64:
5918                 check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED);
5919                 r = (chain->bref.check.xxhash64.value == check64);
5920                 if (r == 0) {
5921                         hammer2_characterize_failed_chain(chain, check64, 64);
5922                 }
5923                 hammer2_process_xxhash64 += chain->bytes;
5924                 break;
5925         case HAMMER2_CHECK_SHA192:
5926                 {
5927                         SHA256_CTX hash_ctx;
5928                         union {
5929                                 uint8_t digest[SHA256_DIGEST_LENGTH];
5930                                 uint64_t digest64[SHA256_DIGEST_LENGTH/8];
5931                         } u;
5932
5933                         SHA256_Init(&hash_ctx);
5934                         SHA256_Update(&hash_ctx, bdata, chain->bytes);
5935                         SHA256_Final(u.digest, &hash_ctx);
5936                         u.digest64[2] ^= u.digest64[3];
5937                         if (bcmp(u.digest,
5938                                  chain->bref.check.sha192.data,
5939                                  sizeof(chain->bref.check.sha192.data)) == 0) {
5940                                 r = 1;
5941                         } else {
5942                                 r = 0;
5943                                 krateprintf(&krate_h2chk,
5944                                         "chain %016jx.%02x meth=%02x "
5945                                         "CHECK FAIL\n",
5946                                         chain->bref.data_off,
5947                                         chain->bref.type,
5948                                         chain->bref.methods);
5949                         }
5950                 }
5951                 break;
5952         case HAMMER2_CHECK_FREEMAP:
5953                 r = (chain->bref.check.freemap.icrc32 ==
5954                      hammer2_icrc32(bdata, chain->bytes));
5955                 if (r == 0) {
5956                         int did;
5957
5958                         did = krateprintf(&krate_h2chk,
5959                                           "chain %016jx.%02x meth=%02x "
5960                                           "CHECK FAIL\n",
5961                                           chain->bref.data_off,
5962                                           chain->bref.type,
5963                                           chain->bref.methods);
5964                         if (did) {
5965                                 kprintf("freemap.icrc %08x icrc32 %08x (%d)\n",
5966                                         chain->bref.check.freemap.icrc32,
5967                                         hammer2_icrc32(bdata, chain->bytes),
5968                                         chain->bytes);
5969                                 if (chain->dio) {
5970                                         kprintf("dio %p buf %016jx,%d "
5971                                                 "bdata %p/%p\n",
5972                                                 chain->dio,
5973                                                 chain->dio->bp->b_loffset,
5974                                                 chain->dio->bp->b_bufsize,
5975                                                 bdata,
5976                                                 chain->dio->bp->b_data);
5977                                 }
5978                         }
5979                 }
5980                 break;
5981         default:
5982                 kprintf("hammer2_chain_testcheck: unknown check type %02x\n",
5983                         chain->bref.methods);
5984                 r = 1;
5985                 break;
5986         }
5987         return r;
5988 }
5989
5990 /*
5991  * Acquire the chain and parent representing the specified inode for the
5992  * device at the specified cluster index.
5993  *
5994  * The flags passed in are LOOKUP flags, not RESOLVE flags.
5995  *
5996  * If we are unable to locate the inode, HAMMER2_ERROR_EIO is returned and
5997  * *chainp will be NULL.  *parentp may still be set error or not, or NULL
5998  * if the parent itself could not be resolved.
5999  *
6000  * The caller may pass-in a locked *parentp and/or *chainp, or neither.
6001  * They will be unlocked and released by this function.  The *parentp and
6002  * *chainp representing the located inode are returned locked.
6003  */
6004 int
6005 hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum,
6006                          int clindex, int flags,
6007                          hammer2_chain_t **parentp, hammer2_chain_t **chainp)
6008 {
6009         hammer2_chain_t *parent;
6010         hammer2_chain_t *rchain;
6011         hammer2_key_t key_dummy;
6012         hammer2_inode_t *ip;
6013         int resolve_flags;
6014         int error;
6015
6016         resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ?
6017                         HAMMER2_RESOLVE_SHARED : 0;
6018
6019         /*
6020          * Caller expects us to replace these.
6021          */
6022         if (*chainp) {
6023                 hammer2_chain_unlock(*chainp);
6024                 hammer2_chain_drop(*chainp);
6025                 *chainp = NULL;
6026         }
6027         if (*parentp) {
6028                 hammer2_chain_unlock(*parentp);
6029                 hammer2_chain_drop(*parentp);
6030                 *parentp = NULL;
6031         }
6032
6033         /*
6034          * Be very careful, this is a backend function and we CANNOT
6035          * lock any frontend inode structure we find.  But we have to
6036          * look the inode up this way first in case it exists but is
6037          * detached from the radix tree.
6038          */
6039         ip = hammer2_inode_lookup(pmp, inum);
6040         if (ip) {
6041                 *chainp = hammer2_inode_chain_and_parent(ip, clindex,
6042                                                        parentp,
6043                                                        resolve_flags);
6044                 hammer2_inode_drop(ip);
6045                 if (*chainp)
6046                         return 0;
6047                 hammer2_chain_unlock(*chainp);
6048                 hammer2_chain_drop(*chainp);
6049                 *chainp = NULL;
6050                 if (*parentp) {
6051                         hammer2_chain_unlock(*parentp);
6052                         hammer2_chain_drop(*parentp);
6053                         *parentp = NULL;
6054                 }
6055         }
6056
6057         /*
6058          * Inodes hang off of the iroot (bit 63 is clear, differentiating
6059          * inodes from root directory entries in the key lookup).
6060          */
6061         parent = hammer2_inode_chain(pmp->iroot, clindex, resolve_flags);
6062         rchain = NULL;
6063         if (parent) {
6064                 rchain = hammer2_chain_lookup(&parent, &key_dummy,
6065                                               inum, inum,
6066                                               &error, flags);
6067         } else {
6068                 error = HAMMER2_ERROR_EIO;
6069         }
6070         *parentp = parent;
6071         *chainp = rchain;
6072
6073         return error;
6074 }
6075
6076 /*
6077  * Used by the bulkscan code to snapshot the synchronized storage for
6078  * a volume, allowing it to be scanned concurrently against normal
6079  * operation.
6080  */
6081 hammer2_chain_t *
6082 hammer2_chain_bulksnap(hammer2_dev_t *hmp)
6083 {
6084         hammer2_chain_t *copy;
6085
6086         copy = hammer2_chain_alloc(hmp, hmp->spmp, &hmp->vchain.bref);
6087         copy->data = kmalloc(sizeof(copy->data->voldata),
6088                              hmp->mchain,
6089                              M_WAITOK | M_ZERO);
6090         hammer2_voldata_lock(hmp);
6091         copy->data->voldata = hmp->volsync;
6092         hammer2_voldata_unlock(hmp);
6093
6094         return copy;
6095 }
6096
6097 void
6098 hammer2_chain_bulkdrop(hammer2_chain_t *copy)
6099 {
6100         KKASSERT(copy->bref.type == HAMMER2_BREF_TYPE_VOLUME);
6101         KKASSERT(copy->data);
6102         kfree(copy->data, copy->hmp->mchain);
6103         copy->data = NULL;
6104         atomic_add_long(&hammer2_chain_allocs, -1);
6105         hammer2_chain_drop(copy);
6106 }
6107
6108 /*
6109  * Returns non-zero if the chain (INODE or DIRENT) matches the
6110  * filename.
6111  */
6112 int
6113 hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name,
6114                           size_t name_len)
6115 {
6116         const hammer2_inode_data_t *ripdata;
6117         const hammer2_dirent_head_t *den;
6118
6119         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
6120                 ripdata = &chain->data->ipdata;
6121                 if (ripdata->meta.name_len == name_len &&
6122                     bcmp(ripdata->filename, name, name_len) == 0) {
6123                         return 1;
6124                 }
6125         }
6126         if (chain->bref.type == HAMMER2_BREF_TYPE_DIRENT &&
6127            chain->bref.embed.dirent.namlen == name_len) {
6128                 den = &chain->bref.embed.dirent;
6129                 if (name_len > sizeof(chain->bref.check.buf) &&
6130                     bcmp(chain->data->buf, name, name_len) == 0) {
6131                         return 1;
6132                 }
6133                 if (name_len <= sizeof(chain->bref.check.buf) &&
6134                     bcmp(chain->bref.check.buf, name, name_len) == 0) {
6135                         return 1;
6136                 }
6137         }
6138         return 0;
6139 }