hammer2 - refactor some chain bits, stabilization
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem implements most of the core support functions for
37  * the hammer2_chain structure.
38  *
39  * Chains are the in-memory version on media objects (volume header, inodes,
40  * indirect blocks, data blocks, etc).  Chains represent a portion of the
41  * HAMMER2 topology.
42  *
43  * Chains are no-longer delete-duplicated.  Instead, the original in-memory
44  * chain will be moved along with its block reference (e.g. for things like
45  * renames, hardlink operations, modifications, etc), and will be indexed
46  * on a secondary list for flush handling instead of propagating a flag
47  * upward to the root.
48  *
49  * Concurrent front-end operations can still run against backend flushes
50  * as long as they do not cross the current flush boundary.  An operation
51  * running above the current flush (in areas not yet flushed) can become
52  * part of the current flush while ano peration running below the current
53  * flush can become part of the next flush.
54  */
55 #include <sys/cdefs.h>
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/types.h>
59 #include <sys/lock.h>
60 #include <sys/kern_syscall.h>
61 #include <sys/uuid.h>
62
63 #include <crypto/sha2/sha2.h>
64
65 #include "hammer2.h"
66
67 static int hammer2_indirect_optimize;   /* XXX SYSCTL */
68
69 static hammer2_chain_t *hammer2_chain_create_indirect(
70                 hammer2_chain_t *parent,
71                 hammer2_key_t key, int keybits,
72                 hammer2_tid_t mtid, int for_type, int *errorp);
73 static void hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop);
74 static hammer2_chain_t *hammer2_combined_find(
75                 hammer2_chain_t *parent,
76                 hammer2_blockref_t *base, int count,
77                 int *cache_indexp, hammer2_key_t *key_nextp,
78                 hammer2_key_t key_beg, hammer2_key_t key_end,
79                 hammer2_blockref_t **bresp);
80
81 /*
82  * Basic RBTree for chains (core->rbtree and core->dbtree).  Chains cannot
83  * overlap in the RB trees.  Deleted chains are moved from rbtree to either
84  * dbtree or to dbq.
85  *
86  * Chains in delete-duplicate sequences can always iterate through core_entry
87  * to locate the live version of the chain.
88  */
89 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
90
91 int
92 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
93 {
94         hammer2_key_t c1_beg;
95         hammer2_key_t c1_end;
96         hammer2_key_t c2_beg;
97         hammer2_key_t c2_end;
98
99         /*
100          * Compare chains.  Overlaps are not supposed to happen and catch
101          * any software issues early we count overlaps as a match.
102          */
103         c1_beg = chain1->bref.key;
104         c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1;
105         c2_beg = chain2->bref.key;
106         c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1;
107
108         if (c1_end < c2_beg)    /* fully to the left */
109                 return(-1);
110         if (c1_beg > c2_end)    /* fully to the right */
111                 return(1);
112         return(0);              /* overlap (must not cross edge boundary) */
113 }
114
115 static __inline
116 int
117 hammer2_isclusterable(hammer2_chain_t *chain)
118 {
119         if (hammer2_cluster_enable) {
120                 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
121                     chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
122                     chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
123                         return(1);
124                 }
125         }
126         return(0);
127 }
128
129 /*
130  * Make a chain visible to the flusher.  The flusher needs to be able to
131  * do flushes of subdirectory chains or single files so it does a top-down
132  * recursion using the ONFLUSH flag for the recursion.  It locates MODIFIED
133  * or UPDATE chains and flushes back up the chain to the volume root.
134  *
135  * This routine sets ONFLUSH upward until it hits the volume root.  For
136  * simplicity we ignore PFSROOT boundaries whos rules can be complex.
137  * Extra ONFLUSH flagging doesn't hurt the filesystem.
138  */
139 void
140 hammer2_chain_setflush(hammer2_chain_t *chain)
141 {
142         hammer2_chain_t *parent;
143
144         if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
145                 hammer2_spin_sh(&chain->core.spin);
146                 while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
147                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
148                         if ((parent = chain->parent) == NULL)
149                                 break;
150                         hammer2_spin_sh(&parent->core.spin);
151                         hammer2_spin_unsh(&chain->core.spin);
152                         chain = parent;
153                 }
154                 hammer2_spin_unsh(&chain->core.spin);
155         }
156 }
157
158 /*
159  * Allocate a new disconnected chain element representing the specified
160  * bref.  chain->refs is set to 1 and the passed bref is copied to
161  * chain->bref.  chain->bytes is derived from the bref.
162  *
163  * chain->pmp inherits pmp unless the chain is an inode (other than the
164  * super-root inode).
165  *
166  * NOTE: Returns a referenced but unlocked (because there is no core) chain.
167  */
168 hammer2_chain_t *
169 hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp,
170                     hammer2_blockref_t *bref)
171 {
172         hammer2_chain_t *chain;
173         u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
174
175         /*
176          * Construct the appropriate system structure.
177          */
178         switch(bref->type) {
179         case HAMMER2_BREF_TYPE_INODE:
180         case HAMMER2_BREF_TYPE_INDIRECT:
181         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
182         case HAMMER2_BREF_TYPE_DATA:
183         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
184                 /*
185                  * Chain's are really only associated with the hmp but we
186                  * maintain a pmp association for per-mount memory tracking
187                  * purposes.  The pmp can be NULL.
188                  */
189                 chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
190                 break;
191         case HAMMER2_BREF_TYPE_VOLUME:
192         case HAMMER2_BREF_TYPE_FREEMAP:
193                 /*
194                  * Only hammer2_chain_bulksnap() calls this function with these
195                  * types.
196                  */
197                 chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
198                 break;
199         default:
200                 chain = NULL;
201                 panic("hammer2_chain_alloc: unrecognized blockref type: %d",
202                       bref->type);
203         }
204
205         /*
206          * Initialize the new chain structure.  pmp must be set to NULL for
207          * chains belonging to the super-root topology of a device mount.
208          */
209         if (pmp == hmp->spmp)
210                 chain->pmp = NULL;
211         else
212                 chain->pmp = pmp;
213         chain->hmp = hmp;
214         chain->bref = *bref;
215         chain->bytes = bytes;
216         chain->refs = 1;
217         chain->flags = HAMMER2_CHAIN_ALLOCATED;
218
219         /*
220          * Set the PFS boundary flag if this chain represents a PFS root.
221          */
222         if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
223                 chain->flags |= HAMMER2_CHAIN_PFSBOUNDARY;
224         hammer2_chain_core_init(chain);
225
226         return (chain);
227 }
228
229 /*
230  * Initialize a chain's core structure.  This structure used to be allocated
231  * but is now embedded.
232  *
233  * The core is not locked.  No additional refs on the chain are made.
234  * (trans) must not be NULL if (core) is not NULL.
235  */
236 void
237 hammer2_chain_core_init(hammer2_chain_t *chain)
238 {
239         /*
240          * Fresh core under nchain (no multi-homing of ochain's
241          * sub-tree).
242          */
243         RB_INIT(&chain->core.rbtree);   /* live chains */
244         hammer2_mtx_init(&chain->lock, "h2chain");
245 }
246
247 /*
248  * Add a reference to a chain element, preventing its destruction.
249  *
250  * (can be called with spinlock held)
251  */
252 void
253 hammer2_chain_ref(hammer2_chain_t *chain)
254 {
255         atomic_add_int(&chain->refs, 1);
256 #if 0
257         kprintf("REFC %p %d %08x\n", chain, chain->refs - 1, chain->flags);
258         print_backtrace(8);
259 #endif
260 }
261
262 /*
263  * Insert the chain in the core rbtree.
264  *
265  * Normal insertions are placed in the live rbtree.  Insertion of a deleted
266  * chain is a special case used by the flush code that is placed on the
267  * unstaged deleted list to avoid confusing the live view.
268  */
269 #define HAMMER2_CHAIN_INSERT_SPIN       0x0001
270 #define HAMMER2_CHAIN_INSERT_LIVE       0x0002
271 #define HAMMER2_CHAIN_INSERT_RACE       0x0004
272
273 static
274 int
275 hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain,
276                      int flags, int generation)
277 {
278         hammer2_chain_t *xchain;
279         int error = 0;
280
281         if (flags & HAMMER2_CHAIN_INSERT_SPIN)
282                 hammer2_spin_ex(&parent->core.spin);
283
284         /*
285          * Interlocked by spinlock, check for race
286          */
287         if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
288             parent->core.generation != generation) {
289                 error = EAGAIN;
290                 goto failed;
291         }
292
293         /*
294          * Insert chain
295          */
296         xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain);
297         KASSERT(xchain == NULL,
298                 ("hammer2_chain_insert: collision %p %p", chain, xchain));
299         atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
300         chain->parent = parent;
301         ++parent->core.chain_count;
302         ++parent->core.generation;      /* XXX incs for _get() too, XXX */
303
304         /*
305          * We have to keep track of the effective live-view blockref count
306          * so the create code knows when to push an indirect block.
307          */
308         if (flags & HAMMER2_CHAIN_INSERT_LIVE)
309                 atomic_add_int(&parent->core.live_count, 1);
310 failed:
311         if (flags & HAMMER2_CHAIN_INSERT_SPIN)
312                 hammer2_spin_unex(&parent->core.spin);
313         return error;
314 }
315
316 /*
317  * Drop the caller's reference to the chain.  When the ref count drops to
318  * zero this function will try to disassociate the chain from its parent and
319  * deallocate it, then recursely drop the parent using the implied ref
320  * from the chain's chain->parent.
321  */
322 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain);
323
324 void
325 hammer2_chain_drop(hammer2_chain_t *chain)
326 {
327         u_int refs;
328
329         if (hammer2_debug & 0x200000)
330                 Debugger("drop");
331 #if 0
332         kprintf("DROP %p %d %08x\n", chain, chain->refs - 1, chain->flags);
333         print_backtrace(8);
334 #endif
335
336         KKASSERT(chain->refs > 0);
337
338         while (chain) {
339                 refs = chain->refs;
340                 cpu_ccfence();
341                 KKASSERT(refs > 0);
342
343                 if (refs == 1) {
344                         chain = hammer2_chain_lastdrop(chain);
345                 } else {
346                         if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
347                                 break;
348                         /* retry the same chain */
349                 }
350         }
351 }
352
353 /*
354  * Safe handling of the 1->0 transition on chain.  Returns a chain for
355  * recursive drop or NULL, possibly returning the same chain if the atomic
356  * op fails.
357  *
358  * Whem two chains need to be recursively dropped we use the chain
359  * we would otherwise free to placehold the additional chain.  It's a bit
360  * convoluted but we can't just recurse without potentially blowing out
361  * the kernel stack.
362  *
363  * The chain cannot be freed if it has any children.
364  * The chain cannot be freed if flagged MODIFIED unless we can dispose of that.
365  * The chain cannot be freed if flagged UPDATE unless we can dispose of that.
366  *
367  * The core spinlock is allowed nest child-to-parent (not parent-to-child).
368  */
369 static
370 hammer2_chain_t *
371 hammer2_chain_lastdrop(hammer2_chain_t *chain)
372 {
373         hammer2_pfs_t *pmp;
374         hammer2_dev_t *hmp;
375         hammer2_chain_t *parent;
376         hammer2_chain_t *rdrop;
377
378         /*
379          * Critical field access.
380          */
381         hammer2_spin_ex(&chain->core.spin);
382
383         if (chain->parent) {
384                 /*
385                  * If the chain has a parent the UPDATE bit prevents scrapping
386                  * as the chain is needed to properly flush the parent.  Try
387                  * to complete the 1->0 transition and return NULL.  Retry
388                  * (return chain) if we are unable to complete the 1->0
389                  * transition, else return NULL (nothing more to do).
390                  *
391                  * If the chain has a parent the MODIFIED bit prevents
392                  * scrapping.
393                  */
394                 if (chain->flags & (HAMMER2_CHAIN_UPDATE |
395                                     HAMMER2_CHAIN_MODIFIED)) {
396                         if (atomic_cmpset_int(&chain->refs, 1, 0)) {
397                                 hammer2_spin_unex(&chain->core.spin);
398                                 chain = NULL;
399                         } else {
400                                 hammer2_spin_unex(&chain->core.spin);
401                         }
402                         return (chain);
403                 }
404                 /* spinlock still held */
405         } else {
406                 /*
407                  * The chain has no parent and can be flagged for destruction.
408                  * Since it has no parent, UPDATE can also be cleared.
409                  */
410                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
411                 if (chain->flags & HAMMER2_CHAIN_UPDATE)
412                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
413
414                 /*
415                  * If the chain has children or if it has been MODIFIED and
416                  * also recorded for DEDUP, we must still flush the chain.
417                  *
418                  * In the case where it has children, the DESTROY flag test
419                  * in the flush code will prevent unnecessary flushes of
420                  * MODIFIED chains that are not flagged DEDUP so don't worry
421                  * about that here.
422                  */
423                 if (chain->core.chain_count ||
424                     (chain->flags & (HAMMER2_CHAIN_MODIFIED |
425                                      HAMMER2_CHAIN_DEDUP)) ==
426                     (HAMMER2_CHAIN_MODIFIED |
427                      HAMMER2_CHAIN_DEDUP)) {
428                         /*
429                          * Put on flushq (should ensure refs > 1), retry
430                          * the drop.
431                          */
432                         hammer2_spin_unex(&chain->core.spin);
433                         hammer2_delayed_flush(chain);
434                         return(chain);  /* retry drop */
435                 }
436
437                 /*
438                  * Otherwise we can scrap the MODIFIED bit if it is set,
439                  * and continue along the freeing path.
440                  */
441                 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
442                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
443                         atomic_add_long(&hammer2_count_modified_chains, -1);
444                 }
445                 /* spinlock still held */
446         }
447
448         /*
449          * If any children exist we must leave the chain intact with refs == 0.
450          * They exist because chains are retained below us which have refs or
451          * may require flushing.  This case can occur when parent != NULL.
452          *
453          * Retry (return chain) if we fail to transition the refs to 0, else
454          * return NULL indication nothing more to do.
455          */
456         if (chain->core.chain_count) {
457                 if (atomic_cmpset_int(&chain->refs, 1, 0)) {
458                         hammer2_spin_unex(&chain->core.spin);
459                         chain = NULL;
460                 } else {
461                         hammer2_spin_unex(&chain->core.spin);
462                 }
463                 return (chain);
464         }
465         /* spinlock still held */
466         /* no chains left under us */
467
468         /*
469          * chain->core has no children left so no accessors can get to our
470          * chain from there.  Now we have to lock the parent core to interlock
471          * remaining possible accessors that might bump chain's refs before
472          * we can safely drop chain's refs with intent to free the chain.
473          */
474         hmp = chain->hmp;
475         pmp = chain->pmp;       /* can be NULL */
476         rdrop = NULL;
477
478         /*
479          * Spinlock the parent and try to drop the last ref on chain.
480          * On success remove chain from its parent, otherwise return NULL.
481          *
482          * (normal core locks are top-down recursive but we define core
483          *  spinlocks as bottom-up recursive, so this is safe).
484          */
485         if ((parent = chain->parent) != NULL) {
486                 hammer2_spin_ex(&parent->core.spin);
487                 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
488                         /* 1->0 transition failed, retry */
489                         hammer2_spin_unex(&parent->core.spin);
490                         hammer2_spin_unex(&chain->core.spin);
491                         return(chain);
492                 }
493
494                 /*
495                  * 1->0 transition successful, remove chain from the
496                  * parent.
497                  */
498                 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
499                         RB_REMOVE(hammer2_chain_tree,
500                                   &parent->core.rbtree, chain);
501                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
502                         --parent->core.chain_count;
503                         chain->parent = NULL;
504                 }
505
506                 /*
507                  * If our chain was the last chain in the parent's core the
508                  * core is now empty and its parent might have to be
509                  * re-dropped if it has 0 refs.
510                  */
511                 if (parent->core.chain_count == 0) {
512                         rdrop = parent;
513                         atomic_add_int(&rdrop->refs, 1);
514                         /*
515                         if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0)
516                                 rdrop = NULL;
517                         */
518                 }
519                 hammer2_spin_unex(&parent->core.spin);
520                 parent = NULL;  /* safety */
521         }
522
523         /*
524          * Successful 1->0 transition and the chain can be destroyed now.
525          *
526          * We still have the core spinlock, and core's chain_count is 0.
527          * Any parent spinlock is gone.
528          */
529         hammer2_spin_unex(&chain->core.spin);
530         KKASSERT(RB_EMPTY(&chain->core.rbtree) &&
531                  chain->core.chain_count == 0);
532
533         /*
534          * All spin locks are gone, no pointers remain to the chain, finish
535          * freeing it.
536          */
537         KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE |
538                                   HAMMER2_CHAIN_MODIFIED)) == 0);
539         hammer2_chain_drop_data(chain, 1);
540
541         KKASSERT(chain->dio == NULL);
542
543         /*
544          * Once chain resources are gone we can use the now dead chain
545          * structure to placehold what might otherwise require a recursive
546          * drop, because we have potentially two things to drop and can only
547          * return one directly.
548          */
549         if (chain->flags & HAMMER2_CHAIN_ALLOCATED) {
550                 chain->flags &= ~HAMMER2_CHAIN_ALLOCATED;
551                 chain->hmp = NULL;
552                 kfree(chain, hmp->mchain);
553         }
554
555         /*
556          * Possible chaining loop when parent re-drop needed.
557          */
558         return(rdrop);
559 }
560
561 /*
562  * On either last lock release or last drop
563  */
564 static void
565 hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop)
566 {
567         /*hammer2_dev_t *hmp = chain->hmp;*/
568
569         switch(chain->bref.type) {
570         case HAMMER2_BREF_TYPE_VOLUME:
571         case HAMMER2_BREF_TYPE_FREEMAP:
572                 if (lastdrop)
573                         chain->data = NULL;
574                 break;
575         default:
576                 KKASSERT(chain->data == NULL);
577                 break;
578         }
579 }
580
581 /*
582  * Lock a referenced chain element, acquiring its data with I/O if necessary,
583  * and specify how you would like the data to be resolved.
584  *
585  * If an I/O or other fatal error occurs, chain->error will be set to non-zero.
586  *
587  * The lock is allowed to recurse, multiple locking ops will aggregate
588  * the requested resolve types.  Once data is assigned it will not be
589  * removed until the last unlock.
590  *
591  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
592  *                         (typically used to avoid device/logical buffer
593  *                          aliasing for data)
594  *
595  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
596  *                         the INITIAL-create state (indirect blocks only).
597  *
598  *                         Do not resolve data elements for DATA chains.
599  *                         (typically used to avoid device/logical buffer
600  *                          aliasing for data)
601  *
602  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
603  *
604  * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
605  *                         it will be locked exclusive.
606  *
607  * NOTE: Embedded elements (volume header, inodes) are always resolved
608  *       regardless.
609  *
610  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
611  *       element will instantiate and zero its buffer, and flush it on
612  *       release.
613  *
614  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
615  *       so as not to instantiate a device buffer, which could alias against
616  *       a logical file buffer.  However, if ALWAYS is specified the
617  *       device buffer will be instantiated anyway.
618  *
619  * WARNING! This function blocks on I/O if data needs to be fetched.  This
620  *          blocking can run concurrent with other compatible lock holders
621  *          who do not need data returning.  The lock is not upgraded to
622  *          exclusive during a data fetch, a separate bit is used to
623  *          interlock I/O.  However, an exclusive lock holder can still count
624  *          on being interlocked against an I/O fetch managed by a shared
625  *          lock holder.
626  */
627 void
628 hammer2_chain_lock(hammer2_chain_t *chain, int how)
629 {
630         /*
631          * Ref and lock the element.  Recursive locks are allowed.
632          */
633         KKASSERT(chain->refs > 0);
634         atomic_add_int(&chain->lockcnt, 1);
635
636         /*
637          * Get the appropriate lock.
638          */
639         if (how & HAMMER2_RESOLVE_SHARED)
640                 hammer2_mtx_sh(&chain->lock);
641         else
642                 hammer2_mtx_ex(&chain->lock);
643         ++curthread->td_tracker;
644
645         /*
646          * If we already have a valid data pointer no further action is
647          * necessary.
648          */
649         if (chain->data)
650                 return;
651
652         /*
653          * Do we have to resolve the data?
654          */
655         switch(how & HAMMER2_RESOLVE_MASK) {
656         case HAMMER2_RESOLVE_NEVER:
657                 return;
658         case HAMMER2_RESOLVE_MAYBE:
659                 if (chain->flags & HAMMER2_CHAIN_INITIAL)
660                         return;
661                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
662                         return;
663 #if 0
664                 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
665                         return;
666                 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
667                         return;
668 #endif
669                 /* fall through */
670         case HAMMER2_RESOLVE_ALWAYS:
671         default:
672                 break;
673         }
674
675         /*
676          * Caller requires data
677          */
678         hammer2_chain_load_data(chain);
679 }
680
681 /*
682  * Obtains a second shared lock on the chain, does not account the second
683  * shared lock as being owned by the current thread.
684  *
685  * Caller must already own a shared lock on this chain.
686  */
687 void
688 hammer2_chain_push_shared_lock(hammer2_chain_t *chain)
689 {
690         hammer2_mtx_sh(&chain->lock);
691         atomic_add_int(&chain->lockcnt, 1);
692         /* do not count in td_tracker for this thread */
693 }
694
695 /*
696  * Accounts for a shared lock that was pushed to us as being owned by our
697  * thread.
698  */
699 void
700 hammer2_chain_pull_shared_lock(hammer2_chain_t *chain)
701 {
702         ++curthread->td_tracker;
703 }
704
705 /*
706  * Issue I/O and install chain->data.  Caller must hold a chain lock, lock
707  * may be of any type.
708  *
709  * Once chain->data is set it cannot be disposed of until all locks are
710  * released.
711  */
712 void
713 hammer2_chain_load_data(hammer2_chain_t *chain)
714 {
715         hammer2_blockref_t *bref;
716         hammer2_dev_t *hmp;
717         char *bdata;
718         int error;
719
720         /*
721          * Degenerate case, data already present.
722          */
723         if (chain->data)
724                 return;
725
726         hmp = chain->hmp;
727         KKASSERT(hmp != NULL);
728
729         /*
730          * Gain the IOINPROG bit, interlocked block.
731          */
732         for (;;) {
733                 u_int oflags;
734                 u_int nflags;
735
736                 oflags = chain->flags;
737                 cpu_ccfence();
738                 if (oflags & HAMMER2_CHAIN_IOINPROG) {
739                         nflags = oflags | HAMMER2_CHAIN_IOSIGNAL;
740                         tsleep_interlock(&chain->flags, 0);
741                         if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
742                                 tsleep(&chain->flags, PINTERLOCKED,
743                                         "h2iocw", 0);
744                         }
745                         /* retry */
746                 } else {
747                         nflags = oflags | HAMMER2_CHAIN_IOINPROG;
748                         if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
749                                 break;
750                         }
751                         /* retry */
752                 }
753         }
754
755         /*
756          * We own CHAIN_IOINPROG
757          *
758          * Degenerate case if we raced another load.
759          */
760         if (chain->data)
761                 goto done;
762
763         /*
764          * We must resolve to a device buffer, either by issuing I/O or
765          * by creating a zero-fill element.  We do not mark the buffer
766          * dirty when creating a zero-fill element (the hammer2_chain_modify()
767          * API must still be used to do that).
768          *
769          * The device buffer is variable-sized in powers of 2 down
770          * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
771          * chunk always contains buffers of the same size. (XXX)
772          *
773          * The minimum physical IO size may be larger than the variable
774          * block size.
775          */
776         bref = &chain->bref;
777
778         /*
779          * The getblk() optimization can only be used on newly created
780          * elements if the physical block size matches the request.
781          */
782         if (chain->flags & HAMMER2_CHAIN_INITIAL) {
783                 error = hammer2_io_new(hmp, bref->data_off, chain->bytes,
784                                         &chain->dio);
785         } else {
786                 error = hammer2_io_bread(hmp, bref->data_off, chain->bytes,
787                                          &chain->dio);
788                 hammer2_adjreadcounter(&chain->bref, chain->bytes);
789         }
790         if (error) {
791                 chain->error = HAMMER2_ERROR_IO;
792                 kprintf("hammer2_chain_lock: I/O error %016jx: %d\n",
793                         (intmax_t)bref->data_off, error);
794                 hammer2_io_bqrelse(&chain->dio);
795                 goto done;
796         }
797         chain->error = 0;
798
799         /*
800          * NOTE: A locked chain's data cannot be modified without first
801          *       calling hammer2_chain_modify().
802          */
803
804         /*
805          * Clear INITIAL.  In this case we used io_new() and the buffer has
806          * been zero'd and marked dirty.
807          */
808         bdata = hammer2_io_data(chain->dio, chain->bref.data_off);
809         if (chain->flags & HAMMER2_CHAIN_INITIAL) {
810                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
811                 chain->bref.flags |= HAMMER2_BREF_FLAG_ZERO;
812         } else if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
813                 /*
814                  * check data not currently synchronized due to
815                  * modification.  XXX assumes data stays in the buffer
816                  * cache, which might not be true (need biodep on flush
817                  * to calculate crc?  or simple crc?).
818                  */
819         } else {
820                 if (hammer2_chain_testcheck(chain, bdata) == 0) {
821                         kprintf("chain %016jx.%02x meth=%02x "
822                                 "CHECK FAIL %08x (flags=%08x)\n",
823                                 chain->bref.data_off,
824                                 chain->bref.type,
825                                 chain->bref.methods,
826                                 hammer2_icrc32(bdata, chain->bytes),
827                                 chain->flags);
828                         chain->error = HAMMER2_ERROR_CHECK;
829                 }
830         }
831
832         /*
833          * Setup the data pointer, either pointing it to an embedded data
834          * structure and copying the data from the buffer, or pointing it
835          * into the buffer.
836          *
837          * The buffer is not retained when copying to an embedded data
838          * structure in order to avoid potential deadlocks or recursions
839          * on the same physical buffer.
840          *
841          * WARNING! Other threads can start using the data the instant we
842          *          set chain->data non-NULL.
843          */
844         switch (bref->type) {
845         case HAMMER2_BREF_TYPE_VOLUME:
846         case HAMMER2_BREF_TYPE_FREEMAP:
847                 /*
848                  * Copy data from bp to embedded buffer
849                  */
850                 panic("hammer2_chain_lock: called on unresolved volume header");
851                 break;
852         case HAMMER2_BREF_TYPE_INODE:
853         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
854         case HAMMER2_BREF_TYPE_INDIRECT:
855         case HAMMER2_BREF_TYPE_DATA:
856         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
857         default:
858                 /*
859                  * Point data at the device buffer and leave dio intact.
860                  */
861                 chain->data = (void *)bdata;
862                 break;
863         }
864
865         /*
866          * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested.
867          */
868 done:
869         for (;;) {
870                 u_int oflags;
871                 u_int nflags;
872
873                 oflags = chain->flags;
874                 nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG |
875                                     HAMMER2_CHAIN_IOSIGNAL);
876                 KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG);
877                 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
878                         if (oflags & HAMMER2_CHAIN_IOSIGNAL)
879                                 wakeup(&chain->flags);
880                         break;
881                 }
882         }
883 }
884
885 /*
886  * Unlock and deref a chain element.
887  *
888  * On the last lock release any non-embedded data (chain->dio) will be
889  * retired.
890  */
891 void
892 hammer2_chain_unlock(hammer2_chain_t *chain)
893 {
894         hammer2_mtx_state_t ostate;
895         long *counterp;
896         u_int lockcnt;
897
898         --curthread->td_tracker;
899         /*
900          * If multiple locks are present (or being attempted) on this
901          * particular chain we can just unlock, drop refs, and return.
902          *
903          * Otherwise fall-through on the 1->0 transition.
904          */
905         for (;;) {
906                 lockcnt = chain->lockcnt;
907                 KKASSERT(lockcnt > 0);
908                 cpu_ccfence();
909                 if (lockcnt > 1) {
910                         if (atomic_cmpset_int(&chain->lockcnt,
911                                               lockcnt, lockcnt - 1)) {
912                                 hammer2_mtx_unlock(&chain->lock);
913                                 return;
914                         }
915                 } else {
916                         if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
917                                 break;
918                 }
919                 /* retry */
920         }
921
922         /*
923          * On the 1->0 transition we upgrade the core lock (if necessary)
924          * to exclusive for terminal processing.  If after upgrading we find
925          * that lockcnt is non-zero, another thread is racing us and will
926          * handle the unload for us later on, so just cleanup and return
927          * leaving the data/io intact
928          *
929          * Otherwise if lockcnt is still 0 it is possible for it to become
930          * non-zero and race, but since we hold the core->lock exclusively
931          * all that will happen is that the chain will be reloaded after we
932          * unload it.
933          */
934         ostate = hammer2_mtx_upgrade(&chain->lock);
935         if (chain->lockcnt) {
936                 hammer2_mtx_unlock(&chain->lock);
937                 return;
938         }
939
940         /*
941          * Shortcut the case if the data is embedded or not resolved.
942          * Only drop non-DIO-based data if the chain is not modified.
943          *
944          * Do NOT NULL out chain->data (e.g. inode data), it might be
945          * dirty.
946          */
947         if (chain->dio == NULL) {
948                 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
949                         hammer2_chain_drop_data(chain, 0);
950                 hammer2_mtx_unlock(&chain->lock);
951                 return;
952         }
953
954         /*
955          * Statistics
956          */
957         if (hammer2_io_isdirty(chain->dio)) {
958                 switch(chain->bref.type) {
959                 case HAMMER2_BREF_TYPE_DATA:
960                         counterp = &hammer2_iod_file_write;
961                         break;
962                 case HAMMER2_BREF_TYPE_INODE:
963                         counterp = &hammer2_iod_meta_write;
964                         break;
965                 case HAMMER2_BREF_TYPE_INDIRECT:
966                         counterp = &hammer2_iod_indr_write;
967                         break;
968                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
969                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
970                         counterp = &hammer2_iod_fmap_write;
971                         break;
972                 default:
973                         counterp = &hammer2_iod_volu_write;
974                         break;
975                 }
976                 *counterp += chain->bytes;
977         }
978
979         /*
980          * Clean out the dio.
981          *
982          * NOTE: Freemap leaf's use reserved blocks and thus no aliasing
983          *       is possible.
984          */
985         chain->data = NULL;
986         hammer2_io_bqrelse(&chain->dio);
987         hammer2_mtx_unlock(&chain->lock);
988 }
989
990 /*
991  * Helper to obtain the blockref[] array base and count for a chain.
992  *
993  * XXX Not widely used yet, various use cases need to be validated and
994  *     converted to use this function.
995  */
996 static
997 hammer2_blockref_t *
998 hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp)
999 {
1000         hammer2_blockref_t *base;
1001         int count;
1002
1003         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1004                 base = NULL;
1005
1006                 switch(parent->bref.type) {
1007                 case HAMMER2_BREF_TYPE_INODE:
1008                         count = HAMMER2_SET_COUNT;
1009                         break;
1010                 case HAMMER2_BREF_TYPE_INDIRECT:
1011                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1012                         count = parent->bytes / sizeof(hammer2_blockref_t);
1013                         break;
1014                 case HAMMER2_BREF_TYPE_VOLUME:
1015                         count = HAMMER2_SET_COUNT;
1016                         break;
1017                 case HAMMER2_BREF_TYPE_FREEMAP:
1018                         count = HAMMER2_SET_COUNT;
1019                         break;
1020                 default:
1021                         panic("hammer2_chain_create_indirect: "
1022                               "unrecognized blockref type: %d",
1023                               parent->bref.type);
1024                         count = 0;
1025                         break;
1026                 }
1027         } else {
1028                 switch(parent->bref.type) {
1029                 case HAMMER2_BREF_TYPE_INODE:
1030                         base = &parent->data->ipdata.u.blockset.blockref[0];
1031                         count = HAMMER2_SET_COUNT;
1032                         break;
1033                 case HAMMER2_BREF_TYPE_INDIRECT:
1034                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1035                         base = &parent->data->npdata[0];
1036                         count = parent->bytes / sizeof(hammer2_blockref_t);
1037                         break;
1038                 case HAMMER2_BREF_TYPE_VOLUME:
1039                         base = &parent->data->voldata.
1040                                         sroot_blockset.blockref[0];
1041                         count = HAMMER2_SET_COUNT;
1042                         break;
1043                 case HAMMER2_BREF_TYPE_FREEMAP:
1044                         base = &parent->data->blkset.blockref[0];
1045                         count = HAMMER2_SET_COUNT;
1046                         break;
1047                 default:
1048                         panic("hammer2_chain_create_indirect: "
1049                               "unrecognized blockref type: %d",
1050                               parent->bref.type);
1051                         count = 0;
1052                         break;
1053                 }
1054         }
1055         *countp = count;
1056
1057         return base;
1058 }
1059
1060 /*
1061  * This counts the number of live blockrefs in a block array and
1062  * also calculates the point at which all remaining blockrefs are empty.
1063  * This routine can only be called on a live chain (DUPLICATED flag not set).
1064  *
1065  * NOTE: Flag is not set until after the count is complete, allowing
1066  *       callers to test the flag without holding the spinlock.
1067  *
1068  * NOTE: If base is NULL the related chain is still in the INITIAL
1069  *       state and there are no blockrefs to count.
1070  *
1071  * NOTE: live_count may already have some counts accumulated due to
1072  *       creation and deletion and could even be initially negative.
1073  */
1074 void
1075 hammer2_chain_countbrefs(hammer2_chain_t *chain,
1076                          hammer2_blockref_t *base, int count)
1077 {
1078         hammer2_spin_ex(&chain->core.spin);
1079         if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) {
1080                 if (base) {
1081                         while (--count >= 0) {
1082                                 if (base[count].type)
1083                                         break;
1084                         }
1085                         chain->core.live_zero = count + 1;
1086                         while (count >= 0) {
1087                                 if (base[count].type)
1088                                         atomic_add_int(&chain->core.live_count,
1089                                                        1);
1090                                 --count;
1091                         }
1092                 } else {
1093                         chain->core.live_zero = 0;
1094                 }
1095                 /* else do not modify live_count */
1096                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS);
1097         }
1098         hammer2_spin_unex(&chain->core.spin);
1099 }
1100
1101 /*
1102  * Resize the chain's physical storage allocation in-place.  This function does
1103  * not adjust the data pointer and must be followed by (typically) a
1104  * hammer2_chain_modify() call to copy any old data over and adjust the
1105  * data pointer.
1106  *
1107  * Chains can be resized smaller without reallocating the storage.  Resizing
1108  * larger will reallocate the storage.  Excess or prior storage is reclaimed
1109  * asynchronously at a later time.
1110  *
1111  * Must be passed an exclusively locked parent and chain.
1112  *
1113  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
1114  * to avoid instantiating a device buffer that conflicts with the vnode data
1115  * buffer.  However, because H2 can compress or encrypt data, the chain may
1116  * have a dio assigned to it in those situations, and they do not conflict.
1117  *
1118  * XXX return error if cannot resize.
1119  */
1120 void
1121 hammer2_chain_resize(hammer2_inode_t *ip,
1122                      hammer2_chain_t *parent, hammer2_chain_t *chain,
1123                      hammer2_tid_t mtid, hammer2_off_t dedup_off,
1124                      int nradix, int flags)
1125 {
1126         hammer2_dev_t *hmp;
1127         size_t obytes;
1128         size_t nbytes;
1129
1130         hmp = chain->hmp;
1131
1132         /*
1133          * Only data and indirect blocks can be resized for now.
1134          * (The volu root, inodes, and freemap elements use a fixed size).
1135          */
1136         KKASSERT(chain != &hmp->vchain);
1137         KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
1138                  chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
1139         KKASSERT(chain->parent == parent);
1140
1141         /*
1142          * Nothing to do if the element is already the proper size
1143          */
1144         obytes = chain->bytes;
1145         nbytes = 1U << nradix;
1146         if (obytes == nbytes)
1147                 return;
1148
1149         /*
1150          * Make sure the old data is instantiated so we can copy it.  If this
1151          * is a data block, the device data may be superfluous since the data
1152          * might be in a logical block, but compressed or encrypted data is
1153          * another matter.
1154          *
1155          * NOTE: The modify will set BMAPUPD for us if BMAPPED is set.
1156          */
1157         hammer2_chain_modify(chain, mtid, dedup_off, 0);
1158
1159         /*
1160          * Relocate the block, even if making it smaller (because different
1161          * block sizes may be in different regions).
1162          *
1163          * (data blocks only, we aren't copying the storage here).
1164          */
1165         hammer2_freemap_alloc(chain, nbytes);
1166         chain->bytes = nbytes;
1167         /*ip->delta_dcount += (ssize_t)(nbytes - obytes);*/ /* XXX atomic */
1168
1169         /*
1170          * We don't want the followup chain_modify() to try to copy data
1171          * from the old (wrong-sized) buffer.  It won't know how much to
1172          * copy.  This case should only occur during writes when the
1173          * originator already has the data to write in-hand.
1174          */
1175         if (chain->dio) {
1176                 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA);
1177                 hammer2_io_brelse(&chain->dio);
1178                 chain->data = NULL;
1179         }
1180 }
1181
1182 /*
1183  * Set the chain modified so its data can be changed by the caller.
1184  *
1185  * Sets bref.modify_tid to mtid only if mtid != 0.  Note that bref.modify_tid
1186  * is a CLC (cluster level change) field and is not updated by parent
1187  * propagation during a flush.
1188  *
1189  * If the caller passes a non-zero dedup_off we assign data_off to that
1190  * instead of allocating a ne block.  Caller must not modify the data already
1191  * present at the target offset.
1192  */
1193 void
1194 hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
1195                      hammer2_off_t dedup_off, int flags)
1196 {
1197         hammer2_blockref_t obref;
1198         hammer2_dev_t *hmp;
1199         hammer2_io_t *dio;
1200         int error;
1201         int wasinitial;
1202         int newmod;
1203         char *bdata;
1204
1205         hmp = chain->hmp;
1206         obref = chain->bref;
1207         KKASSERT((chain->flags & HAMMER2_CHAIN_FICTITIOUS) == 0);
1208
1209         /*
1210          * Data is not optional for freemap chains (we must always be sure
1211          * to copy the data on COW storage allocations).
1212          */
1213         if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
1214             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
1215                 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
1216                          (flags & HAMMER2_MODIFY_OPTDATA) == 0);
1217         }
1218
1219         /*
1220          * Data must be resolved if already assigned, unless explicitly
1221          * flagged otherwise.
1222          */
1223         if (chain->data == NULL && (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
1224             (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
1225                 hammer2_chain_load_data(chain);
1226         }
1227
1228         /*
1229          * Set MODIFIED to indicate that the chain has been modified.
1230          * Set UPDATE to ensure that the blockref is updated in the parent.
1231          */
1232         if ((chain->flags & (HAMMER2_CHAIN_DEDUP | HAMMER2_CHAIN_MODIFIED)) ==
1233             (HAMMER2_CHAIN_DEDUP | HAMMER2_CHAIN_MODIFIED)) {
1234                 /*
1235                  * Modified already set but a new allocation is needed
1236                  * anyway because we recorded this data_off for possible
1237                  * dedup operation.
1238                  */
1239                 newmod = 1;
1240         } else if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
1241                 /*
1242                  * Must set modified bit.
1243                  */
1244                 atomic_add_long(&hammer2_count_modified_chains, 1);
1245                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
1246                 hammer2_pfs_memory_inc(chain->pmp);     /* can be NULL */
1247                 newmod = 1;
1248         } else {
1249                 /*
1250                  * Already flagged modified, no new allocation is needed.
1251                  */
1252                 newmod = 0;
1253         }
1254
1255         /*
1256          * Flag parent update required, clear DEDUP flag (already processed
1257          * above).
1258          */
1259         if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0)
1260                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
1261         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DEDUP);
1262
1263         /*
1264          * The modification or re-modification requires an allocation and
1265          * possible COW.
1266          *
1267          * If dedup_off is non-zero, caller already has a data offset
1268          * containing the caller's desired data.  The dedup offset is
1269          * allowed to be in a partially free state and we must be sure
1270          * to reset it to a fully allocated state to force two bulkfree
1271          * passes to free it again.
1272          *
1273          * XXX can a chain already be marked MODIFIED without a data
1274          * assignment?  If not, assert here instead of testing the case.
1275          */
1276         if (chain != &hmp->vchain && chain != &hmp->fchain) {
1277                 if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
1278                      newmod
1279                 ) {
1280                         if (dedup_off) {
1281                                 chain->bref.data_off = dedup_off;
1282                                 chain->bytes = 1 << (dedup_off &
1283                                                      HAMMER2_OFF_MASK_RADIX);
1284                                 atomic_set_int(&chain->flags,
1285                                                HAMMER2_CHAIN_DEDUP);
1286                                 hammer2_freemap_adjust(hmp, &chain->bref,
1287                                                 HAMMER2_FREEMAP_DORECOVER);
1288                         } else {
1289                                 hammer2_freemap_alloc(chain, chain->bytes);
1290                         }
1291                         /* XXX failed allocation */
1292                 }
1293         }
1294
1295         /*
1296          * Update mirror_tid and modify_tid.  modify_tid is only updated
1297          * if not passed as zero (during flushes, parent propagation passes
1298          * the value 0).
1299          *
1300          * NOTE: chain->pmp could be the device spmp.
1301          */
1302         chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1;
1303         if (mtid)
1304                 chain->bref.modify_tid = mtid;
1305
1306         /*
1307          * Set BMAPUPD to tell the flush code that an existing blockmap entry
1308          * requires updating as well as to tell the delete code that the
1309          * chain's blockref might not exactly match (in terms of physical size
1310          * or block offset) the one in the parent's blocktable.  The base key
1311          * of course will still match.
1312          */
1313         if (chain->flags & HAMMER2_CHAIN_BMAPPED)
1314                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD);
1315
1316         /*
1317          * Short-cut data blocks which the caller does not need an actual
1318          * data reference to (aka OPTDATA), as long as the chain does not
1319          * already have a data pointer to the data.  This generally means
1320          * that the modifications are being done via the logical buffer cache.
1321          * The INITIAL flag relates only to the device data buffer and thus
1322          * remains unchange in this situation.
1323          */
1324         if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
1325             (flags & HAMMER2_MODIFY_OPTDATA) &&
1326             chain->data == NULL) {
1327                 goto skip2;
1328         }
1329
1330         /*
1331          * Clearing the INITIAL flag (for indirect blocks) indicates that
1332          * we've processed the uninitialized storage allocation.
1333          *
1334          * If this flag is already clear we are likely in a copy-on-write
1335          * situation but we have to be sure NOT to bzero the storage if
1336          * no data is present.
1337          */
1338         if (chain->flags & HAMMER2_CHAIN_INITIAL) {
1339                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1340                 wasinitial = 1;
1341         } else {
1342                 wasinitial = 0;
1343         }
1344
1345         /*
1346          * Instantiate data buffer and possibly execute COW operation
1347          */
1348         switch(chain->bref.type) {
1349         case HAMMER2_BREF_TYPE_VOLUME:
1350         case HAMMER2_BREF_TYPE_FREEMAP:
1351                 /*
1352                  * The data is embedded, no copy-on-write operation is
1353                  * needed.
1354                  */
1355                 KKASSERT(chain->dio == NULL);
1356                 break;
1357         case HAMMER2_BREF_TYPE_INODE:
1358         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1359         case HAMMER2_BREF_TYPE_DATA:
1360         case HAMMER2_BREF_TYPE_INDIRECT:
1361         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1362                 /*
1363                  * Perform the copy-on-write operation
1364                  *
1365                  * zero-fill or copy-on-write depending on whether
1366                  * chain->data exists or not and set the dirty state for
1367                  * the new buffer.  hammer2_io_new() will handle the
1368                  * zero-fill.
1369                  *
1370                  * If a dedup_off was supplied this is an existing block
1371                  * and no COW, copy, or further modification is required.
1372                  */
1373                 KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);
1374
1375                 if (wasinitial && dedup_off == 0) {
1376                         error = hammer2_io_new(hmp, chain->bref.data_off,
1377                                                chain->bytes, &dio);
1378                 } else {
1379                         error = hammer2_io_bread(hmp, chain->bref.data_off,
1380                                                  chain->bytes, &dio);
1381                 }
1382                 hammer2_adjreadcounter(&chain->bref, chain->bytes);
1383
1384                 /*
1385                  * If an I/O error occurs make sure callers cannot accidently
1386                  * modify the old buffer's contents and corrupt the filesystem.
1387                  */
1388                 if (error) {
1389                         kprintf("hammer2_chain_modify: hmp=%p I/O error\n",
1390                                 hmp);
1391                         chain->error = HAMMER2_ERROR_IO;
1392                         hammer2_io_brelse(&dio);
1393                         hammer2_io_brelse(&chain->dio);
1394                         chain->data = NULL;
1395                         break;
1396                 }
1397                 chain->error = 0;
1398                 bdata = hammer2_io_data(dio, chain->bref.data_off);
1399
1400                 if (chain->data) {
1401                         /*
1402                          * COW (unless a dedup).
1403                          */
1404                         KKASSERT(chain->dio != NULL);
1405                         if (chain->data != (void *)bdata && dedup_off == 0) {
1406                                 bcopy(chain->data, bdata, chain->bytes);
1407                         }
1408                 } else if (wasinitial == 0) {
1409                         /*
1410                          * We have a problem.  We were asked to COW but
1411                          * we don't have any data to COW with!
1412                          */
1413                         panic("hammer2_chain_modify: having a COW %p\n",
1414                               chain);
1415                 }
1416
1417                 /*
1418                  * Retire the old buffer, replace with the new.  Dirty or
1419                  * redirty the new buffer.
1420                  *
1421                  * WARNING! The system buffer cache may have already flushed
1422                  *          the buffer, so we must be sure to [re]dirty it
1423                  *          for further modification.
1424                  *
1425                  *          If dedup_off was supplied, the caller is not
1426                  *          expected to make any further modification to the
1427                  *          buffer.
1428                  */
1429                 if (chain->dio)
1430                         hammer2_io_bqrelse(&chain->dio);
1431                 chain->data = (void *)bdata;
1432                 chain->dio = dio;
1433                 if (dedup_off == 0)
1434                         hammer2_io_setdirty(dio);
1435                 break;
1436         default:
1437                 panic("hammer2_chain_modify: illegal non-embedded type %d",
1438                       chain->bref.type);
1439                 break;
1440
1441         }
1442 skip2:
1443         /*
1444          * setflush on parent indicating that the parent must recurse down
1445          * to us.  Do not call on chain itself which might already have it
1446          * set.
1447          */
1448         if (chain->parent)
1449                 hammer2_chain_setflush(chain->parent);
1450 }
1451
1452 /*
1453  * Modify the chain associated with an inode.
1454  */
1455 void
1456 hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain,
1457                         hammer2_tid_t mtid, int flags)
1458 {
1459         hammer2_inode_modify(ip);
1460         hammer2_chain_modify(chain, mtid, 0, flags);
1461 }
1462
1463 /*
1464  * Volume header data locks
1465  */
1466 void
1467 hammer2_voldata_lock(hammer2_dev_t *hmp)
1468 {
1469         lockmgr(&hmp->vollk, LK_EXCLUSIVE);
1470 }
1471
1472 void
1473 hammer2_voldata_unlock(hammer2_dev_t *hmp)
1474 {
1475         lockmgr(&hmp->vollk, LK_RELEASE);
1476 }
1477
1478 void
1479 hammer2_voldata_modify(hammer2_dev_t *hmp)
1480 {
1481         if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) {
1482                 atomic_add_long(&hammer2_count_modified_chains, 1);
1483                 atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
1484                 hammer2_pfs_memory_inc(hmp->vchain.pmp);
1485         }
1486 }
1487
1488 /*
1489  * This function returns the chain at the nearest key within the specified
1490  * range.  The returned chain will be referenced but not locked.
1491  *
1492  * This function will recurse through chain->rbtree as necessary and will
1493  * return a *key_nextp suitable for iteration.  *key_nextp is only set if
1494  * the iteration value is less than the current value of *key_nextp.
1495  *
1496  * The caller should use (*key_nextp) to calculate the actual range of
1497  * the returned element, which will be (key_beg to *key_nextp - 1), because
1498  * there might be another element which is superior to the returned element
1499  * and overlaps it.
1500  *
1501  * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
1502  * chains continue to be returned.  On EOF (*key_nextp) may overflow since
1503  * it will wind up being (key_end + 1).
1504  *
1505  * WARNING!  Must be called with child's spinlock held.  Spinlock remains
1506  *           held through the operation.
1507  */
1508 struct hammer2_chain_find_info {
1509         hammer2_chain_t         *best;
1510         hammer2_key_t           key_beg;
1511         hammer2_key_t           key_end;
1512         hammer2_key_t           key_next;
1513 };
1514
1515 static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
1516 static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);
1517
1518 static
1519 hammer2_chain_t *
1520 hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
1521                           hammer2_key_t key_beg, hammer2_key_t key_end)
1522 {
1523         struct hammer2_chain_find_info info;
1524
1525         info.best = NULL;
1526         info.key_beg = key_beg;
1527         info.key_end = key_end;
1528         info.key_next = *key_nextp;
1529
1530         RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
1531                 hammer2_chain_find_cmp, hammer2_chain_find_callback,
1532                 &info);
1533         *key_nextp = info.key_next;
1534 #if 0
1535         kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
1536                 parent, key_beg, key_end, *key_nextp);
1537 #endif
1538
1539         return (info.best);
1540 }
1541
1542 static
1543 int
1544 hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
1545 {
1546         struct hammer2_chain_find_info *info = data;
1547         hammer2_key_t child_beg;
1548         hammer2_key_t child_end;
1549
1550         child_beg = child->bref.key;
1551         child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;
1552
1553         if (child_end < info->key_beg)
1554                 return(-1);
1555         if (child_beg > info->key_end)
1556                 return(1);
1557         return(0);
1558 }
1559
1560 static
1561 int
1562 hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
1563 {
1564         struct hammer2_chain_find_info *info = data;
1565         hammer2_chain_t *best;
1566         hammer2_key_t child_end;
1567
1568         /*
1569          * WARNING! Do not discard DUPLICATED chains, it is possible that
1570          *          we are catching an insertion half-way done.  If a
1571          *          duplicated chain turns out to be the best choice the
1572          *          caller will re-check its flags after locking it.
1573          *
1574          * WARNING! Layerq is scanned forwards, exact matches should keep
1575          *          the existing info->best.
1576          */
1577         if ((best = info->best) == NULL) {
1578                 /*
1579                  * No previous best.  Assign best
1580                  */
1581                 info->best = child;
1582         } else if (best->bref.key <= info->key_beg &&
1583                    child->bref.key <= info->key_beg) {
1584                 /*
1585                  * Illegal overlap.
1586                  */
1587                 KKASSERT(0);
1588                 /*info->best = child;*/
1589         } else if (child->bref.key < best->bref.key) {
1590                 /*
1591                  * Child has a nearer key and best is not flush with key_beg.
1592                  * Set best to child.  Truncate key_next to the old best key.
1593                  */
1594                 info->best = child;
1595                 if (info->key_next > best->bref.key || info->key_next == 0)
1596                         info->key_next = best->bref.key;
1597         } else if (child->bref.key == best->bref.key) {
1598                 /*
1599                  * If our current best is flush with the child then this
1600                  * is an illegal overlap.
1601                  *
1602                  * key_next will automatically be limited to the smaller of
1603                  * the two end-points.
1604                  */
1605                 KKASSERT(0);
1606                 info->best = child;
1607         } else {
1608                 /*
1609                  * Keep the current best but truncate key_next to the child's
1610                  * base.
1611                  *
1612                  * key_next will also automatically be limited to the smaller
1613                  * of the two end-points (probably not necessary for this case
1614                  * but we do it anyway).
1615                  */
1616                 if (info->key_next > child->bref.key || info->key_next == 0)
1617                         info->key_next = child->bref.key;
1618         }
1619
1620         /*
1621          * Always truncate key_next based on child's end-of-range.
1622          */
1623         child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
1624         if (child_end && (info->key_next > child_end || info->key_next == 0))
1625                 info->key_next = child_end;
1626
1627         return(0);
1628 }
1629
1630 /*
1631  * Retrieve the specified chain from a media blockref, creating the
1632  * in-memory chain structure which reflects it.
1633  *
1634  * To handle insertion races pass the INSERT_RACE flag along with the
1635  * generation number of the core.  NULL will be returned if the generation
1636  * number changes before we have a chance to insert the chain.  Insert
1637  * races can occur because the parent might be held shared.
1638  *
1639  * Caller must hold the parent locked shared or exclusive since we may
1640  * need the parent's bref array to find our block.
1641  *
1642  * WARNING! chain->pmp is always set to NULL for any chain representing
1643  *          part of the super-root topology.
1644  */
1645 hammer2_chain_t *
1646 hammer2_chain_get(hammer2_chain_t *parent, int generation,
1647                   hammer2_blockref_t *bref)
1648 {
1649         hammer2_dev_t *hmp = parent->hmp;
1650         hammer2_chain_t *chain;
1651         int error;
1652
1653         /*
1654          * Allocate a chain structure representing the existing media
1655          * entry.  Resulting chain has one ref and is not locked.
1656          */
1657         if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
1658                 chain = hammer2_chain_alloc(hmp, NULL, bref);
1659         else
1660                 chain = hammer2_chain_alloc(hmp, parent->pmp, bref);
1661         /* ref'd chain returned */
1662
1663         /*
1664          * Flag that the chain is in the parent's blockmap so delete/flush
1665          * knows what to do with it.
1666          */
1667         atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);
1668
1669         /*
1670          * Link the chain into its parent.  A spinlock is required to safely
1671          * access the RBTREE, and it is possible to collide with another
1672          * hammer2_chain_get() operation because the caller might only hold
1673          * a shared lock on the parent.
1674          */
1675         KKASSERT(parent->refs > 0);
1676         error = hammer2_chain_insert(parent, chain,
1677                                      HAMMER2_CHAIN_INSERT_SPIN |
1678                                      HAMMER2_CHAIN_INSERT_RACE,
1679                                      generation);
1680         if (error) {
1681                 KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
1682                 kprintf("chain %p get race\n", chain);
1683                 hammer2_chain_drop(chain);
1684                 chain = NULL;
1685         } else {
1686                 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
1687         }
1688
1689         /*
1690          * Return our new chain referenced but not locked, or NULL if
1691          * a race occurred.
1692          */
1693         return (chain);
1694 }
1695
1696 /*
1697  * Lookup initialization/completion API
1698  */
1699 hammer2_chain_t *
1700 hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
1701 {
1702         hammer2_chain_ref(parent);
1703         if (flags & HAMMER2_LOOKUP_SHARED) {
1704                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
1705                                            HAMMER2_RESOLVE_SHARED);
1706         } else {
1707                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
1708         }
1709         return (parent);
1710 }
1711
1712 void
1713 hammer2_chain_lookup_done(hammer2_chain_t *parent)
1714 {
1715         if (parent) {
1716                 hammer2_chain_unlock(parent);
1717                 hammer2_chain_drop(parent);
1718         }
1719 }
1720
1721 hammer2_chain_t *
1722 hammer2_chain_getparent(hammer2_chain_t **parentp, int how)
1723 {
1724         hammer2_chain_t *oparent;
1725         hammer2_chain_t *nparent;
1726
1727         /*
1728          * Be careful of order, oparent must be unlocked before nparent
1729          * is locked below to avoid a deadlock.
1730          */
1731         oparent = *parentp;
1732         hammer2_spin_ex(&oparent->core.spin);
1733         nparent = oparent->parent;
1734         hammer2_chain_ref(nparent);
1735         hammer2_spin_unex(&oparent->core.spin);
1736         if (oparent) {
1737                 hammer2_chain_unlock(oparent);
1738                 hammer2_chain_drop(oparent);
1739                 oparent = NULL;
1740         }
1741
1742         hammer2_chain_lock(nparent, how);
1743         *parentp = nparent;
1744
1745         return (nparent);
1746 }
1747
1748 /*
1749  * Locate the first chain whos key range overlaps (key_beg, key_end) inclusive.
1750  * (*parentp) typically points to an inode but can also point to a related
1751  * indirect block and this function will recurse upwards and find the inode
1752  * again.
1753  *
1754  * (*parentp) must be exclusively locked and referenced and can be an inode
1755  * or an existing indirect block within the inode.
1756  *
1757  * On return (*parentp) will be modified to point at the deepest parent chain
1758  * element encountered during the search, as a helper for an insertion or
1759  * deletion.   The new (*parentp) will be locked and referenced and the old
1760  * will be unlocked and dereferenced (no change if they are both the same).
1761  *
1762  * The matching chain will be returned exclusively locked.  If NOLOCK is
1763  * requested the chain will be returned only referenced.  Note that the
1764  * parent chain must always be locked shared or exclusive, matching the
1765  * HAMMER2_LOOKUP_SHARED flag.  We can conceivably lock it SHARED temporarily
1766  * when NOLOCK is specified but that complicates matters if *parentp must
1767  * inherit the chain.
1768  *
1769  * NOLOCK also implies NODATA, since an unlocked chain usually has a NULL
1770  * data pointer or can otherwise be in flux.
1771  *
1772  * NULL is returned if no match was found, but (*parentp) will still
1773  * potentially be adjusted.
1774  *
1775  * If a fatal error occurs (typically an I/O error), a dummy chain is
1776  * returned with chain->error and error-identifying information set.  This
1777  * chain will assert if you try to do anything fancy with it.
1778  *
1779  * XXX Depending on where the error occurs we should allow continued iteration.
1780  *
1781  * On return (*key_nextp) will point to an iterative value for key_beg.
1782  * (If NULL is returned (*key_nextp) is set to (key_end + 1)).
1783  *
1784  * This function will also recurse up the chain if the key is not within the
1785  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
1786  * can simply allow (*parentp) to float inside the loop.
1787  *
1788  * NOTE!  chain->data is not always resolved.  By default it will not be
1789  *        resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF.  Use
1790  *        HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/
1791  *        BREF_TYPE_DATA as the device buffer can alias the logical file
1792  *        buffer).
1793  */
1794 hammer2_chain_t *
1795 hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
1796                      hammer2_key_t key_beg, hammer2_key_t key_end,
1797                      int *cache_indexp, int flags)
1798 {
1799         hammer2_dev_t *hmp;
1800         hammer2_chain_t *parent;
1801         hammer2_chain_t *chain;
1802         hammer2_blockref_t *base;
1803         hammer2_blockref_t *bref;
1804         hammer2_blockref_t bcopy;
1805         hammer2_key_t scan_beg;
1806         hammer2_key_t scan_end;
1807         int count = 0;
1808         int how_always = HAMMER2_RESOLVE_ALWAYS;
1809         int how_maybe = HAMMER2_RESOLVE_MAYBE;
1810         int how;
1811         int generation;
1812         int maxloops = 300000;
1813
1814         if (flags & HAMMER2_LOOKUP_ALWAYS) {
1815                 how_maybe = how_always;
1816                 how = HAMMER2_RESOLVE_ALWAYS;
1817         } else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
1818                 how = HAMMER2_RESOLVE_NEVER;
1819         } else {
1820                 how = HAMMER2_RESOLVE_MAYBE;
1821         }
1822         if (flags & HAMMER2_LOOKUP_SHARED) {
1823                 how_maybe |= HAMMER2_RESOLVE_SHARED;
1824                 how_always |= HAMMER2_RESOLVE_SHARED;
1825                 how |= HAMMER2_RESOLVE_SHARED;
1826         }
1827
1828         /*
1829          * Recurse (*parentp) upward if necessary until the parent completely
1830          * encloses the key range or we hit the inode.
1831          *
1832          * This function handles races against the flusher doing a delete-
1833          * duplicate above us and re-homes the parent to the duplicate in
1834          * that case, otherwise we'd wind up recursing down a stale chain.
1835          */
1836         parent = *parentp;
1837         hmp = parent->hmp;
1838
1839         while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
1840                parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1841                 scan_beg = parent->bref.key;
1842                 scan_end = scan_beg +
1843                            ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1844                 if (key_beg >= scan_beg && key_end <= scan_end)
1845                         break;
1846                 parent = hammer2_chain_getparent(parentp, how_maybe);
1847         }
1848
1849 again:
1850         if (--maxloops == 0)
1851                 panic("hammer2_chain_lookup: maxloops");
1852         /*
1853          * Locate the blockref array.  Currently we do a fully associative
1854          * search through the array.
1855          */
1856         switch(parent->bref.type) {
1857         case HAMMER2_BREF_TYPE_INODE:
1858                 /*
1859                  * Special shortcut for embedded data returns the inode
1860                  * itself.  Callers must detect this condition and access
1861                  * the embedded data (the strategy code does this for us).
1862                  *
1863                  * This is only applicable to regular files and softlinks.
1864                  */
1865                 if (parent->data->ipdata.meta.op_flags &
1866                     HAMMER2_OPFLAG_DIRECTDATA) {
1867                         if (flags & HAMMER2_LOOKUP_NODIRECT) {
1868                                 chain = NULL;
1869                                 *key_nextp = key_end + 1;
1870                                 goto done;
1871                         }
1872                         hammer2_chain_ref(parent);
1873                         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1874                                 hammer2_chain_lock(parent, how_always);
1875                         *key_nextp = key_end + 1;
1876                         return (parent);
1877                 }
1878                 base = &parent->data->ipdata.u.blockset.blockref[0];
1879                 count = HAMMER2_SET_COUNT;
1880                 break;
1881         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1882         case HAMMER2_BREF_TYPE_INDIRECT:
1883                 /*
1884                  * Handle MATCHIND on the parent
1885                  */
1886                 if (flags & HAMMER2_LOOKUP_MATCHIND) {
1887                         scan_beg = parent->bref.key;
1888                         scan_end = scan_beg +
1889                                ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1890                         if (key_beg == scan_beg && key_end == scan_end) {
1891                                 chain = parent;
1892                                 hammer2_chain_ref(chain);
1893                                 hammer2_chain_lock(chain, how_maybe);
1894                                 *key_nextp = scan_end + 1;
1895                                 goto done;
1896                         }
1897                 }
1898                 /*
1899                  * Optimize indirect blocks in the INITIAL state to avoid
1900                  * I/O.
1901                  */
1902                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1903                         base = NULL;
1904                 } else {
1905                         if (parent->data == NULL)
1906                                 panic("parent->data is NULL");
1907                         base = &parent->data->npdata[0];
1908                 }
1909                 count = parent->bytes / sizeof(hammer2_blockref_t);
1910                 break;
1911         case HAMMER2_BREF_TYPE_VOLUME:
1912                 base = &parent->data->voldata.sroot_blockset.blockref[0];
1913                 count = HAMMER2_SET_COUNT;
1914                 break;
1915         case HAMMER2_BREF_TYPE_FREEMAP:
1916                 base = &parent->data->blkset.blockref[0];
1917                 count = HAMMER2_SET_COUNT;
1918                 break;
1919         default:
1920                 kprintf("hammer2_chain_lookup: unrecognized "
1921                         "blockref(B) type: %d",
1922                         parent->bref.type);
1923                 while (1)
1924                         tsleep(&base, 0, "dead", 0);
1925                 panic("hammer2_chain_lookup: unrecognized "
1926                       "blockref(B) type: %d",
1927                       parent->bref.type);
1928                 base = NULL;    /* safety */
1929                 count = 0;      /* safety */
1930         }
1931
1932         /*
1933          * Merged scan to find next candidate.
1934          *
1935          * hammer2_base_*() functions require the parent->core.live_* fields
1936          * to be synchronized.
1937          *
1938          * We need to hold the spinlock to access the block array and RB tree
1939          * and to interlock chain creation.
1940          */
1941         if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
1942                 hammer2_chain_countbrefs(parent, base, count);
1943
1944         /*
1945          * Combined search
1946          */
1947         hammer2_spin_ex(&parent->core.spin);
1948         chain = hammer2_combined_find(parent, base, count,
1949                                       cache_indexp, key_nextp,
1950                                       key_beg, key_end,
1951                                       &bref);
1952         generation = parent->core.generation;
1953
1954         /*
1955          * Exhausted parent chain, iterate.
1956          */
1957         if (bref == NULL) {
1958                 hammer2_spin_unex(&parent->core.spin);
1959                 if (key_beg == key_end) /* short cut single-key case */
1960                         return (NULL);
1961
1962                 /*
1963                  * Stop if we reached the end of the iteration.
1964                  */
1965                 if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
1966                     parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1967                         return (NULL);
1968                 }
1969
1970                 /*
1971                  * Calculate next key, stop if we reached the end of the
1972                  * iteration, otherwise go up one level and loop.
1973                  */
1974                 key_beg = parent->bref.key +
1975                           ((hammer2_key_t)1 << parent->bref.keybits);
1976                 if (key_beg == 0 || key_beg > key_end)
1977                         return (NULL);
1978                 parent = hammer2_chain_getparent(parentp, how_maybe);
1979                 goto again;
1980         }
1981
1982         /*
1983          * Selected from blockref or in-memory chain.
1984          */
1985         if (chain == NULL) {
1986                 bcopy = *bref;
1987                 hammer2_spin_unex(&parent->core.spin);
1988                 chain = hammer2_chain_get(parent, generation,
1989                                           &bcopy);
1990                 if (chain == NULL) {
1991                         kprintf("retry lookup parent %p keys %016jx:%016jx\n",
1992                                 parent, key_beg, key_end);
1993                         goto again;
1994                 }
1995                 if (bcmp(&bcopy, bref, sizeof(bcopy))) {
1996                         hammer2_chain_drop(chain);
1997                         goto again;
1998                 }
1999         } else {
2000                 hammer2_chain_ref(chain);
2001                 hammer2_spin_unex(&parent->core.spin);
2002         }
2003
2004         /*
2005          * chain is referenced but not locked.  We must lock the chain
2006          * to obtain definitive DUPLICATED/DELETED state
2007          */
2008         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
2009             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2010                 hammer2_chain_lock(chain, how_maybe);
2011         } else {
2012                 hammer2_chain_lock(chain, how);
2013         }
2014
2015         /*
2016          * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
2017          *
2018          * NOTE: Chain's key range is not relevant as there might be
2019          *       one-offs within the range that are not deleted.
2020          *
2021          * NOTE: Lookups can race delete-duplicate because
2022          *       delete-duplicate does not lock the parent's core
2023          *       (they just use the spinlock on the core).  We must
2024          *       check for races by comparing the DUPLICATED flag before
2025          *       releasing the spinlock with the flag after locking the
2026          *       chain.
2027          */
2028         if (chain->flags & HAMMER2_CHAIN_DELETED) {
2029                 hammer2_chain_unlock(chain);
2030                 hammer2_chain_drop(chain);
2031                 key_beg = *key_nextp;
2032                 if (key_beg == 0 || key_beg > key_end)
2033                         return(NULL);
2034                 goto again;
2035         }
2036
2037         /*
2038          * If the chain element is an indirect block it becomes the new
2039          * parent and we loop on it.  We must maintain our top-down locks
2040          * to prevent the flusher from interfering (i.e. doing a
2041          * delete-duplicate and leaving us recursing down a deleted chain).
2042          *
2043          * The parent always has to be locked with at least RESOLVE_MAYBE
2044          * so we can access its data.  It might need a fixup if the caller
2045          * passed incompatible flags.  Be careful not to cause a deadlock
2046          * as a data-load requires an exclusive lock.
2047          *
2048          * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
2049          * range is within the requested key range we return the indirect
2050          * block and do NOT loop.  This is usually only used to acquire
2051          * freemap nodes.
2052          */
2053         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
2054             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2055                 hammer2_chain_unlock(parent);
2056                 hammer2_chain_drop(parent);
2057                 *parentp = parent = chain;
2058                 goto again;
2059         }
2060 done:
2061         /*
2062          * All done, return the chain.
2063          *
2064          * If the caller does not want a locked chain, replace the lock with
2065          * a ref.  Perhaps this can eventually be optimized to not obtain the
2066          * lock in the first place for situations where the data does not
2067          * need to be resolved.
2068          */
2069         if (chain) {
2070                 if (flags & HAMMER2_LOOKUP_NOLOCK)
2071                         hammer2_chain_unlock(chain);
2072         }
2073
2074         return (chain);
2075 }
2076
2077 /*
2078  * After having issued a lookup we can iterate all matching keys.
2079  *
2080  * If chain is non-NULL we continue the iteration from just after it's index.
2081  *
2082  * If chain is NULL we assume the parent was exhausted and continue the
2083  * iteration at the next parent.
2084  *
2085  * If a fatal error occurs (typically an I/O error), a dummy chain is
2086  * returned with chain->error and error-identifying information set.  This
2087  * chain will assert if you try to do anything fancy with it.
2088  *
2089  * XXX Depending on where the error occurs we should allow continued iteration.
2090  *
2091  * parent must be locked on entry and remains locked throughout.  chain's
2092  * lock status must match flags.  Chain is always at least referenced.
2093  *
2094  * WARNING!  The MATCHIND flag does not apply to this function.
2095  */
2096 hammer2_chain_t *
2097 hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
2098                    hammer2_key_t *key_nextp,
2099                    hammer2_key_t key_beg, hammer2_key_t key_end,
2100                    int *cache_indexp, int flags)
2101 {
2102         hammer2_chain_t *parent;
2103         int how_maybe;
2104
2105         /*
2106          * Calculate locking flags for upward recursion.
2107          */
2108         how_maybe = HAMMER2_RESOLVE_MAYBE;
2109         if (flags & HAMMER2_LOOKUP_SHARED)
2110                 how_maybe |= HAMMER2_RESOLVE_SHARED;
2111
2112         parent = *parentp;
2113
2114         /*
2115          * Calculate the next index and recalculate the parent if necessary.
2116          */
2117         if (chain) {
2118                 key_beg = chain->bref.key +
2119                           ((hammer2_key_t)1 << chain->bref.keybits);
2120                 if ((flags & (HAMMER2_LOOKUP_NOLOCK |
2121                               HAMMER2_LOOKUP_NOUNLOCK)) == 0) {
2122                         hammer2_chain_unlock(chain);
2123                 }
2124                 hammer2_chain_drop(chain);
2125
2126                 /*
2127                  * chain invalid past this point, but we can still do a
2128                  * pointer comparison w/parent.
2129                  *
2130                  * Any scan where the lookup returned degenerate data embedded
2131                  * in the inode has an invalid index and must terminate.
2132                  */
2133                 if (chain == parent)
2134                         return(NULL);
2135                 if (key_beg == 0 || key_beg > key_end)
2136                         return(NULL);
2137                 chain = NULL;
2138         } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
2139                    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2140                 /*
2141                  * We reached the end of the iteration.
2142                  */
2143                 return (NULL);
2144         } else {
2145                 /*
2146                  * Continue iteration with next parent unless the current
2147                  * parent covers the range.
2148                  */
2149                 key_beg = parent->bref.key +
2150                           ((hammer2_key_t)1 << parent->bref.keybits);
2151                 if (key_beg == 0 || key_beg > key_end)
2152                         return (NULL);
2153                 parent = hammer2_chain_getparent(parentp, how_maybe);
2154         }
2155
2156         /*
2157          * And execute
2158          */
2159         return (hammer2_chain_lookup(parentp, key_nextp,
2160                                      key_beg, key_end,
2161                                      cache_indexp, flags));
2162 }
2163
2164 /*
2165  * The raw scan function is similar to lookup/next but does not seek to a key.
2166  * Blockrefs are iterated via first_bref = (parent, NULL) and
2167  * next_chain = (parent, bref).
2168  *
2169  * The passed-in parent must be locked and its data resolved.  The function
2170  * nominally returns a locked and referenced *chainp != NULL for chains
2171  * the caller might need to recurse on (and will dipose of any *chainp passed
2172  * in).  The caller must check the chain->bref.type either way.
2173  *
2174  * *chainp is not set for leaf elements.
2175  *
2176  * This function takes a pointer to a stack-based bref structure whos
2177  * contents is updated for each iteration.  The same pointer is returned,
2178  * or NULL when the iteration is complete.  *firstp must be set to 1 for
2179  * the first ieration.  This function will set it to 0.
2180  */
2181 hammer2_blockref_t *
2182 hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
2183                    hammer2_blockref_t *bref, int *firstp,
2184                    int *cache_indexp, int flags)
2185 {
2186         hammer2_dev_t *hmp;
2187         hammer2_blockref_t *base;
2188         hammer2_blockref_t *bref_ptr;
2189         hammer2_key_t key;
2190         hammer2_key_t next_key;
2191         hammer2_chain_t *chain = NULL;
2192         int count = 0;
2193         int how_always = HAMMER2_RESOLVE_ALWAYS;
2194         int how_maybe = HAMMER2_RESOLVE_MAYBE;
2195         int how;
2196         int generation;
2197         int maxloops = 300000;
2198
2199         hmp = parent->hmp;
2200
2201         /*
2202          * Scan flags borrowed from lookup.
2203          */
2204         if (flags & HAMMER2_LOOKUP_ALWAYS) {
2205                 how_maybe = how_always;
2206                 how = HAMMER2_RESOLVE_ALWAYS;
2207         } else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
2208                 how = HAMMER2_RESOLVE_NEVER;
2209         } else {
2210                 how = HAMMER2_RESOLVE_MAYBE;
2211         }
2212         if (flags & HAMMER2_LOOKUP_SHARED) {
2213                 how_maybe |= HAMMER2_RESOLVE_SHARED;
2214                 how_always |= HAMMER2_RESOLVE_SHARED;
2215                 how |= HAMMER2_RESOLVE_SHARED;
2216         }
2217
2218         /*
2219          * Calculate key to locate first/next element, unlocking the previous
2220          * element as we go.  Be careful, the key calculation can overflow.
2221          *
2222          * (also reset bref to NULL)
2223          */
2224         if (*firstp) {
2225                 key = 0;
2226                 *firstp = 0;
2227         } else {
2228                 key = bref->key + ((hammer2_key_t)1 << bref->keybits);
2229                 if ((chain = *chainp) != NULL) {
2230                         *chainp = NULL;
2231                         hammer2_chain_unlock(chain);
2232                         hammer2_chain_drop(chain);
2233                         chain = NULL;
2234                 }
2235                 if (key == 0) {
2236                         bref = NULL;
2237                         goto done;
2238                 }
2239         }
2240
2241 again:
2242         KKASSERT(parent->error == 0);   /* XXX case not handled yet */
2243         if (--maxloops == 0)
2244                 panic("hammer2_chain_scan: maxloops");
2245         /*
2246          * Locate the blockref array.  Currently we do a fully associative
2247          * search through the array.
2248          */
2249         switch(parent->bref.type) {
2250         case HAMMER2_BREF_TYPE_INODE:
2251                 /*
2252                  * An inode with embedded data has no sub-chains.
2253                  *
2254                  * WARNING! Bulk scan code may pass a static chain marked
2255                  *          as BREF_TYPE_INODE with a copy of the volume
2256                  *          root blockset to snapshot the volume.
2257                  */
2258                 if (parent->data->ipdata.meta.op_flags &
2259                     HAMMER2_OPFLAG_DIRECTDATA) {
2260                         bref = NULL;
2261                         goto done;
2262                 }
2263                 base = &parent->data->ipdata.u.blockset.blockref[0];
2264                 count = HAMMER2_SET_COUNT;
2265                 break;
2266         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2267         case HAMMER2_BREF_TYPE_INDIRECT:
2268                 /*
2269                  * Optimize indirect blocks in the INITIAL state to avoid
2270                  * I/O.
2271                  */
2272                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2273                         base = NULL;
2274                 } else {
2275                         if (parent->data == NULL)
2276                                 panic("parent->data is NULL");
2277                         base = &parent->data->npdata[0];
2278                 }
2279                 count = parent->bytes / sizeof(hammer2_blockref_t);
2280                 break;
2281         case HAMMER2_BREF_TYPE_VOLUME:
2282                 base = &parent->data->voldata.sroot_blockset.blockref[0];
2283                 count = HAMMER2_SET_COUNT;
2284                 break;
2285         case HAMMER2_BREF_TYPE_FREEMAP:
2286                 base = &parent->data->blkset.blockref[0];
2287                 count = HAMMER2_SET_COUNT;
2288                 break;
2289         default:
2290                 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
2291                       parent->bref.type);
2292                 base = NULL;    /* safety */
2293                 count = 0;      /* safety */
2294         }
2295
2296         /*
2297          * Merged scan to find next candidate.
2298          *
2299          * hammer2_base_*() functions require the parent->core.live_* fields
2300          * to be synchronized.
2301          *
2302          * We need to hold the spinlock to access the block array and RB tree
2303          * and to interlock chain creation.
2304          */
2305         if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
2306                 hammer2_chain_countbrefs(parent, base, count);
2307
2308         next_key = 0;
2309         bref_ptr = NULL;
2310         hammer2_spin_ex(&parent->core.spin);
2311         chain = hammer2_combined_find(parent, base, count,
2312                                       cache_indexp, &next_key,
2313                                       key, HAMMER2_KEY_MAX,
2314                                       &bref_ptr);
2315         generation = parent->core.generation;
2316
2317         /*
2318          * Exhausted parent chain, we're done.
2319          */
2320         if (bref_ptr == NULL) {
2321                 hammer2_spin_unex(&parent->core.spin);
2322                 KKASSERT(chain == NULL);
2323                 bref = NULL;
2324                 goto done;
2325         }
2326
2327         /*
2328          * Copy into the supplied stack-based blockref.
2329          */
2330         *bref = *bref_ptr;
2331
2332         /*
2333          * Selected from blockref or in-memory chain.
2334          */
2335         if (chain == NULL) {
2336                 switch(bref->type) {
2337                 case HAMMER2_BREF_TYPE_INODE:
2338                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2339                 case HAMMER2_BREF_TYPE_INDIRECT:
2340                 case HAMMER2_BREF_TYPE_VOLUME:
2341                 case HAMMER2_BREF_TYPE_FREEMAP:
2342                         /*
2343                          * Recursion, always get the chain
2344                          */
2345                         hammer2_spin_unex(&parent->core.spin);
2346                         chain = hammer2_chain_get(parent, generation, bref);
2347                         if (chain == NULL) {
2348                                 kprintf("retry scan parent %p keys %016jx\n",
2349                                         parent, key);
2350                                 goto again;
2351                         }
2352                         if (bcmp(bref, bref_ptr, sizeof(*bref))) {
2353                                 hammer2_chain_drop(chain);
2354                                 chain = NULL;
2355                                 goto again;
2356                         }
2357                         break;
2358                 default:
2359                         /*
2360                          * No recursion, do not waste time instantiating
2361                          * a chain, just iterate using the bref.
2362                          */
2363                         hammer2_spin_unex(&parent->core.spin);
2364                         break;
2365                 }
2366         } else {
2367                 /*
2368                  * Recursion or not we need the chain in order to supply
2369                  * the bref.
2370                  */
2371                 hammer2_chain_ref(chain);
2372                 hammer2_spin_unex(&parent->core.spin);
2373         }
2374
2375         /*
2376          * chain is referenced but not locked.  We must lock the chain
2377          * to obtain definitive DUPLICATED/DELETED state
2378          */
2379         if (chain)
2380                 hammer2_chain_lock(chain, how);
2381
2382         /*
2383          * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
2384          *
2385          * NOTE: chain's key range is not relevant as there might be
2386          *       one-offs within the range that are not deleted.
2387          *
2388          * NOTE: XXX this could create problems with scans used in
2389          *       situations other than mount-time recovery.
2390          *
2391          * NOTE: Lookups can race delete-duplicate because
2392          *       delete-duplicate does not lock the parent's core
2393          *       (they just use the spinlock on the core).  We must
2394          *       check for races by comparing the DUPLICATED flag before
2395          *       releasing the spinlock with the flag after locking the
2396          *       chain.
2397          */
2398         if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
2399                 hammer2_chain_unlock(chain);
2400                 hammer2_chain_drop(chain);
2401                 chain = NULL;
2402
2403                 key = next_key;
2404                 if (key == 0) {
2405                         bref = NULL;
2406                         goto done;
2407                 }
2408                 goto again;
2409         }
2410
2411 done:
2412         /*
2413          * All done, return the bref or NULL, supply chain if necessary.
2414          */
2415         if (chain)
2416                 *chainp = chain;
2417         return (bref);
2418 }
2419
2420 /*
2421  * Create and return a new hammer2 system memory structure of the specified
2422  * key, type and size and insert it under (*parentp).  This is a full
2423  * insertion, based on the supplied key/keybits, and may involve creating
2424  * indirect blocks and moving other chains around via delete/duplicate.
2425  *
2426  * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
2427  * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
2428  * FULL.  This typically means that the caller is creating the chain after
2429  * doing a hammer2_chain_lookup().
2430  *
2431  * (*parentp) must be exclusive locked and may be replaced on return
2432  * depending on how much work the function had to do.
2433  *
2434  * (*parentp) must not be errored or this function will assert.
2435  *
2436  * (*chainp) usually starts out NULL and returns the newly created chain,
2437  * but if the caller desires the caller may allocate a disconnected chain
2438  * and pass it in instead.
2439  *
2440  * This function should NOT be used to insert INDIRECT blocks.  It is
2441  * typically used to create/insert inodes and data blocks.
2442  *
2443  * Caller must pass-in an exclusively locked parent the new chain is to
2444  * be inserted under, and optionally pass-in a disconnected, exclusively
2445  * locked chain to insert (else we create a new chain).  The function will
2446  * adjust (*parentp) as necessary, create or connect the chain, and
2447  * return an exclusively locked chain in *chainp.
2448  *
2449  * When creating a PFSROOT inode under the super-root, pmp is typically NULL
2450  * and will be reassigned.
2451  */
2452 int
2453 hammer2_chain_create(hammer2_chain_t **parentp,
2454                      hammer2_chain_t **chainp, hammer2_pfs_t *pmp,
2455                      hammer2_key_t key, int keybits, int type, size_t bytes,
2456                      hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags)
2457 {
2458         hammer2_dev_t *hmp;
2459         hammer2_chain_t *chain;
2460         hammer2_chain_t *parent;
2461         hammer2_blockref_t *base;
2462         hammer2_blockref_t dummy;
2463         int allocated = 0;
2464         int error = 0;
2465         int count;
2466         int maxloops = 300000;
2467
2468         /*
2469          * Topology may be crossing a PFS boundary.
2470          */
2471         parent = *parentp;
2472         KKASSERT(hammer2_mtx_owned(&parent->lock));
2473         KKASSERT(parent->error == 0);
2474         hmp = parent->hmp;
2475         chain = *chainp;
2476
2477         if (chain == NULL) {
2478                 /*
2479                  * First allocate media space and construct the dummy bref,
2480                  * then allocate the in-memory chain structure.  Set the
2481                  * INITIAL flag for fresh chains which do not have embedded
2482                  * data.
2483                  *
2484                  * XXX for now set the check mode of the child based on
2485                  *     the parent or, if the parent is an inode, the
2486                  *     specification in the inode.
2487                  */
2488                 bzero(&dummy, sizeof(dummy));
2489                 dummy.type = type;
2490                 dummy.key = key;
2491                 dummy.keybits = keybits;
2492                 dummy.data_off = hammer2_getradix(bytes);
2493                 dummy.methods = parent->bref.methods;
2494                 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE &&
2495                     parent->data) {
2496                         dummy.methods &= ~HAMMER2_ENC_CHECK(-1);
2497                         dummy.methods |= HAMMER2_ENC_CHECK(
2498                                           parent->data->ipdata.meta.check_algo);
2499                 }
2500
2501                 chain = hammer2_chain_alloc(hmp, pmp, &dummy);
2502
2503                 /*
2504                  * Lock the chain manually, chain_lock will load the chain
2505                  * which we do NOT want to do.  (note: chain->refs is set
2506                  * to 1 by chain_alloc() for us, but lockcnt is not).
2507                  */
2508                 chain->lockcnt = 1;
2509                 hammer2_mtx_ex(&chain->lock);
2510                 allocated = 1;
2511                 ++curthread->td_tracker;
2512
2513                 /*
2514                  * Set INITIAL to optimize I/O.  The flag will generally be
2515                  * processed when we call hammer2_chain_modify().
2516                  *
2517                  * Recalculate bytes to reflect the actual media block
2518                  * allocation.
2519                  */
2520                 bytes = (hammer2_off_t)1 <<
2521                         (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
2522                 chain->bytes = bytes;
2523
2524                 switch(type) {
2525                 case HAMMER2_BREF_TYPE_VOLUME:
2526                 case HAMMER2_BREF_TYPE_FREEMAP:
2527                         panic("hammer2_chain_create: called with volume type");
2528                         break;
2529                 case HAMMER2_BREF_TYPE_INDIRECT:
2530                         panic("hammer2_chain_create: cannot be used to"
2531                               "create indirect block");
2532                         break;
2533                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2534                         panic("hammer2_chain_create: cannot be used to"
2535                               "create freemap root or node");
2536                         break;
2537                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2538                         KKASSERT(bytes == sizeof(chain->data->bmdata));
2539                         /* fall through */
2540                 case HAMMER2_BREF_TYPE_INODE:
2541                 case HAMMER2_BREF_TYPE_DATA:
2542                 default:
2543                         /*
2544                          * leave chain->data NULL, set INITIAL
2545                          */
2546                         KKASSERT(chain->data == NULL);
2547                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
2548                         break;
2549                 }
2550         } else {
2551                 /*
2552                  * We are reattaching a previously deleted chain, possibly
2553                  * under a new parent and possibly with a new key/keybits.
2554                  * The chain does not have to be in a modified state.  The
2555                  * UPDATE flag will be set later on in this routine.
2556                  *
2557                  * Do NOT mess with the current state of the INITIAL flag.
2558                  */
2559                 chain->bref.key = key;
2560                 chain->bref.keybits = keybits;
2561                 if (chain->flags & HAMMER2_CHAIN_DELETED)
2562                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
2563                 KKASSERT(chain->parent == NULL);
2564         }
2565         if (flags & HAMMER2_INSERT_PFSROOT)
2566                 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
2567         else
2568                 chain->bref.flags &= ~HAMMER2_BREF_FLAG_PFSROOT;
2569
2570         /*
2571          * Calculate how many entries we have in the blockref array and
2572          * determine if an indirect block is required.
2573          */
2574 again:
2575         if (--maxloops == 0)
2576                 panic("hammer2_chain_create: maxloops");
2577
2578         switch(parent->bref.type) {
2579         case HAMMER2_BREF_TYPE_INODE:
2580                 KKASSERT((parent->data->ipdata.meta.op_flags &
2581                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
2582                 KKASSERT(parent->data != NULL);
2583                 base = &parent->data->ipdata.u.blockset.blockref[0];
2584                 count = HAMMER2_SET_COUNT;
2585                 break;
2586         case HAMMER2_BREF_TYPE_INDIRECT:
2587         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2588                 if (parent->flags & HAMMER2_CHAIN_INITIAL)
2589                         base = NULL;
2590                 else
2591                         base = &parent->data->npdata[0];
2592                 count = parent->bytes / sizeof(hammer2_blockref_t);
2593                 break;
2594         case HAMMER2_BREF_TYPE_VOLUME:
2595                 KKASSERT(parent->data != NULL);
2596                 base = &parent->data->voldata.sroot_blockset.blockref[0];
2597                 count = HAMMER2_SET_COUNT;
2598                 break;
2599         case HAMMER2_BREF_TYPE_FREEMAP:
2600                 KKASSERT(parent->data != NULL);
2601                 base = &parent->data->blkset.blockref[0];
2602                 count = HAMMER2_SET_COUNT;
2603                 break;
2604         default:
2605                 panic("hammer2_chain_create: unrecognized blockref type: %d",
2606                       parent->bref.type);
2607                 base = NULL;
2608                 count = 0;
2609                 break;
2610         }
2611
2612         /*
2613          * Make sure we've counted the brefs
2614          */
2615         if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
2616                 hammer2_chain_countbrefs(parent, base, count);
2617
2618         KKASSERT(parent->core.live_count >= 0 &&
2619                  parent->core.live_count <= count);
2620
2621         /*
2622          * If no free blockref could be found we must create an indirect
2623          * block and move a number of blockrefs into it.  With the parent
2624          * locked we can safely lock each child in order to delete+duplicate
2625          * it without causing a deadlock.
2626          *
2627          * This may return the new indirect block or the old parent depending
2628          * on where the key falls.  NULL is returned on error.
2629          */
2630         if (parent->core.live_count == count) {
2631                 hammer2_chain_t *nparent;
2632
2633                 nparent = hammer2_chain_create_indirect(parent, key, keybits,
2634                                                         mtid, type, &error);
2635                 if (nparent == NULL) {
2636                         if (allocated)
2637                                 hammer2_chain_drop(chain);
2638                         chain = NULL;
2639                         goto done;
2640                 }
2641                 if (parent != nparent) {
2642                         hammer2_chain_unlock(parent);
2643                         hammer2_chain_drop(parent);
2644                         parent = *parentp = nparent;
2645                 }
2646                 goto again;
2647         }
2648
2649         /*
2650          * Link the chain into its parent.
2651          */
2652         if (chain->parent != NULL)
2653                 panic("hammer2: hammer2_chain_create: chain already connected");
2654         KKASSERT(chain->parent == NULL);
2655         hammer2_chain_insert(parent, chain,
2656                              HAMMER2_CHAIN_INSERT_SPIN |
2657                              HAMMER2_CHAIN_INSERT_LIVE,
2658                              0);
2659
2660         if (allocated) {
2661                 /*
2662                  * Mark the newly created chain modified.  This will cause
2663                  * UPDATE to be set and process the INITIAL flag.
2664                  *
2665                  * Device buffers are not instantiated for DATA elements
2666                  * as these are handled by logical buffers.
2667                  *
2668                  * Indirect and freemap node indirect blocks are handled
2669                  * by hammer2_chain_create_indirect() and not by this
2670                  * function.
2671                  *
2672                  * Data for all other bref types is expected to be
2673                  * instantiated (INODE, LEAF).
2674                  */
2675                 switch(chain->bref.type) {
2676                 case HAMMER2_BREF_TYPE_DATA:
2677                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2678                 case HAMMER2_BREF_TYPE_INODE:
2679                         hammer2_chain_modify(chain, mtid, dedup_off,
2680                                              HAMMER2_MODIFY_OPTDATA);
2681                         break;
2682                 default:
2683                         /*
2684                          * Remaining types are not supported by this function.
2685                          * In particular, INDIRECT and LEAF_NODE types are
2686                          * handled by create_indirect().
2687                          */
2688                         panic("hammer2_chain_create: bad type: %d",
2689                               chain->bref.type);
2690                         /* NOT REACHED */
2691                         break;
2692                 }
2693         } else {
2694                 /*
2695                  * When reconnecting a chain we must set UPDATE and
2696                  * setflush so the flush recognizes that it must update
2697                  * the bref in the parent.
2698                  */
2699                 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0)
2700                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
2701         }
2702
2703         /*
2704          * We must setflush(parent) to ensure that it recurses through to
2705          * chain.  setflush(chain) might not work because ONFLUSH is possibly
2706          * already set in the chain (so it won't recurse up to set it in the
2707          * parent).
2708          */
2709         hammer2_chain_setflush(parent);
2710
2711 done:
2712         *chainp = chain;
2713
2714         return (error);
2715 }
2716
2717 /*
2718  * Move the chain from its old parent to a new parent.  The chain must have
2719  * already been deleted or already disconnected (or never associated) with
2720  * a parent.  The chain is reassociated with the new parent and the deleted
2721  * flag will be cleared (no longer deleted).  The chain's modification state
2722  * is not altered.
2723  *
2724  * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION
2725  * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
2726  * FULL.  This typically means that the caller is creating the chain after
2727  * doing a hammer2_chain_lookup().
2728  *
2729  * A non-NULL bref is typically passed when key and keybits must be overridden.
2730  * Note that hammer2_cluster_duplicate() *ONLY* uses the key and keybits fields
2731  * from a passed-in bref and uses the old chain's bref for everything else.
2732  *
2733  * Neither (parent) or (chain) can be errored.
2734  *
2735  * If (parent) is non-NULL then the new duplicated chain is inserted under
2736  * the parent.
2737  *
2738  * If (parent) is NULL then the newly duplicated chain is not inserted
2739  * anywhere, similar to if it had just been chain_alloc()'d (suitable for
2740  * passing into hammer2_chain_create() after this function returns).
2741  *
2742  * WARNING! This function calls create which means it can insert indirect
2743  *          blocks.  This can cause other unrelated chains in the parent to
2744  *          be moved to a newly inserted indirect block in addition to the
2745  *          specific chain.
2746  */
2747 void
2748 hammer2_chain_rename(hammer2_blockref_t *bref,
2749                      hammer2_chain_t **parentp, hammer2_chain_t *chain,
2750                      hammer2_tid_t mtid, int flags)
2751 {
2752         hammer2_dev_t *hmp;
2753         hammer2_chain_t *parent;
2754         size_t bytes;
2755
2756         /*
2757          * WARNING!  We should never resolve DATA to device buffers
2758          *           (XXX allow it if the caller did?), and since
2759          *           we currently do not have the logical buffer cache
2760          *           buffer in-hand to fix its cached physical offset
2761          *           we also force the modify code to not COW it. XXX
2762          */
2763         hmp = chain->hmp;
2764         KKASSERT(chain->parent == NULL);
2765         KKASSERT(chain->error == 0);
2766
2767         /*
2768          * Now create a duplicate of the chain structure, associating
2769          * it with the same core, making it the same size, pointing it
2770          * to the same bref (the same media block).
2771          */
2772         if (bref == NULL)
2773                 bref = &chain->bref;
2774         bytes = (hammer2_off_t)1 <<
2775                 (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
2776
2777         /*
2778          * If parent is not NULL the duplicated chain will be entered under
2779          * the parent and the UPDATE bit set to tell flush to update
2780          * the blockref.
2781          *
2782          * We must setflush(parent) to ensure that it recurses through to
2783          * chain.  setflush(chain) might not work because ONFLUSH is possibly
2784          * already set in the chain (so it won't recurse up to set it in the
2785          * parent).
2786          *
2787          * Having both chains locked is extremely important for atomicy.
2788          */
2789         if (parentp && (parent = *parentp) != NULL) {
2790                 KKASSERT(hammer2_mtx_owned(&parent->lock));
2791                 KKASSERT(parent->refs > 0);
2792                 KKASSERT(parent->error == 0);
2793
2794                 hammer2_chain_create(parentp, &chain, chain->pmp,
2795                                      bref->key, bref->keybits, bref->type,
2796                                      chain->bytes, mtid, 0, flags);
2797                 KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
2798                 hammer2_chain_setflush(*parentp);
2799         }
2800 }
2801
2802 /*
2803  * Helper function for deleting chains.
2804  *
2805  * The chain is removed from the live view (the RBTREE) as well as the parent's
2806  * blockmap.  Both chain and its parent must be locked.
2807  *
2808  * parent may not be errored.  chain can be errored.
2809  */
2810 static void
2811 _hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain,
2812                              hammer2_tid_t mtid, int flags)
2813 {
2814         hammer2_dev_t *hmp;
2815
2816         KKASSERT((chain->flags & (HAMMER2_CHAIN_DELETED |
2817                                   HAMMER2_CHAIN_FICTITIOUS)) == 0);
2818         KKASSERT(chain->parent == parent);
2819         hmp = chain->hmp;
2820
2821         if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
2822                 /*
2823                  * Chain is blockmapped, so there must be a parent.
2824                  * Atomically remove the chain from the parent and remove
2825                  * the blockmap entry.  The parent must be set modified
2826                  * to remove the blockmap entry.
2827                  */
2828                 hammer2_blockref_t *base;
2829                 int count;
2830
2831                 KKASSERT(parent != NULL);
2832                 KKASSERT(parent->error == 0);
2833                 KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
2834                 hammer2_chain_modify(parent, mtid, 0, HAMMER2_MODIFY_OPTDATA);
2835
2836                 /*
2837                  * Calculate blockmap pointer
2838                  */
2839                 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
2840                 hammer2_spin_ex(&parent->core.spin);
2841
2842                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
2843                 atomic_add_int(&parent->core.live_count, -1);
2844                 ++parent->core.generation;
2845                 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
2846                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
2847                 --parent->core.chain_count;
2848                 chain->parent = NULL;
2849
2850                 switch(parent->bref.type) {
2851                 case HAMMER2_BREF_TYPE_INODE:
2852                         /*
2853                          * Access the inode's block array.  However, there
2854                          * is no block array if the inode is flagged
2855                          * DIRECTDATA.  The DIRECTDATA case typicaly only
2856                          * occurs when a hardlink has been shifted up the
2857                          * tree and the original inode gets replaced with
2858                          * an OBJTYPE_HARDLINK placeholding inode.
2859                          */
2860                         if (parent->data &&
2861                             (parent->data->ipdata.meta.op_flags &
2862                              HAMMER2_OPFLAG_DIRECTDATA) == 0) {
2863                                 base =
2864                                    &parent->data->ipdata.u.blockset.blockref[0];
2865                         } else {
2866                                 base = NULL;
2867                         }
2868                         count = HAMMER2_SET_COUNT;
2869                         break;
2870                 case HAMMER2_BREF_TYPE_INDIRECT:
2871                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2872                         if (parent->data)
2873                                 base = &parent->data->npdata[0];
2874                         else
2875                                 base = NULL;
2876                         count = parent->bytes / sizeof(hammer2_blockref_t);
2877                         break;
2878                 case HAMMER2_BREF_TYPE_VOLUME:
2879                         base = &parent->data->voldata.
2880                                         sroot_blockset.blockref[0];
2881                         count = HAMMER2_SET_COUNT;
2882                         break;
2883                 case HAMMER2_BREF_TYPE_FREEMAP:
2884                         base = &parent->data->blkset.blockref[0];
2885                         count = HAMMER2_SET_COUNT;
2886                         break;
2887                 default:
2888                         base = NULL;
2889                         count = 0;
2890                         panic("hammer2_flush_pass2: "
2891                               "unrecognized blockref type: %d",
2892                               parent->bref.type);
2893                 }
2894
2895                 /*
2896                  * delete blockmapped chain from its parent.
2897                  *
2898                  * The parent is not affected by any statistics in chain
2899                  * which are pending synchronization.  That is, there is
2900                  * nothing to undo in the parent since they have not yet
2901                  * been incorporated into the parent.
2902                  *
2903                  * The parent is affected by statistics stored in inodes.
2904                  * Those have already been synchronized, so they must be
2905                  * undone.  XXX split update possible w/delete in middle?
2906                  */
2907                 if (base) {
2908                         int cache_index = -1;
2909                         hammer2_base_delete(parent, base, count,
2910                                             &cache_index, chain);
2911                 }
2912                 hammer2_spin_unex(&parent->core.spin);
2913         } else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
2914                 /*
2915                  * Chain is not blockmapped but a parent is present.
2916                  * Atomically remove the chain from the parent.  There is
2917                  * no blockmap entry to remove.
2918                  *
2919                  * Because chain was associated with a parent but not
2920                  * synchronized, the chain's *_count_up fields contain
2921                  * inode adjustment statistics which must be undone.
2922                  */
2923                 hammer2_spin_ex(&parent->core.spin);
2924                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
2925                 atomic_add_int(&parent->core.live_count, -1);
2926                 ++parent->core.generation;
2927                 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
2928                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
2929                 --parent->core.chain_count;
2930                 chain->parent = NULL;
2931                 hammer2_spin_unex(&parent->core.spin);
2932         } else {
2933                 /*
2934                  * Chain is not blockmapped and has no parent.  This
2935                  * is a degenerate case.
2936                  */
2937                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
2938         }
2939 }
2940
2941 /*
2942  * Create an indirect block that covers one or more of the elements in the
2943  * current parent.  Either returns the existing parent with no locking or
2944  * ref changes or returns the new indirect block locked and referenced
2945  * and leaving the original parent lock/ref intact as well.
2946  *
2947  * If an error occurs, NULL is returned and *errorp is set to the error.
2948  *
2949  * The returned chain depends on where the specified key falls.
2950  *
2951  * The key/keybits for the indirect mode only needs to follow three rules:
2952  *
2953  * (1) That all elements underneath it fit within its key space and
2954  *
2955  * (2) That all elements outside it are outside its key space.
2956  *
2957  * (3) When creating the new indirect block any elements in the current
2958  *     parent that fit within the new indirect block's keyspace must be
2959  *     moved into the new indirect block.
2960  *
2961  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
2962  *     keyspace the the current parent, but lookup/iteration rules will
2963  *     ensure (and must ensure) that rule (2) for all parents leading up
2964  *     to the nearest inode or the root volume header is adhered to.  This
2965  *     is accomplished by always recursing through matching keyspaces in
2966  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
2967  *
2968  * The current implementation calculates the current worst-case keyspace by
2969  * iterating the current parent and then divides it into two halves, choosing
2970  * whichever half has the most elements (not necessarily the half containing
2971  * the requested key).
2972  *
2973  * We can also opt to use the half with the least number of elements.  This
2974  * causes lower-numbered keys (aka logical file offsets) to recurse through
2975  * fewer indirect blocks and higher-numbered keys to recurse through more.
2976  * This also has the risk of not moving enough elements to the new indirect
2977  * block and being forced to create several indirect blocks before the element
2978  * can be inserted.
2979  *
2980  * Must be called with an exclusively locked parent.
2981  */
2982 static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
2983                                 hammer2_key_t *keyp, int keybits,
2984                                 hammer2_blockref_t *base, int count);
2985 static int hammer2_chain_indkey_normal(hammer2_chain_t *parent,
2986                                 hammer2_key_t *keyp, int keybits,
2987                                 hammer2_blockref_t *base, int count);
2988 static
2989 hammer2_chain_t *
2990 hammer2_chain_create_indirect(hammer2_chain_t *parent,
2991                               hammer2_key_t create_key, int create_bits,
2992                               hammer2_tid_t mtid, int for_type, int *errorp)
2993 {
2994         hammer2_dev_t *hmp;
2995         hammer2_blockref_t *base;
2996         hammer2_blockref_t *bref;
2997         hammer2_blockref_t bcopy;
2998         hammer2_chain_t *chain;
2999         hammer2_chain_t *ichain;
3000         hammer2_chain_t dummy;
3001         hammer2_key_t key = create_key;
3002         hammer2_key_t key_beg;
3003         hammer2_key_t key_end;
3004         hammer2_key_t key_next;
3005         int keybits = create_bits;
3006         int count;
3007         int nbytes;
3008         int cache_index;
3009         int loops;
3010         int reason;
3011         int generation;
3012         int maxloops = 300000;
3013
3014         /*
3015          * Calculate the base blockref pointer or NULL if the chain
3016          * is known to be empty.  We need to calculate the array count
3017          * for RB lookups either way.
3018          */
3019         hmp = parent->hmp;
3020         *errorp = 0;
3021         KKASSERT(hammer2_mtx_owned(&parent->lock));
3022
3023         /*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/
3024         base = hammer2_chain_base_and_count(parent, &count);
3025
3026         /*
3027          * dummy used in later chain allocation (no longer used for lookups).
3028          */
3029         bzero(&dummy, sizeof(dummy));
3030
3031         /*
3032          * When creating an indirect block for a freemap node or leaf
3033          * the key/keybits must be fitted to static radix levels because
3034          * particular radix levels use particular reserved blocks in the
3035          * related zone.
3036          *
3037          * This routine calculates the key/radix of the indirect block
3038          * we need to create, and whether it is on the high-side or the
3039          * low-side.
3040          */
3041         if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
3042             for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
3043                 keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
3044                                                        base, count);
3045         } else {
3046                 keybits = hammer2_chain_indkey_normal(parent, &key, keybits,
3047                                                       base, count);
3048         }
3049
3050         /*
3051          * Normalize the key for the radix being represented, keeping the
3052          * high bits and throwing away the low bits.
3053          */
3054         key &= ~(((hammer2_key_t)1 << keybits) - 1);
3055
3056         /*
3057          * How big should our new indirect block be?  It has to be at least
3058          * as large as its parent.
3059          *
3060          * The freemap uses a specific indirect block size.
3061          *
3062          * The first indirect block level down from an inode typically
3063          * uses LBUFSIZE (16384), else it uses PBUFSIZE (65536).
3064          */
3065         if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
3066             for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
3067                 nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE;
3068         } else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
3069                 nbytes = HAMMER2_IND_BYTES_MIN;
3070         } else {
3071                 nbytes = HAMMER2_IND_BYTES_MAX;
3072         }
3073         if (nbytes < count * sizeof(hammer2_blockref_t)) {
3074                 KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
3075                          for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF);
3076                 nbytes = count * sizeof(hammer2_blockref_t);
3077         }
3078
3079         /*
3080          * Ok, create our new indirect block
3081          */
3082         if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
3083             for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
3084                 dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
3085         } else {
3086                 dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
3087         }
3088         dummy.bref.key = key;
3089         dummy.bref.keybits = keybits;
3090         dummy.bref.data_off = hammer2_getradix(nbytes);
3091         dummy.bref.methods = parent->bref.methods;
3092
3093         ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy.bref);
3094         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
3095         hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
3096         /* ichain has one ref at this point */
3097
3098         /*
3099          * We have to mark it modified to allocate its block, but use
3100          * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
3101          * it won't be acted upon by the flush code.
3102          */
3103         hammer2_chain_modify(ichain, mtid, 0, HAMMER2_MODIFY_OPTDATA);
3104
3105         /*
3106          * Iterate the original parent and move the matching brefs into
3107          * the new indirect block.
3108          *
3109          * XXX handle flushes.
3110          */
3111         key_beg = 0;
3112         key_end = HAMMER2_KEY_MAX;
3113         cache_index = 0;
3114         hammer2_spin_ex(&parent->core.spin);
3115         loops = 0;
3116         reason = 0;
3117
3118         for (;;) {
3119                 /*
3120                  * Parent may have been modified, relocating its block array.
3121                  * Reload the base pointer.
3122                  */
3123                 base = hammer2_chain_base_and_count(parent, &count);
3124
3125                 if (++loops > 100000) {
3126                     hammer2_spin_unex(&parent->core.spin);
3127                     panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
3128                           reason, parent, base, count, key_next);
3129                 }
3130
3131                 /*
3132                  * NOTE: spinlock stays intact, returned chain (if not NULL)
3133                  *       is not referenced or locked which means that we
3134                  *       cannot safely check its flagged / deletion status
3135                  *       until we lock it.
3136                  */
3137                 chain = hammer2_combined_find(parent, base, count,
3138                                               &cache_index, &key_next,
3139                                               key_beg, key_end,
3140                                               &bref);
3141                 generation = parent->core.generation;
3142                 if (bref == NULL)
3143                         break;
3144                 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
3145
3146                 /*
3147                  * Skip keys that are not within the key/radix of the new
3148                  * indirect block.  They stay in the parent.
3149                  */
3150                 if ((~(((hammer2_key_t)1 << keybits) - 1) &
3151                     (key ^ bref->key)) != 0) {
3152                         goto next_key_spinlocked;
3153                 }
3154
3155                 /*
3156                  * Load the new indirect block by acquiring the related
3157                  * chains (potentially from media as it might not be
3158                  * in-memory).  Then move it to the new parent (ichain)
3159                  * via DELETE-DUPLICATE.
3160                  *
3161                  * chain is referenced but not locked.  We must lock the
3162                  * chain to obtain definitive DUPLICATED/DELETED state
3163                  */
3164                 if (chain) {
3165                         /*
3166                          * Use chain already present in the RBTREE
3167                          */
3168                         hammer2_chain_ref(chain);
3169                         hammer2_spin_unex(&parent->core.spin);
3170                         hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
3171                 } else {
3172                         /*
3173                          * Get chain for blockref element.  _get returns NULL
3174                          * on insertion race.
3175                          */
3176                         bcopy = *bref;
3177                         hammer2_spin_unex(&parent->core.spin);
3178                         chain = hammer2_chain_get(parent, generation, &bcopy);
3179                         if (chain == NULL) {
3180                                 reason = 1;
3181                                 hammer2_spin_ex(&parent->core.spin);
3182                                 continue;
3183                         }
3184                         if (bcmp(&bcopy, bref, sizeof(bcopy))) {
3185                                 kprintf("REASON 2\n");
3186                                 reason = 2;
3187                                 hammer2_chain_drop(chain);
3188                                 hammer2_spin_ex(&parent->core.spin);
3189                                 continue;
3190                         }
3191                         hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
3192                 }
3193
3194                 /*
3195                  * This is always live so if the chain has been deleted
3196                  * we raced someone and we have to retry.
3197                  *
3198                  * NOTE: Lookups can race delete-duplicate because
3199                  *       delete-duplicate does not lock the parent's core
3200                  *       (they just use the spinlock on the core).  We must
3201                  *       check for races by comparing the DUPLICATED flag before
3202                  *       releasing the spinlock with the flag after locking the
3203                  *       chain.
3204                  *
3205                  *       (note reversed logic for this one)
3206                  */
3207                 if (chain->flags & HAMMER2_CHAIN_DELETED) {
3208                         hammer2_chain_unlock(chain);
3209                         hammer2_chain_drop(chain);
3210                         goto next_key;
3211                 }
3212
3213                 /*
3214                  * Shift the chain to the indirect block.
3215                  *
3216                  * WARNING! No reason for us to load chain data, pass NOSTATS
3217                  *          to prevent delete/insert from trying to access
3218                  *          inode stats (and thus asserting if there is no
3219                  *          chain->data loaded).
3220                  *
3221                  * WARNING! The (parent, chain) deletion may modify the parent
3222                  *          and invalidate the base pointer.
3223                  */
3224                 hammer2_chain_delete(parent, chain, mtid, 0);
3225                 hammer2_chain_rename(NULL, &ichain, chain, mtid, 0);
3226                 hammer2_chain_unlock(chain);
3227                 hammer2_chain_drop(chain);
3228                 KKASSERT(parent->refs > 0);
3229                 chain = NULL;
3230                 base = NULL;    /* safety */
3231 next_key:
3232                 hammer2_spin_ex(&parent->core.spin);
3233 next_key_spinlocked:
3234                 if (--maxloops == 0)
3235                         panic("hammer2_chain_create_indirect: maxloops");
3236                 reason = 4;
3237                 if (key_next == 0 || key_next > key_end)
3238                         break;
3239                 key_beg = key_next;
3240                 /* loop */
3241         }
3242         hammer2_spin_unex(&parent->core.spin);
3243
3244         /*
3245          * Insert the new indirect block into the parent now that we've
3246          * cleared out some entries in the parent.  We calculated a good
3247          * insertion index in the loop above (ichain->index).
3248          *
3249          * We don't have to set UPDATE here because we mark ichain
3250          * modified down below (so the normal modified -> flush -> set-moved
3251          * sequence applies).
3252          *
3253          * The insertion shouldn't race as this is a completely new block
3254          * and the parent is locked.
3255          */
3256         base = NULL;    /* safety, parent modify may change address */
3257         KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
3258         hammer2_chain_insert(parent, ichain,
3259                              HAMMER2_CHAIN_INSERT_SPIN |
3260                              HAMMER2_CHAIN_INSERT_LIVE,
3261                              0);
3262
3263         /*
3264          * Make sure flushes propogate after our manual insertion.
3265          */
3266         hammer2_chain_setflush(ichain);
3267         hammer2_chain_setflush(parent);
3268
3269         /*
3270          * Figure out what to return.
3271          */
3272         if (~(((hammer2_key_t)1 << keybits) - 1) &
3273                    (create_key ^ key)) {
3274                 /*
3275                  * Key being created is outside the key range,
3276                  * return the original parent.
3277                  */
3278                 hammer2_chain_unlock(ichain);
3279                 hammer2_chain_drop(ichain);
3280         } else {
3281                 /*
3282                  * Otherwise its in the range, return the new parent.
3283                  * (leave both the new and old parent locked).
3284                  */
3285                 parent = ichain;
3286         }
3287
3288         return(parent);
3289 }
3290
3291 /*
3292  * Calculate the keybits and highside/lowside of the freemap node the
3293  * caller is creating.
3294  *
3295  * This routine will specify the next higher-level freemap key/radix
3296  * representing the lowest-ordered set.  By doing so, eventually all
3297  * low-ordered sets will be moved one level down.
3298  *
3299  * We have to be careful here because the freemap reserves a limited
3300  * number of blocks for a limited number of levels.  So we can't just
3301  * push indiscriminately.
3302  */
3303 int
3304 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
3305                              int keybits, hammer2_blockref_t *base, int count)
3306 {
3307         hammer2_chain_t *chain;
3308         hammer2_blockref_t *bref;
3309         hammer2_key_t key;
3310         hammer2_key_t key_beg;
3311         hammer2_key_t key_end;
3312         hammer2_key_t key_next;
3313         int cache_index;
3314         int locount;
3315         int hicount;
3316         int maxloops = 300000;
3317
3318         key = *keyp;
3319         locount = 0;
3320         hicount = 0;
3321         keybits = 64;
3322
3323         /*
3324          * Calculate the range of keys in the array being careful to skip
3325          * slots which are overridden with a deletion.
3326          */
3327         key_beg = 0;
3328         key_end = HAMMER2_KEY_MAX;
3329         cache_index = 0;
3330         hammer2_spin_ex(&parent->core.spin);
3331
3332         for (;;) {
3333                 if (--maxloops == 0) {
3334                         panic("indkey_freemap shit %p %p:%d\n",
3335                               parent, base, count);
3336                 }
3337                 chain = hammer2_combined_find(parent, base, count,
3338                                               &cache_index, &key_next,
3339                                               key_beg, key_end,
3340                                               &bref);
3341
3342                 /*
3343                  * Exhausted search
3344                  */
3345                 if (bref == NULL)
3346                         break;
3347
3348                 /*
3349                  * Skip deleted chains.
3350                  */
3351                 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
3352                         if (key_next == 0 || key_next > key_end)
3353                                 break;
3354                         key_beg = key_next;
3355                         continue;
3356                 }
3357
3358                 /*
3359                  * Use the full live (not deleted) element for the scan
3360                  * iteration.  HAMMER2 does not allow partial replacements.
3361                  *
3362                  * XXX should be built into hammer2_combined_find().
3363                  */
3364                 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
3365
3366                 if (keybits > bref->keybits) {
3367                         key = bref->key;
3368                         keybits = bref->keybits;
3369                 } else if (keybits == bref->keybits && bref->key < key) {
3370                         key = bref->key;
3371                 }
3372                 if (key_next == 0)
3373                         break;
3374                 key_beg = key_next;
3375         }
3376         hammer2_spin_unex(&parent->core.spin);
3377
3378         /*
3379          * Return the keybits for a higher-level FREEMAP_NODE covering
3380          * this node.
3381          */
3382         switch(keybits) {
3383         case HAMMER2_FREEMAP_LEVEL0_RADIX:
3384                 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
3385                 break;
3386         case HAMMER2_FREEMAP_LEVEL1_RADIX:
3387                 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
3388                 break;
3389         case HAMMER2_FREEMAP_LEVEL2_RADIX:
3390                 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
3391                 break;
3392         case HAMMER2_FREEMAP_LEVEL3_RADIX:
3393                 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
3394                 break;
3395         case HAMMER2_FREEMAP_LEVEL4_RADIX:
3396                 keybits = HAMMER2_FREEMAP_LEVEL5_RADIX;
3397                 break;
3398         case HAMMER2_FREEMAP_LEVEL5_RADIX:
3399                 panic("hammer2_chain_indkey_freemap: level too high");
3400                 break;
3401         default:
3402                 panic("hammer2_chain_indkey_freemap: bad radix");
3403                 break;
3404         }
3405         *keyp = key;
3406
3407         return (keybits);
3408 }
3409
3410 /*
3411  * Calculate the keybits and highside/lowside of the indirect block the
3412  * caller is creating.
3413  */
3414 static int
3415 hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp,
3416                             int keybits, hammer2_blockref_t *base, int count)
3417 {
3418         hammer2_blockref_t *bref;
3419         hammer2_chain_t *chain;
3420         hammer2_key_t key_beg;
3421         hammer2_key_t key_end;
3422         hammer2_key_t key_next;
3423         hammer2_key_t key;
3424         int nkeybits;
3425         int locount;
3426         int hicount;
3427         int cache_index;
3428         int maxloops = 300000;
3429
3430         key = *keyp;
3431         locount = 0;
3432         hicount = 0;
3433
3434         /*
3435          * Calculate the range of keys in the array being careful to skip
3436          * slots which are overridden with a deletion.  Once the scan
3437          * completes we will cut the key range in half and shift half the
3438          * range into the new indirect block.
3439          */
3440         key_beg = 0;
3441         key_end = HAMMER2_KEY_MAX;
3442         cache_index = 0;
3443         hammer2_spin_ex(&parent->core.spin);
3444
3445         for (;;) {
3446                 if (--maxloops == 0) {
3447                         panic("indkey_freemap shit %p %p:%d\n",
3448                               parent, base, count);
3449                 }
3450                 chain = hammer2_combined_find(parent, base, count,
3451                                               &cache_index, &key_next,
3452                                               key_beg, key_end,
3453                                               &bref);
3454
3455                 /*
3456                  * Exhausted search
3457                  */
3458                 if (bref == NULL)
3459                         break;
3460
3461                 /*
3462                  * NOTE: No need to check DUPLICATED here because we do
3463                  *       not release the spinlock.
3464                  */
3465                 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
3466                         if (key_next == 0 || key_next > key_end)
3467                                 break;
3468                         key_beg = key_next;
3469                         continue;
3470                 }
3471
3472                 /*
3473                  * Use the full live (not deleted) element for the scan
3474                  * iteration.  HAMMER2 does not allow partial replacements.
3475                  *
3476                  * XXX should be built into hammer2_combined_find().
3477                  */
3478                 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
3479
3480                 /*
3481                  * Expand our calculated key range (key, keybits) to fit
3482                  * the scanned key.  nkeybits represents the full range
3483                  * that we will later cut in half (two halves @ nkeybits - 1).
3484                  */
3485                 nkeybits = keybits;
3486                 if (nkeybits < bref->keybits) {
3487                         if (bref->keybits > 64) {
3488                                 kprintf("bad bref chain %p bref %p\n",
3489                                         chain, bref);
3490                                 Debugger("fubar");
3491                         }
3492                         nkeybits = bref->keybits;
3493                 }
3494                 while (nkeybits < 64 &&
3495                        (~(((hammer2_key_t)1 << nkeybits) - 1) &
3496                         (key ^ bref->key)) != 0) {
3497                         ++nkeybits;
3498                 }
3499
3500                 /*
3501                  * If the new key range is larger we have to determine
3502                  * which side of the new key range the existing keys fall
3503                  * under by checking the high bit, then collapsing the
3504                  * locount into the hicount or vise-versa.
3505                  */
3506                 if (keybits != nkeybits) {
3507                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
3508                                 hicount += locount;
3509                                 locount = 0;
3510                         } else {
3511                                 locount += hicount;
3512                                 hicount = 0;
3513                         }
3514                         keybits = nkeybits;
3515                 }
3516
3517                 /*
3518                  * The newly scanned key will be in the lower half or the
3519                  * upper half of the (new) key range.
3520                  */
3521                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
3522                         ++hicount;
3523                 else
3524                         ++locount;
3525
3526                 if (key_next == 0)
3527                         break;
3528                 key_beg = key_next;
3529         }
3530         hammer2_spin_unex(&parent->core.spin);
3531         bref = NULL;    /* now invalid (safety) */
3532
3533         /*
3534          * Adjust keybits to represent half of the full range calculated
3535          * above (radix 63 max)
3536          */
3537         --keybits;
3538
3539         /*
3540          * Select whichever half contains the most elements.  Theoretically
3541          * we can select either side as long as it contains at least one
3542          * element (in order to ensure that a free slot is present to hold
3543          * the indirect block).
3544          */
3545         if (hammer2_indirect_optimize) {
3546                 /*
3547                  * Insert node for least number of keys, this will arrange
3548                  * the first few blocks of a large file or the first few
3549                  * inodes in a directory with fewer indirect blocks when
3550                  * created linearly.
3551                  */
3552                 if (hicount < locount && hicount != 0)
3553                         key |= (hammer2_key_t)1 << keybits;
3554                 else
3555                         key &= ~(hammer2_key_t)1 << keybits;
3556         } else {
3557                 /*
3558                  * Insert node for most number of keys, best for heavily
3559                  * fragmented files.
3560                  */
3561                 if (hicount > locount)
3562                         key |= (hammer2_key_t)1 << keybits;
3563                 else
3564                         key &= ~(hammer2_key_t)1 << keybits;
3565         }
3566         *keyp = key;
3567
3568         return (keybits);
3569 }
3570
3571 /*
3572  * Sets CHAIN_DELETED and remove the chain's blockref from the parent if
3573  * it exists.
3574  *
3575  * Both parent and chain must be locked exclusively.
3576  *
3577  * This function will modify the parent if the blockref requires removal
3578  * from the parent's block table.
3579  *
3580  * This function is NOT recursive.  Any entity already pushed into the
3581  * chain (such as an inode) may still need visibility into its contents,
3582  * as well as the ability to read and modify the contents.  For example,
3583  * for an unlinked file which is still open.
3584  */
3585 void
3586 hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain,
3587                      hammer2_tid_t mtid, int flags)
3588 {
3589         KKASSERT(hammer2_mtx_owned(&chain->lock));
3590
3591         /*
3592          * Nothing to do if already marked.
3593          *
3594          * We need the spinlock on the core whos RBTREE contains chain
3595          * to protect against races.
3596          */
3597         if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
3598                 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
3599                          chain->parent == parent);
3600                 _hammer2_chain_delete_helper(parent, chain, mtid, flags);
3601         }
3602
3603         /*
3604          * Permanent deletions mark the chain as destroyed.  H
3605          */
3606         if (flags & HAMMER2_DELETE_PERMANENT) {
3607                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
3608         } else {
3609                 /* XXX might not be needed */
3610                 hammer2_chain_setflush(chain);
3611         }
3612 }
3613
3614 /*
3615  * Returns the index of the nearest element in the blockref array >= elm.
3616  * Returns (count) if no element could be found.
3617  *
3618  * Sets *key_nextp to the next key for loop purposes but does not modify
3619  * it if the next key would be higher than the current value of *key_nextp.
3620  * Note that *key_nexp can overflow to 0, which should be tested by the
3621  * caller.
3622  *
3623  * (*cache_indexp) is a heuristic and can be any value without effecting
3624  * the result.
3625  *
3626  * WARNING!  Must be called with parent's spinlock held.  Spinlock remains
3627  *           held through the operation.
3628  */
3629 static int
3630 hammer2_base_find(hammer2_chain_t *parent,
3631                   hammer2_blockref_t *base, int count,
3632                   int *cache_indexp, hammer2_key_t *key_nextp,
3633                   hammer2_key_t key_beg, hammer2_key_t key_end)
3634 {
3635         hammer2_blockref_t *scan;
3636         hammer2_key_t scan_end;
3637         int i;
3638         int limit;
3639
3640         /*
3641          * Require the live chain's already have their core's counted
3642          * so we can optimize operations.
3643          */
3644         KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS);
3645
3646         /*
3647          * Degenerate case
3648          */
3649         if (count == 0 || base == NULL)
3650                 return(count);
3651
3652         /*
3653          * Sequential optimization using *cache_indexp.  This is the most
3654          * likely scenario.
3655          *
3656          * We can avoid trailing empty entries on live chains, otherwise
3657          * we might have to check the whole block array.
3658          */
3659         i = *cache_indexp;
3660         cpu_ccfence();
3661         limit = parent->core.live_zero;
3662         if (i >= limit)
3663                 i = limit - 1;
3664         if (i < 0)
3665                 i = 0;
3666         KKASSERT(i < count);
3667
3668         /*
3669          * Search backwards
3670          */
3671         scan = &base[i];
3672         while (i > 0 && (scan->type == 0 || scan->key > key_beg)) {
3673                 --scan;
3674                 --i;
3675         }
3676         *cache_indexp = i;
3677
3678         /*
3679          * Search forwards, stop when we find a scan element which
3680          * encloses the key or until we know that there are no further
3681          * elements.
3682          */
3683         while (i < count) {
3684                 if (scan->type != 0) {
3685                         scan_end = scan->key +
3686                                    ((hammer2_key_t)1 << scan->keybits) - 1;
3687                         if (scan->key > key_beg || scan_end >= key_beg)
3688                                 break;
3689                 }
3690                 if (i >= limit)
3691                         return (count);
3692                 ++scan;
3693                 ++i;
3694         }
3695         if (i != count) {
3696                 *cache_indexp = i;
3697                 if (i >= limit) {
3698                         i = count;
3699                 } else {
3700                         scan_end = scan->key +
3701                                    ((hammer2_key_t)1 << scan->keybits);
3702                         if (scan_end && (*key_nextp > scan_end ||
3703                                          *key_nextp == 0)) {
3704                                 *key_nextp = scan_end;
3705                         }
3706                 }
3707         }
3708         return (i);
3709 }
3710
3711 /*
3712  * Do a combined search and return the next match either from the blockref
3713  * array or from the in-memory chain.  Sets *bresp to the returned bref in
3714  * both cases, or sets it to NULL if the search exhausted.  Only returns
3715  * a non-NULL chain if the search matched from the in-memory chain.
3716  *
3717  * When no in-memory chain has been found and a non-NULL bref is returned
3718  * in *bresp.
3719  *
3720  *
3721  * The returned chain is not locked or referenced.  Use the returned bref
3722  * to determine if the search exhausted or not.  Iterate if the base find
3723  * is chosen but matches a deleted chain.
3724  *
3725  * WARNING!  Must be called with parent's spinlock held.  Spinlock remains
3726  *           held through the operation.
3727  */
3728 static hammer2_chain_t *
3729 hammer2_combined_find(hammer2_chain_t *parent,
3730                       hammer2_blockref_t *base, int count,
3731                       int *cache_indexp, hammer2_key_t *key_nextp,
3732                       hammer2_key_t key_beg, hammer2_key_t key_end,
3733                       hammer2_blockref_t **bresp)
3734 {
3735         hammer2_blockref_t *bref;
3736         hammer2_chain_t *chain;
3737         int i;
3738
3739         /*
3740          * Lookup in block array and in rbtree.
3741          */
3742         *key_nextp = key_end + 1;
3743         i = hammer2_base_find(parent, base, count, cache_indexp,
3744                               key_nextp, key_beg, key_end);
3745         chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);
3746
3747         /*
3748          * Neither matched
3749          */
3750         if (i == count && chain == NULL) {
3751                 *bresp = NULL;
3752                 return(NULL);
3753         }
3754
3755         /*
3756          * Only chain matched.
3757          */
3758         if (i == count) {
3759                 bref = &chain->bref;
3760                 goto found;
3761         }
3762
3763         /*
3764          * Only blockref matched.
3765          */
3766         if (chain == NULL) {
3767                 bref = &base[i];
3768                 goto found;
3769         }
3770
3771         /*
3772          * Both in-memory and blockref matched, select the nearer element.
3773          *
3774          * If both are flush with the left-hand side or both are the
3775          * same distance away, select the chain.  In this situation the
3776          * chain must have been loaded from the matching blockmap.
3777          */
3778         if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
3779             chain->bref.key == base[i].key) {
3780                 KKASSERT(chain->bref.key == base[i].key);
3781                 bref = &chain->bref;
3782                 goto found;
3783         }
3784
3785         /*
3786          * Select the nearer key
3787          */
3788         if (chain->bref.key < base[i].key) {
3789                 bref = &chain->bref;
3790         } else {
3791                 bref = &base[i];
3792                 chain = NULL;
3793         }
3794
3795         /*
3796          * If the bref is out of bounds we've exhausted our search.
3797          */
3798 found:
3799         if (bref->key > key_end) {
3800                 *bresp = NULL;
3801                 chain = NULL;
3802         } else {
3803                 *bresp = bref;
3804         }
3805         return(chain);
3806 }
3807
3808 /*
3809  * Locate the specified block array element and delete it.  The element
3810  * must exist.
3811  *
3812  * The spin lock on the related chain must be held.
3813  *
3814  * NOTE: live_count was adjusted when the chain was deleted, so it does not
3815  *       need to be adjusted when we commit the media change.
3816  */
3817 void
3818 hammer2_base_delete(hammer2_chain_t *parent,
3819                     hammer2_blockref_t *base, int count,
3820                     int *cache_indexp, hammer2_chain_t *chain)
3821 {
3822         hammer2_blockref_t *elm = &chain->bref;
3823         hammer2_key_t key_next;
3824         int i;
3825
3826         /*
3827          * Delete element.  Expect the element to exist.
3828          *
3829          * XXX see caller, flush code not yet sophisticated enough to prevent
3830          *     re-flushed in some cases.
3831          */
3832         key_next = 0; /* max range */
3833         i = hammer2_base_find(parent, base, count, cache_indexp,
3834                               &key_next, elm->key, elm->key);
3835         if (i == count || base[i].type == 0 ||
3836             base[i].key != elm->key ||
3837             ((chain->flags & HAMMER2_CHAIN_BMAPUPD) == 0 &&
3838              base[i].keybits != elm->keybits)) {
3839                 hammer2_spin_unex(&parent->core.spin);
3840                 panic("delete base %p element not found at %d/%d elm %p\n",
3841                       base, i, count, elm);
3842                 return;
3843         }
3844
3845         /*
3846          * Update stats and zero the entry
3847          */
3848         parent->bref.data_count -= base[i].data_count;
3849         parent->bref.data_count -= (hammer2_off_t)1 <<
3850                         (int)(base[i].data_off & HAMMER2_OFF_MASK_RADIX);
3851         parent->bref.inode_count -= base[i].inode_count;
3852         if (base[i].type == HAMMER2_BREF_TYPE_INODE)
3853                 parent->bref.inode_count -= 1;
3854
3855         bzero(&base[i], sizeof(*base));
3856
3857         /*
3858          * We can only optimize parent->core.live_zero for live chains.
3859          */
3860         if (parent->core.live_zero == i + 1) {
3861                 while (--i >= 0 && base[i].type == 0)
3862                         ;
3863                 parent->core.live_zero = i + 1;
3864         }
3865
3866         /*
3867          * Clear appropriate blockmap flags in chain.
3868          */
3869         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED |
3870                                         HAMMER2_CHAIN_BMAPUPD);
3871 }
3872
3873 /*
3874  * Insert the specified element.  The block array must not already have the
3875  * element and must have space available for the insertion.
3876  *
3877  * The spin lock on the related chain must be held.
3878  *
3879  * NOTE: live_count was adjusted when the chain was deleted, so it does not
3880  *       need to be adjusted when we commit the media change.
3881  */
3882 void
3883 hammer2_base_insert(hammer2_chain_t *parent,
3884                     hammer2_blockref_t *base, int count,
3885                     int *cache_indexp, hammer2_chain_t *chain)
3886 {
3887         hammer2_blockref_t *elm = &chain->bref;
3888         hammer2_key_t key_next;
3889         hammer2_key_t xkey;
3890         int i;
3891         int j;
3892         int k;
3893         int l;
3894         int u = 1;
3895
3896         /*
3897          * Insert new element.  Expect the element to not already exist
3898          * unless we are replacing it.
3899          *
3900          * XXX see caller, flush code not yet sophisticated enough to prevent
3901          *     re-flushed in some cases.
3902          */
3903         key_next = 0; /* max range */
3904         i = hammer2_base_find(parent, base, count, cache_indexp,
3905                               &key_next, elm->key, elm->key);
3906
3907         /*
3908          * Shortcut fill optimization, typical ordered insertion(s) may not
3909          * require a search.
3910          */
3911         KKASSERT(i >= 0 && i <= count);
3912
3913         /*
3914          * Set appropriate blockmap flags in chain.
3915          */
3916         atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);
3917
3918         /*
3919          * Update stats and zero the entry
3920          */
3921         parent->bref.data_count += elm->data_count;
3922         parent->bref.data_count += (hammer2_off_t)1 <<
3923                         (int)(elm->data_off & HAMMER2_OFF_MASK_RADIX);
3924         parent->bref.inode_count += elm->inode_count;
3925         if (elm->type == HAMMER2_BREF_TYPE_INODE)
3926                 parent->bref.inode_count += 1;
3927
3928
3929         /*
3930          * We can only optimize parent->core.live_zero for live chains.
3931          */
3932         if (i == count && parent->core.live_zero < count) {
3933                 i = parent->core.live_zero++;
3934                 base[i] = *elm;
3935                 return;
3936         }
3937
3938         xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
3939         if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
3940                 hammer2_spin_unex(&parent->core.spin);
3941                 panic("insert base %p overlapping elements at %d elm %p\n",
3942                       base, i, elm);
3943         }
3944
3945         /*
3946          * Try to find an empty slot before or after.
3947          */
3948         j = i;
3949         k = i;
3950         while (j > 0 || k < count) {
3951                 --j;
3952                 if (j >= 0 && base[j].type == 0) {
3953                         if (j == i - 1) {
3954                                 base[j] = *elm;
3955                         } else {
3956                                 bcopy(&base[j+1], &base[j],
3957                                       (i - j - 1) * sizeof(*base));
3958                                 base[i - 1] = *elm;
3959                         }
3960                         goto validate;
3961                 }
3962                 ++k;
3963                 if (k < count && base[k].type == 0) {
3964                         bcopy(&base[i], &base[i+1],
3965                               (k - i) * sizeof(hammer2_blockref_t));
3966                         base[i] = *elm;
3967
3968                         /*
3969                          * We can only update parent->core.live_zero for live
3970                          * chains.
3971                          */
3972                         if (parent->core.live_zero <= k)
3973                                 parent->core.live_zero = k + 1;
3974                         u = 2;
3975                         goto validate;
3976                 }
3977         }
3978         panic("hammer2_base_insert: no room!");
3979
3980         /*
3981          * Debugging
3982          */
3983 validate:
3984         key_next = 0;
3985         for (l = 0; l < count; ++l) {
3986                 if (base[l].type) {
3987                         key_next = base[l].key +
3988                                    ((hammer2_key_t)1 << base[l].keybits) - 1;
3989                         break;
3990                 }
3991         }
3992         while (++l < count) {
3993                 if (base[l].type) {
3994                         if (base[l].key <= key_next)
3995                                 panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l);
3996                         key_next = base[l].key +
3997                                    ((hammer2_key_t)1 << base[l].keybits) - 1;
3998
3999                 }
4000         }
4001
4002 }
4003
4004 #if 0
4005
4006 /*
4007  * Sort the blockref array for the chain.  Used by the flush code to
4008  * sort the blockref[] array.
4009  *
4010  * The chain must be exclusively locked AND spin-locked.
4011  */
4012 typedef hammer2_blockref_t *hammer2_blockref_p;
4013
4014 static
4015 int
4016 hammer2_base_sort_callback(const void *v1, const void *v2)
4017 {
4018         hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1;
4019         hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2;
4020
4021         /*
4022          * Make sure empty elements are placed at the end of the array
4023          */
4024         if (bref1->type == 0) {
4025                 if (bref2->type == 0)
4026                         return(0);
4027                 return(1);
4028         } else if (bref2->type == 0) {
4029                 return(-1);
4030         }
4031
4032         /*
4033          * Sort by key
4034          */
4035         if (bref1->key < bref2->key)
4036                 return(-1);
4037         if (bref1->key > bref2->key)
4038                 return(1);
4039         return(0);
4040 }
4041
4042 void
4043 hammer2_base_sort(hammer2_chain_t *chain)
4044 {
4045         hammer2_blockref_t *base;
4046         int count;
4047
4048         switch(chain->bref.type) {
4049         case HAMMER2_BREF_TYPE_INODE:
4050                 /*
4051                  * Special shortcut for embedded data returns the inode
4052                  * itself.  Callers must detect this condition and access
4053                  * the embedded data (the strategy code does this for us).
4054                  *
4055                  * This is only applicable to regular files and softlinks.
4056                  */
4057                 if (chain->data->ipdata.meta.op_flags &
4058                     HAMMER2_OPFLAG_DIRECTDATA) {
4059                         return;
4060                 }
4061                 base = &chain->data->ipdata.u.blockset.blockref[0];
4062                 count = HAMMER2_SET_COUNT;
4063                 break;
4064         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
4065         case HAMMER2_BREF_TYPE_INDIRECT:
4066                 /*
4067                  * Optimize indirect blocks in the INITIAL state to avoid
4068                  * I/O.
4069                  */
4070                 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0);
4071                 base = &chain->data->npdata[0];
4072                 count = chain->bytes / sizeof(hammer2_blockref_t);
4073                 break;
4074         case HAMMER2_BREF_TYPE_VOLUME:
4075                 base = &chain->data->voldata.sroot_blockset.blockref[0];
4076                 count = HAMMER2_SET_COUNT;
4077                 break;
4078         case HAMMER2_BREF_TYPE_FREEMAP:
4079                 base = &chain->data->blkset.blockref[0];
4080                 count = HAMMER2_SET_COUNT;
4081                 break;
4082         default:
4083                 kprintf("hammer2_chain_lookup: unrecognized "
4084                         "blockref(A) type: %d",
4085                         chain->bref.type);
4086                 while (1)
4087                         tsleep(&base, 0, "dead", 0);
4088                 panic("hammer2_chain_lookup: unrecognized "
4089                       "blockref(A) type: %d",
4090                       chain->bref.type);
4091                 base = NULL;    /* safety */
4092                 count = 0;      /* safety */
4093         }
4094         kqsort(base, count, sizeof(*base), hammer2_base_sort_callback);
4095 }
4096
4097 #endif
4098
4099 /*
4100  * Chain memory management
4101  */
4102 void
4103 hammer2_chain_wait(hammer2_chain_t *chain)
4104 {
4105         tsleep(chain, 0, "chnflw", 1);
4106 }
4107
4108 const hammer2_media_data_t *
4109 hammer2_chain_rdata(hammer2_chain_t *chain)
4110 {
4111         KKASSERT(chain->data != NULL);
4112         return (chain->data);
4113 }
4114
4115 hammer2_media_data_t *
4116 hammer2_chain_wdata(hammer2_chain_t *chain)
4117 {
4118         KKASSERT(chain->data != NULL);
4119         return (chain->data);
4120 }
4121
4122 /*
4123  * Set the check data for a chain.  This can be a heavy-weight operation
4124  * and typically only runs on-flush.  For file data check data is calculated
4125  * when the logical buffers are flushed.
4126  */
4127 void
4128 hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata)
4129 {
4130         chain->bref.flags &= ~HAMMER2_BREF_FLAG_ZERO;
4131
4132         switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
4133         case HAMMER2_CHECK_NONE:
4134                 break;
4135         case HAMMER2_CHECK_DISABLED:
4136                 break;
4137         case HAMMER2_CHECK_ISCSI32:
4138                 chain->bref.check.iscsi32.value =
4139                         hammer2_icrc32(bdata, chain->bytes);
4140                 break;
4141         case HAMMER2_CHECK_CRC64:
4142                 chain->bref.check.crc64.value = 0;
4143                 /* XXX */
4144                 break;
4145         case HAMMER2_CHECK_SHA192:
4146                 {
4147                         SHA256_CTX hash_ctx;
4148                         union {
4149                                 uint8_t digest[SHA256_DIGEST_LENGTH];
4150                                 uint64_t digest64[SHA256_DIGEST_LENGTH/8];
4151                         } u;
4152
4153                         SHA256_Init(&hash_ctx);
4154                         SHA256_Update(&hash_ctx, bdata, chain->bytes);
4155                         SHA256_Final(u.digest, &hash_ctx);
4156                         u.digest64[2] ^= u.digest64[3];
4157                         bcopy(u.digest,
4158                               chain->bref.check.sha192.data,
4159                               sizeof(chain->bref.check.sha192.data));
4160                 }
4161                 break;
4162         case HAMMER2_CHECK_FREEMAP:
4163                 chain->bref.check.freemap.icrc32 =
4164                         hammer2_icrc32(bdata, chain->bytes);
4165                 break;
4166         default:
4167                 kprintf("hammer2_chain_setcheck: unknown check type %02x\n",
4168                         chain->bref.methods);
4169                 break;
4170         }
4171 }
4172
4173 int
4174 hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata)
4175 {
4176         int r;
4177
4178         if (chain->bref.flags & HAMMER2_BREF_FLAG_ZERO)
4179                 return 1;
4180
4181         switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
4182         case HAMMER2_CHECK_NONE:
4183                 r = 1;
4184                 break;
4185         case HAMMER2_CHECK_DISABLED:
4186                 r = 1;
4187                 break;
4188         case HAMMER2_CHECK_ISCSI32:
4189                 r = (chain->bref.check.iscsi32.value ==
4190                      hammer2_icrc32(bdata, chain->bytes));
4191                 break;
4192         case HAMMER2_CHECK_CRC64:
4193                 r = (chain->bref.check.crc64.value == 0);
4194                 /* XXX */
4195                 break;
4196         case HAMMER2_CHECK_SHA192:
4197                 {
4198                         SHA256_CTX hash_ctx;
4199                         union {
4200                                 uint8_t digest[SHA256_DIGEST_LENGTH];
4201                                 uint64_t digest64[SHA256_DIGEST_LENGTH/8];
4202                         } u;
4203
4204                         SHA256_Init(&hash_ctx);
4205                         SHA256_Update(&hash_ctx, bdata, chain->bytes);
4206                         SHA256_Final(u.digest, &hash_ctx);
4207                         u.digest64[2] ^= u.digest64[3];
4208                         if (bcmp(u.digest,
4209                                  chain->bref.check.sha192.data,
4210                                  sizeof(chain->bref.check.sha192.data)) == 0) {
4211                                 r = 1;
4212                         } else {
4213                                 r = 0;
4214                         }
4215                 }
4216                 break;
4217         case HAMMER2_CHECK_FREEMAP:
4218                 r = (chain->bref.check.freemap.icrc32 ==
4219                      hammer2_icrc32(bdata, chain->bytes));
4220                 if (r == 0) {
4221                         kprintf("freemap.icrc %08x icrc32 %08x (%d)\n",
4222                                 chain->bref.check.freemap.icrc32,
4223                                 hammer2_icrc32(bdata, chain->bytes), chain->bytes);
4224                         if (chain->dio)
4225                                 kprintf("dio %p buf %016jx,%d bdata %p/%p\n",
4226                                         chain->dio, chain->dio->bp->b_loffset, chain->dio->bp->b_bufsize, bdata, chain->dio->bp->b_data);
4227                 }
4228
4229                 break;
4230         default:
4231                 kprintf("hammer2_chain_setcheck: unknown check type %02x\n",
4232                         chain->bref.methods);
4233                 r = 1;
4234                 break;
4235         }
4236         return r;
4237 }
4238
4239 /*
4240  * The caller presents a shared-locked (parent, chain) where the chain
4241  * is of type HAMMER2_OBJTYPE_HARDLINK.  The caller must hold the ip
4242  * structure representing the inode locked to prevent
4243  * consolidation/deconsolidation races.
4244  *
4245  * The flags passed in are LOOKUP flags, not RESOLVE flags.  Only
4246  * HAMMER2_LOOKUP_SHARED is supported.
4247  *
4248  * We locate the hardlink in the current or a common parent directory.
4249  *
4250  * If we are unable to locate the hardlink, EIO is returned and
4251  * (*chainp) is unlocked and dropped.
4252  */
4253 int
4254 hammer2_chain_hardlink_find(hammer2_inode_t *dip,
4255                         hammer2_chain_t **parentp,
4256                         hammer2_chain_t **chainp,
4257                         int flags)
4258 {
4259         hammer2_chain_t *parent;
4260         hammer2_chain_t *rchain;
4261         hammer2_key_t key_dummy;
4262         hammer2_key_t lhc;
4263         int cache_index = -1;
4264         int resolve_flags;
4265
4266         resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ?
4267                         HAMMER2_RESOLVE_SHARED : 0;
4268
4269         /*
4270          * Obtain the key for the hardlink from *chainp.
4271          */
4272         rchain = *chainp;
4273         lhc = rchain->data->ipdata.meta.inum;
4274         hammer2_chain_unlock(rchain);
4275         hammer2_chain_drop(rchain);
4276         rchain = NULL;
4277
4278         for (;;) {
4279                 int nloops;
4280                 rchain = hammer2_chain_lookup(parentp, &key_dummy,
4281                                               lhc, lhc,
4282                                               &cache_index, flags);
4283                 if (rchain)
4284                         break;
4285
4286                 /*
4287                  * Iterate parents, handle parent rename races by retrying
4288                  * the operation.
4289                  */
4290                 nloops = -1;
4291                 while (nloops) {
4292                         --nloops;
4293                         parent = *parentp;
4294                         if (nloops < 0 &&
4295                             parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
4296                                 nloops = 1;
4297                         }
4298                         if (parent->bref.flags & HAMMER2_BREF_FLAG_PFSROOT)
4299                                 goto done;
4300                         if (parent->parent == NULL)
4301                                 goto done;
4302                         parent = parent->parent;
4303                         hammer2_chain_ref(parent);
4304                         hammer2_chain_unlock(*parentp);
4305                         hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
4306                                                    resolve_flags);
4307                         if ((*parentp)->parent == parent) {
4308                                 hammer2_chain_drop(*parentp);
4309                                 *parentp = parent;
4310                         } else {
4311                                 hammer2_chain_unlock(parent);
4312                                 hammer2_chain_drop(parent);
4313                                 hammer2_chain_lock(*parentp,
4314                                                    HAMMER2_RESOLVE_ALWAYS |
4315                                                    resolve_flags);
4316                                 parent = NULL;  /* safety */
4317                                 /* retry */
4318                         }
4319                 }
4320         }
4321 done:
4322
4323         *chainp = rchain;
4324         return (rchain ? EINVAL : 0);
4325 }
4326
4327 /*
4328  * Used by the bulkscan code to snapshot the synchronized storage for
4329  * a volume, allowing it to be scanned concurrently against normal
4330  * operation.
4331  */
4332 hammer2_chain_t *
4333 hammer2_chain_bulksnap(hammer2_chain_t *chain)
4334 {
4335         hammer2_chain_t *copy;
4336
4337         copy = hammer2_chain_alloc(chain->hmp, chain->pmp, &chain->bref);
4338         switch(chain->bref.type) {
4339         case HAMMER2_BREF_TYPE_VOLUME:
4340                 copy->data = kmalloc(sizeof(copy->data->voldata),
4341                                      chain->hmp->mchain,
4342                                      M_WAITOK | M_ZERO);
4343                 hammer2_spin_ex(&chain->core.spin);
4344                 copy->data->voldata = chain->data->voldata;
4345                 hammer2_spin_unex(&chain->core.spin);
4346                 break;
4347         case HAMMER2_BREF_TYPE_FREEMAP:
4348                 copy->data = kmalloc(sizeof(hammer2_blockset_t),
4349                                      chain->hmp->mchain,
4350                                      M_WAITOK | M_ZERO);
4351                 hammer2_spin_ex(&chain->core.spin);
4352                 copy->data->blkset = chain->data->blkset;
4353                 hammer2_spin_unex(&chain->core.spin);
4354                 break;
4355         default:
4356                 break;
4357         }
4358         return copy;
4359 }
4360
4361 void
4362 hammer2_chain_bulkdrop(hammer2_chain_t *copy)
4363 {
4364         switch(copy->bref.type) {
4365         case HAMMER2_BREF_TYPE_VOLUME:
4366         case HAMMER2_BREF_TYPE_FREEMAP:
4367                 KKASSERT(copy->data);
4368                 kfree(copy->data, copy->hmp->mchain);
4369                 copy->data = NULL;
4370         default:
4371                 break;
4372         }
4373         hammer2_chain_drop(copy);
4374 }
4375
4376 /*
4377  * Create a snapshot of the specified {parent, ochain} with the specified
4378  * label.  The originating hammer2_inode must be exclusively locked for
4379  * safety.
4380  *
4381  * The ioctl code has already synced the filesystem.
4382  */
4383 int
4384 hammer2_chain_snapshot(hammer2_chain_t *chain, hammer2_ioc_pfs_t *pmp,
4385                        hammer2_tid_t mtid)
4386 {
4387         hammer2_dev_t *hmp;
4388         const hammer2_inode_data_t *ripdata;
4389         hammer2_inode_data_t *wipdata;
4390         hammer2_chain_t *nchain;
4391         hammer2_inode_t *nip;
4392         size_t name_len;
4393         hammer2_key_t lhc;
4394         struct vattr vat;
4395 #if 0
4396         uuid_t opfs_clid;
4397 #endif
4398         int error;
4399
4400         kprintf("snapshot %s\n", pmp->name);
4401
4402         name_len = strlen(pmp->name);
4403         lhc = hammer2_dirhash(pmp->name, name_len);
4404
4405         /*
4406          * Get the clid
4407          */
4408         ripdata = &chain->data->ipdata;
4409 #if 0
4410         opfs_clid = ripdata->meta.pfs_clid;
4411 #endif
4412         hmp = chain->hmp;
4413
4414         /*
4415          * Create the snapshot directory under the super-root
4416          *
4417          * Set PFS type, generate a unique filesystem id, and generate
4418          * a cluster id.  Use the same clid when snapshotting a PFS root,
4419          * which theoretically allows the snapshot to be used as part of
4420          * the same cluster (perhaps as a cache).
4421          *
4422          * Copy the (flushed) blockref array.  Theoretically we could use
4423          * chain_duplicate() but it becomes difficult to disentangle
4424          * the shared core so for now just brute-force it.
4425          */
4426         VATTR_NULL(&vat);
4427         vat.va_type = VDIR;
4428         vat.va_mode = 0755;
4429         nip = hammer2_inode_create(hmp->spmp->iroot, &vat, proc0.p_ucred,
4430                                    pmp->name, name_len, 0,
4431                                    1, 0, 0,
4432                                    HAMMER2_INSERT_PFSROOT, &error);
4433
4434         if (nip) {
4435                 hammer2_inode_modify(nip);
4436                 nchain = hammer2_inode_chain(nip, 0, HAMMER2_RESOLVE_ALWAYS);
4437                 hammer2_chain_modify(nchain, mtid, 0, 0);
4438                 wipdata = &nchain->data->ipdata;
4439
4440                 nip->meta.pfs_type = HAMMER2_PFSTYPE_MASTER;
4441                 nip->meta.pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
4442                 nip->meta.op_flags |= HAMMER2_OPFLAG_PFSROOT;
4443                 kern_uuidgen(&nip->meta.pfs_fsid, 1);
4444
4445                 /*
4446                  * Give the snapshot its own private cluster id.  As a
4447                  * snapshot no further synchronization with the original
4448                  * cluster will be done.
4449                  */
4450 #if 0
4451                 if (chain->flags & HAMMER2_CHAIN_PFSBOUNDARY)
4452                         nip->meta.pfs_clid = opfs_clid;
4453                 else
4454                         kern_uuidgen(&nip->meta.pfs_clid, 1);
4455 #endif
4456                 kern_uuidgen(&nip->meta.pfs_clid, 1);
4457                 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
4458
4459                 /* XXX hack blockset copy */
4460                 /* XXX doesn't work with real cluster */
4461                 wipdata->meta = nip->meta;
4462                 wipdata->u.blockset = ripdata->u.blockset;
4463                 hammer2_flush(nchain, 1);
4464                 hammer2_chain_unlock(nchain);
4465                 hammer2_chain_drop(nchain);
4466                 hammer2_inode_unlock(nip);
4467         }
4468         return (error);
4469 }