hammer2 - Present hardlink solution, misc stability work
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem implements most of the core support functions for
37  * the hammer2_chain structure.
38  *
39  * Chains are the in-memory version on media objects (volume header, inodes,
40  * indirect blocks, data blocks, etc).  Chains represent a portion of the
41  * HAMMER2 topology.
42  *
43  * Chains are no-longer delete-duplicated.  Instead, the original in-memory
44  * chain will be moved along with its block reference (e.g. for things like
45  * renames, hardlink operations, modifications, etc), and will be indexed
46  * on a secondary list for flush handling instead of propagating a flag
47  * upward to the root.
48  *
49  * Concurrent front-end operations can still run against backend flushes
50  * as long as they do not cross the current flush boundary.  An operation
51  * running above the current flush (in areas not yet flushed) can become
52  * part of the current flush while ano peration running below the current
53  * flush can become part of the next flush.
54  */
55 #include <sys/cdefs.h>
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/types.h>
59 #include <sys/lock.h>
60 #include <sys/kern_syscall.h>
61 #include <sys/uuid.h>
62
63 #include <crypto/sha2/sha2.h>
64
65 #include "hammer2.h"
66
67 static int hammer2_indirect_optimize;   /* XXX SYSCTL */
68
69 static hammer2_chain_t *hammer2_chain_create_indirect(
70                 hammer2_chain_t *parent,
71                 hammer2_key_t key, int keybits,
72                 hammer2_tid_t mtid, int for_type, int *errorp);
73 static void hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop);
74 static hammer2_chain_t *hammer2_combined_find(
75                 hammer2_chain_t *parent,
76                 hammer2_blockref_t *base, int count,
77                 int *cache_indexp, hammer2_key_t *key_nextp,
78                 hammer2_key_t key_beg, hammer2_key_t key_end,
79                 hammer2_blockref_t **bresp);
80
81 /*
82  * Basic RBTree for chains (core->rbtree and core->dbtree).  Chains cannot
83  * overlap in the RB trees.  Deleted chains are moved from rbtree to either
84  * dbtree or to dbq.
85  *
86  * Chains in delete-duplicate sequences can always iterate through core_entry
87  * to locate the live version of the chain.
88  */
89 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
90
91 int
92 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
93 {
94         hammer2_key_t c1_beg;
95         hammer2_key_t c1_end;
96         hammer2_key_t c2_beg;
97         hammer2_key_t c2_end;
98
99         /*
100          * Compare chains.  Overlaps are not supposed to happen and catch
101          * any software issues early we count overlaps as a match.
102          */
103         c1_beg = chain1->bref.key;
104         c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1;
105         c2_beg = chain2->bref.key;
106         c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1;
107
108         if (c1_end < c2_beg)    /* fully to the left */
109                 return(-1);
110         if (c1_beg > c2_end)    /* fully to the right */
111                 return(1);
112         return(0);              /* overlap (must not cross edge boundary) */
113 }
114
115 static __inline
116 int
117 hammer2_isclusterable(hammer2_chain_t *chain)
118 {
119         if (hammer2_cluster_enable) {
120                 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
121                     chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
122                     chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
123                         return(1);
124                 }
125         }
126         return(0);
127 }
128
129 /*
130  * Make a chain visible to the flusher.  The flusher needs to be able to
131  * do flushes of subdirectory chains or single files so it does a top-down
132  * recursion using the ONFLUSH flag for the recursion.  It locates MODIFIED
133  * or UPDATE chains and flushes back up the chain to the volume root.
134  *
135  * This routine sets ONFLUSH upward until it hits the volume root.  For
136  * simplicity we ignore PFSROOT boundaries whos rules can be complex.
137  * Extra ONFLUSH flagging doesn't hurt the filesystem.
138  */
139 void
140 hammer2_chain_setflush(hammer2_chain_t *chain)
141 {
142         hammer2_chain_t *parent;
143
144         if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
145                 hammer2_spin_sh(&chain->core.spin);
146                 while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
147                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
148                         if ((parent = chain->parent) == NULL)
149                                 break;
150                         hammer2_spin_sh(&parent->core.spin);
151                         hammer2_spin_unsh(&chain->core.spin);
152                         chain = parent;
153                 }
154                 hammer2_spin_unsh(&chain->core.spin);
155         }
156 }
157
158 /*
159  * Allocate a new disconnected chain element representing the specified
160  * bref.  chain->refs is set to 1 and the passed bref is copied to
161  * chain->bref.  chain->bytes is derived from the bref.
162  *
163  * chain->pmp inherits pmp unless the chain is an inode (other than the
164  * super-root inode).
165  *
166  * NOTE: Returns a referenced but unlocked (because there is no core) chain.
167  */
168 hammer2_chain_t *
169 hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp,
170                     hammer2_blockref_t *bref)
171 {
172         hammer2_chain_t *chain;
173         u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
174
175         /*
176          * Construct the appropriate system structure.
177          */
178         switch(bref->type) {
179         case HAMMER2_BREF_TYPE_INODE:
180         case HAMMER2_BREF_TYPE_INDIRECT:
181         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
182         case HAMMER2_BREF_TYPE_DATA:
183         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
184                 /*
185                  * Chain's are really only associated with the hmp but we
186                  * maintain a pmp association for per-mount memory tracking
187                  * purposes.  The pmp can be NULL.
188                  */
189                 chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
190                 break;
191         case HAMMER2_BREF_TYPE_VOLUME:
192         case HAMMER2_BREF_TYPE_FREEMAP:
193                 /*
194                  * Only hammer2_chain_bulksnap() calls this function with these
195                  * types.
196                  */
197                 chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
198                 break;
199         default:
200                 chain = NULL;
201                 panic("hammer2_chain_alloc: unrecognized blockref type: %d",
202                       bref->type);
203         }
204
205         /*
206          * Initialize the new chain structure.  pmp must be set to NULL for
207          * chains belonging to the super-root topology of a device mount.
208          */
209         if (pmp == hmp->spmp)
210                 chain->pmp = NULL;
211         else
212                 chain->pmp = pmp;
213         chain->hmp = hmp;
214         chain->bref = *bref;
215         chain->bytes = bytes;
216         chain->refs = 1;
217         chain->flags = HAMMER2_CHAIN_ALLOCATED;
218
219         /*
220          * Set the PFS boundary flag if this chain represents a PFS root.
221          */
222         if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
223                 chain->flags |= HAMMER2_CHAIN_PFSBOUNDARY;
224         hammer2_chain_core_init(chain);
225
226         return (chain);
227 }
228
229 /*
230  * Initialize a chain's core structure.  This structure used to be allocated
231  * but is now embedded.
232  *
233  * The core is not locked.  No additional refs on the chain are made.
234  * (trans) must not be NULL if (core) is not NULL.
235  */
236 void
237 hammer2_chain_core_init(hammer2_chain_t *chain)
238 {
239         /*
240          * Fresh core under nchain (no multi-homing of ochain's
241          * sub-tree).
242          */
243         RB_INIT(&chain->core.rbtree);   /* live chains */
244         hammer2_mtx_init(&chain->lock, "h2chain");
245 }
246
247 /*
248  * Add a reference to a chain element, preventing its destruction.
249  *
250  * (can be called with spinlock held)
251  */
252 void
253 hammer2_chain_ref(hammer2_chain_t *chain)
254 {
255         atomic_add_int(&chain->refs, 1);
256 #if 0
257         kprintf("REFC %p %d %08x\n", chain, chain->refs - 1, chain->flags);
258         print_backtrace(8);
259 #endif
260 }
261
262 /*
263  * Insert the chain in the core rbtree.
264  *
265  * Normal insertions are placed in the live rbtree.  Insertion of a deleted
266  * chain is a special case used by the flush code that is placed on the
267  * unstaged deleted list to avoid confusing the live view.
268  */
269 #define HAMMER2_CHAIN_INSERT_SPIN       0x0001
270 #define HAMMER2_CHAIN_INSERT_LIVE       0x0002
271 #define HAMMER2_CHAIN_INSERT_RACE       0x0004
272
273 static
274 int
275 hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain,
276                      int flags, int generation)
277 {
278         hammer2_chain_t *xchain;
279         int error = 0;
280
281         if (flags & HAMMER2_CHAIN_INSERT_SPIN)
282                 hammer2_spin_ex(&parent->core.spin);
283
284         /*
285          * Interlocked by spinlock, check for race
286          */
287         if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
288             parent->core.generation != generation) {
289                 error = EAGAIN;
290                 goto failed;
291         }
292
293         /*
294          * Insert chain
295          */
296         xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain);
297         KASSERT(xchain == NULL,
298                 ("hammer2_chain_insert: collision %p %p", chain, xchain));
299         atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
300         chain->parent = parent;
301         ++parent->core.chain_count;
302         ++parent->core.generation;      /* XXX incs for _get() too, XXX */
303
304         /*
305          * We have to keep track of the effective live-view blockref count
306          * so the create code knows when to push an indirect block.
307          */
308         if (flags & HAMMER2_CHAIN_INSERT_LIVE)
309                 atomic_add_int(&parent->core.live_count, 1);
310 failed:
311         if (flags & HAMMER2_CHAIN_INSERT_SPIN)
312                 hammer2_spin_unex(&parent->core.spin);
313         return error;
314 }
315
316 /*
317  * Drop the caller's reference to the chain.  When the ref count drops to
318  * zero this function will try to disassociate the chain from its parent and
319  * deallocate it, then recursely drop the parent using the implied ref
320  * from the chain's chain->parent.
321  */
322 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain);
323
324 void
325 hammer2_chain_drop(hammer2_chain_t *chain)
326 {
327         u_int refs;
328
329         if (hammer2_debug & 0x200000)
330                 Debugger("drop");
331 #if 0
332         kprintf("DROP %p %d %08x\n", chain, chain->refs - 1, chain->flags);
333         print_backtrace(8);
334 #endif
335
336         KKASSERT(chain->refs > 0);
337
338         while (chain) {
339                 refs = chain->refs;
340                 cpu_ccfence();
341                 KKASSERT(refs > 0);
342
343                 if (refs == 1) {
344                         chain = hammer2_chain_lastdrop(chain);
345                 } else {
346                         if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
347                                 break;
348                         /* retry the same chain */
349                 }
350         }
351 }
352
353 /*
354  * Safe handling of the 1->0 transition on chain.  Returns a chain for
355  * recursive drop or NULL, possibly returning the same chain if the atomic
356  * op fails.
357  *
358  * Whem two chains need to be recursively dropped we use the chain
359  * we would otherwise free to placehold the additional chain.  It's a bit
360  * convoluted but we can't just recurse without potentially blowing out
361  * the kernel stack.
362  *
363  * The chain cannot be freed if it has any children.
364  * The chain cannot be freed if flagged MODIFIED unless we can dispose of that.
365  * The chain cannot be freed if flagged UPDATE unless we can dispose of that.
366  *
367  * The core spinlock is allowed nest child-to-parent (not parent-to-child).
368  */
369 static
370 hammer2_chain_t *
371 hammer2_chain_lastdrop(hammer2_chain_t *chain)
372 {
373         hammer2_pfs_t *pmp;
374         hammer2_dev_t *hmp;
375         hammer2_chain_t *parent;
376         hammer2_chain_t *rdrop;
377
378         /*
379          * Critical field access.
380          */
381         hammer2_spin_ex(&chain->core.spin);
382
383         if (chain->parent) {
384                 /*
385                  * If the chain has a parent the UPDATE bit prevents scrapping
386                  * as the chain is needed to properly flush the parent.  Try
387                  * to complete the 1->0 transition and return NULL.  Retry
388                  * (return chain) if we are unable to complete the 1->0
389                  * transition, else return NULL (nothing more to do).
390                  *
391                  * If the chain has a parent the MODIFIED bit prevents
392                  * scrapping.
393                  */
394                 if (chain->flags & (HAMMER2_CHAIN_UPDATE |
395                                     HAMMER2_CHAIN_MODIFIED)) {
396                         if (atomic_cmpset_int(&chain->refs, 1, 0)) {
397                                 hammer2_spin_unex(&chain->core.spin);
398                                 chain = NULL;
399                         } else {
400                                 hammer2_spin_unex(&chain->core.spin);
401                         }
402                         return (chain);
403                 }
404                 /* spinlock still held */
405         } else {
406                 /*
407                  * The chain has no parent and can be flagged for destruction.
408                  * Since it has no parent, UPDATE can also be cleared.
409                  */
410                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
411                 if (chain->flags & HAMMER2_CHAIN_UPDATE)
412                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
413
414                 /*
415                  * If the chain has children or if it has been MODIFIED and
416                  * also recorded for DEDUP, we must still flush the chain.
417                  *
418                  * In the case where it has children, the DESTROY flag test
419                  * in the flush code will prevent unnecessary flushes of
420                  * MODIFIED chains that are not flagged DEDUP so don't worry
421                  * about that here.
422                  */
423                 if (chain->core.chain_count ||
424                     (chain->flags & (HAMMER2_CHAIN_MODIFIED |
425                                      HAMMER2_CHAIN_DEDUP)) ==
426                     (HAMMER2_CHAIN_MODIFIED |
427                      HAMMER2_CHAIN_DEDUP)) {
428                         /*
429                          * Put on flushq (should ensure refs > 1), retry
430                          * the drop.
431                          */
432                         hammer2_spin_unex(&chain->core.spin);
433                         hammer2_delayed_flush(chain);
434                         return(chain);  /* retry drop */
435                 }
436
437                 /*
438                  * Otherwise we can scrap the MODIFIED bit if it is set,
439                  * and continue along the freeing path.
440                  */
441                 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
442                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
443                         atomic_add_long(&hammer2_count_modified_chains, -1);
444                 }
445                 /* spinlock still held */
446         }
447
448         /*
449          * If any children exist we must leave the chain intact with refs == 0.
450          * They exist because chains are retained below us which have refs or
451          * may require flushing.  This case can occur when parent != NULL.
452          *
453          * Retry (return chain) if we fail to transition the refs to 0, else
454          * return NULL indication nothing more to do.
455          */
456         if (chain->core.chain_count) {
457                 if (atomic_cmpset_int(&chain->refs, 1, 0)) {
458                         hammer2_spin_unex(&chain->core.spin);
459                         chain = NULL;
460                 } else {
461                         hammer2_spin_unex(&chain->core.spin);
462                 }
463                 return (chain);
464         }
465         /* spinlock still held */
466         /* no chains left under us */
467
468         /*
469          * chain->core has no children left so no accessors can get to our
470          * chain from there.  Now we have to lock the parent core to interlock
471          * remaining possible accessors that might bump chain's refs before
472          * we can safely drop chain's refs with intent to free the chain.
473          */
474         hmp = chain->hmp;
475         pmp = chain->pmp;       /* can be NULL */
476         rdrop = NULL;
477
478         /*
479          * Spinlock the parent and try to drop the last ref on chain.
480          * On success remove chain from its parent, otherwise return NULL.
481          *
482          * (normal core locks are top-down recursive but we define core
483          *  spinlocks as bottom-up recursive, so this is safe).
484          */
485         if ((parent = chain->parent) != NULL) {
486                 hammer2_spin_ex(&parent->core.spin);
487                 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
488                         /* 1->0 transition failed, retry */
489                         hammer2_spin_unex(&parent->core.spin);
490                         hammer2_spin_unex(&chain->core.spin);
491                         return(chain);
492                 }
493
494                 /*
495                  * 1->0 transition successful, remove chain from the
496                  * parent.
497                  */
498                 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
499                         RB_REMOVE(hammer2_chain_tree,
500                                   &parent->core.rbtree, chain);
501                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
502                         --parent->core.chain_count;
503                         chain->parent = NULL;
504                 }
505
506                 /*
507                  * If our chain was the last chain in the parent's core the
508                  * core is now empty and its parent might have to be
509                  * re-dropped if it has 0 refs.
510                  */
511                 if (parent->core.chain_count == 0) {
512                         rdrop = parent;
513                         atomic_add_int(&rdrop->refs, 1);
514                         /*
515                         if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0)
516                                 rdrop = NULL;
517                         */
518                 }
519                 hammer2_spin_unex(&parent->core.spin);
520                 parent = NULL;  /* safety */
521         }
522
523         /*
524          * Successful 1->0 transition and the chain can be destroyed now.
525          *
526          * We still have the core spinlock, and core's chain_count is 0.
527          * Any parent spinlock is gone.
528          */
529         hammer2_spin_unex(&chain->core.spin);
530         KKASSERT(RB_EMPTY(&chain->core.rbtree) &&
531                  chain->core.chain_count == 0);
532
533         /*
534          * All spin locks are gone, no pointers remain to the chain, finish
535          * freeing it.
536          */
537         KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE |
538                                   HAMMER2_CHAIN_MODIFIED)) == 0);
539         hammer2_chain_drop_data(chain, 1);
540
541         KKASSERT(chain->dio == NULL);
542
543         /*
544          * Once chain resources are gone we can use the now dead chain
545          * structure to placehold what might otherwise require a recursive
546          * drop, because we have potentially two things to drop and can only
547          * return one directly.
548          */
549         if (chain->flags & HAMMER2_CHAIN_ALLOCATED) {
550                 chain->flags &= ~HAMMER2_CHAIN_ALLOCATED;
551                 chain->hmp = NULL;
552                 kfree(chain, hmp->mchain);
553         }
554
555         /*
556          * Possible chaining loop when parent re-drop needed.
557          */
558         return(rdrop);
559 }
560
561 /*
562  * On either last lock release or last drop
563  */
564 static void
565 hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop)
566 {
567         /*hammer2_dev_t *hmp = chain->hmp;*/
568
569         switch(chain->bref.type) {
570         case HAMMER2_BREF_TYPE_VOLUME:
571         case HAMMER2_BREF_TYPE_FREEMAP:
572                 if (lastdrop)
573                         chain->data = NULL;
574                 break;
575         default:
576                 KKASSERT(chain->data == NULL);
577                 break;
578         }
579 }
580
581 /*
582  * Lock a referenced chain element, acquiring its data with I/O if necessary,
583  * and specify how you would like the data to be resolved.
584  *
585  * If an I/O or other fatal error occurs, chain->error will be set to non-zero.
586  *
587  * The lock is allowed to recurse, multiple locking ops will aggregate
588  * the requested resolve types.  Once data is assigned it will not be
589  * removed until the last unlock.
590  *
591  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
592  *                         (typically used to avoid device/logical buffer
593  *                          aliasing for data)
594  *
595  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
596  *                         the INITIAL-create state (indirect blocks only).
597  *
598  *                         Do not resolve data elements for DATA chains.
599  *                         (typically used to avoid device/logical buffer
600  *                          aliasing for data)
601  *
602  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
603  *
604  * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
605  *                         it will be locked exclusive.
606  *
607  * NOTE: Embedded elements (volume header, inodes) are always resolved
608  *       regardless.
609  *
610  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
611  *       element will instantiate and zero its buffer, and flush it on
612  *       release.
613  *
614  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
615  *       so as not to instantiate a device buffer, which could alias against
616  *       a logical file buffer.  However, if ALWAYS is specified the
617  *       device buffer will be instantiated anyway.
618  *
619  * WARNING! This function blocks on I/O if data needs to be fetched.  This
620  *          blocking can run concurrent with other compatible lock holders
621  *          who do not need data returning.  The lock is not upgraded to
622  *          exclusive during a data fetch, a separate bit is used to
623  *          interlock I/O.  However, an exclusive lock holder can still count
624  *          on being interlocked against an I/O fetch managed by a shared
625  *          lock holder.
626  */
627 void
628 hammer2_chain_lock(hammer2_chain_t *chain, int how)
629 {
630         /*
631          * Ref and lock the element.  Recursive locks are allowed.
632          */
633         KKASSERT(chain->refs > 0);
634         atomic_add_int(&chain->lockcnt, 1);
635
636         /*
637          * Get the appropriate lock.
638          */
639         if (how & HAMMER2_RESOLVE_SHARED)
640                 hammer2_mtx_sh(&chain->lock);
641         else
642                 hammer2_mtx_ex(&chain->lock);
643         ++curthread->td_tracker;
644
645         /*
646          * If we already have a valid data pointer no further action is
647          * necessary.
648          */
649         if (chain->data)
650                 return;
651
652         /*
653          * Do we have to resolve the data?
654          */
655         switch(how & HAMMER2_RESOLVE_MASK) {
656         case HAMMER2_RESOLVE_NEVER:
657                 return;
658         case HAMMER2_RESOLVE_MAYBE:
659                 if (chain->flags & HAMMER2_CHAIN_INITIAL)
660                         return;
661                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
662                         return;
663 #if 0
664                 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
665                         return;
666                 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
667                         return;
668 #endif
669                 /* fall through */
670         case HAMMER2_RESOLVE_ALWAYS:
671         default:
672                 break;
673         }
674
675         /*
676          * Caller requires data
677          */
678         hammer2_chain_load_data(chain);
679 }
680
681 /*
682  * Downgrade an exclusive chain lock to a shared chain lock.
683  *
684  * NOTE: There is no upgrade equivalent due to the ease of
685  *       deadlocks in that direction.
686  */
687 void
688 hammer2_chain_lock_downgrade(hammer2_chain_t *chain)
689 {
690         hammer2_mtx_downgrade(&chain->lock);
691 }
692
693 /*
694  * Obtains a second shared lock on the chain, does not account the second
695  * shared lock as being owned by the current thread.
696  *
697  * Caller must already own a shared lock on this chain.
698  *
699  * The lock function is required to obtain the second shared lock without
700  * blocking on pending exclusive requests.
701  */
702 void
703 hammer2_chain_push_shared_lock(hammer2_chain_t *chain)
704 {
705         hammer2_mtx_sh_again(&chain->lock);
706         atomic_add_int(&chain->lockcnt, 1);
707         /* do not count in td_tracker for this thread */
708 }
709
710 /*
711  * Accounts for a shared lock that was pushed to us as being owned by our
712  * thread.
713  */
714 void
715 hammer2_chain_pull_shared_lock(hammer2_chain_t *chain)
716 {
717         ++curthread->td_tracker;
718 }
719
720 /*
721  * Issue I/O and install chain->data.  Caller must hold a chain lock, lock
722  * may be of any type.
723  *
724  * Once chain->data is set it cannot be disposed of until all locks are
725  * released.
726  */
727 void
728 hammer2_chain_load_data(hammer2_chain_t *chain)
729 {
730         hammer2_blockref_t *bref;
731         hammer2_dev_t *hmp;
732         char *bdata;
733         int error;
734
735         /*
736          * Degenerate case, data already present.
737          */
738         if (chain->data)
739                 return;
740
741         hmp = chain->hmp;
742         KKASSERT(hmp != NULL);
743
744         /*
745          * Gain the IOINPROG bit, interlocked block.
746          */
747         for (;;) {
748                 u_int oflags;
749                 u_int nflags;
750
751                 oflags = chain->flags;
752                 cpu_ccfence();
753                 if (oflags & HAMMER2_CHAIN_IOINPROG) {
754                         nflags = oflags | HAMMER2_CHAIN_IOSIGNAL;
755                         tsleep_interlock(&chain->flags, 0);
756                         if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
757                                 tsleep(&chain->flags, PINTERLOCKED,
758                                         "h2iocw", 0);
759                         }
760                         /* retry */
761                 } else {
762                         nflags = oflags | HAMMER2_CHAIN_IOINPROG;
763                         if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
764                                 break;
765                         }
766                         /* retry */
767                 }
768         }
769
770         /*
771          * We own CHAIN_IOINPROG
772          *
773          * Degenerate case if we raced another load.
774          */
775         if (chain->data)
776                 goto done;
777
778         /*
779          * We must resolve to a device buffer, either by issuing I/O or
780          * by creating a zero-fill element.  We do not mark the buffer
781          * dirty when creating a zero-fill element (the hammer2_chain_modify()
782          * API must still be used to do that).
783          *
784          * The device buffer is variable-sized in powers of 2 down
785          * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
786          * chunk always contains buffers of the same size. (XXX)
787          *
788          * The minimum physical IO size may be larger than the variable
789          * block size.
790          */
791         bref = &chain->bref;
792
793         /*
794          * The getblk() optimization can only be used on newly created
795          * elements if the physical block size matches the request.
796          */
797         if (chain->flags & HAMMER2_CHAIN_INITIAL) {
798                 error = hammer2_io_new(hmp, bref->data_off, chain->bytes,
799                                         &chain->dio);
800         } else {
801                 error = hammer2_io_bread(hmp, bref->data_off, chain->bytes,
802                                          &chain->dio);
803                 hammer2_adjreadcounter(&chain->bref, chain->bytes);
804         }
805         if (error) {
806                 chain->error = HAMMER2_ERROR_IO;
807                 kprintf("hammer2_chain_lock: I/O error %016jx: %d\n",
808                         (intmax_t)bref->data_off, error);
809                 hammer2_io_bqrelse(&chain->dio);
810                 goto done;
811         }
812         chain->error = 0;
813
814         /*
815          * NOTE: A locked chain's data cannot be modified without first
816          *       calling hammer2_chain_modify().
817          */
818
819         /*
820          * Clear INITIAL.  In this case we used io_new() and the buffer has
821          * been zero'd and marked dirty.
822          */
823         bdata = hammer2_io_data(chain->dio, chain->bref.data_off);
824         if (chain->flags & HAMMER2_CHAIN_INITIAL) {
825                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
826                 chain->bref.flags |= HAMMER2_BREF_FLAG_ZERO;
827         } else if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
828                 /*
829                  * check data not currently synchronized due to
830                  * modification.  XXX assumes data stays in the buffer
831                  * cache, which might not be true (need biodep on flush
832                  * to calculate crc?  or simple crc?).
833                  */
834         } else {
835                 if (hammer2_chain_testcheck(chain, bdata) == 0) {
836                         kprintf("chain %016jx.%02x meth=%02x "
837                                 "CHECK FAIL %08x (flags=%08x)\n",
838                                 chain->bref.data_off,
839                                 chain->bref.type,
840                                 chain->bref.methods,
841                                 hammer2_icrc32(bdata, chain->bytes),
842                                 chain->flags);
843                         chain->error = HAMMER2_ERROR_CHECK;
844                 }
845         }
846
847         /*
848          * Setup the data pointer, either pointing it to an embedded data
849          * structure and copying the data from the buffer, or pointing it
850          * into the buffer.
851          *
852          * The buffer is not retained when copying to an embedded data
853          * structure in order to avoid potential deadlocks or recursions
854          * on the same physical buffer.
855          *
856          * WARNING! Other threads can start using the data the instant we
857          *          set chain->data non-NULL.
858          */
859         switch (bref->type) {
860         case HAMMER2_BREF_TYPE_VOLUME:
861         case HAMMER2_BREF_TYPE_FREEMAP:
862                 /*
863                  * Copy data from bp to embedded buffer
864                  */
865                 panic("hammer2_chain_lock: called on unresolved volume header");
866                 break;
867         case HAMMER2_BREF_TYPE_INODE:
868         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
869         case HAMMER2_BREF_TYPE_INDIRECT:
870         case HAMMER2_BREF_TYPE_DATA:
871         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
872         default:
873                 /*
874                  * Point data at the device buffer and leave dio intact.
875                  */
876                 chain->data = (void *)bdata;
877                 break;
878         }
879
880         /*
881          * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested.
882          */
883 done:
884         for (;;) {
885                 u_int oflags;
886                 u_int nflags;
887
888                 oflags = chain->flags;
889                 nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG |
890                                     HAMMER2_CHAIN_IOSIGNAL);
891                 KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG);
892                 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
893                         if (oflags & HAMMER2_CHAIN_IOSIGNAL)
894                                 wakeup(&chain->flags);
895                         break;
896                 }
897         }
898 }
899
900 /*
901  * Unlock and deref a chain element.
902  *
903  * On the last lock release any non-embedded data (chain->dio) will be
904  * retired.
905  */
906 void
907 hammer2_chain_unlock(hammer2_chain_t *chain)
908 {
909         hammer2_mtx_state_t ostate;
910         long *counterp;
911         u_int lockcnt;
912
913         --curthread->td_tracker;
914         /*
915          * If multiple locks are present (or being attempted) on this
916          * particular chain we can just unlock, drop refs, and return.
917          *
918          * Otherwise fall-through on the 1->0 transition.
919          */
920         for (;;) {
921                 lockcnt = chain->lockcnt;
922                 KKASSERT(lockcnt > 0);
923                 cpu_ccfence();
924                 if (lockcnt > 1) {
925                         if (atomic_cmpset_int(&chain->lockcnt,
926                                               lockcnt, lockcnt - 1)) {
927                                 hammer2_mtx_unlock(&chain->lock);
928                                 return;
929                         }
930                 } else {
931                         if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
932                                 break;
933                 }
934                 /* retry */
935         }
936
937         /*
938          * On the 1->0 transition we upgrade the core lock (if necessary)
939          * to exclusive for terminal processing.  If after upgrading we find
940          * that lockcnt is non-zero, another thread is racing us and will
941          * handle the unload for us later on, so just cleanup and return
942          * leaving the data/io intact
943          *
944          * Otherwise if lockcnt is still 0 it is possible for it to become
945          * non-zero and race, but since we hold the core->lock exclusively
946          * all that will happen is that the chain will be reloaded after we
947          * unload it.
948          */
949         ostate = hammer2_mtx_upgrade(&chain->lock);
950         if (chain->lockcnt) {
951                 hammer2_mtx_unlock(&chain->lock);
952                 return;
953         }
954
955         /*
956          * Shortcut the case if the data is embedded or not resolved.
957          * Only drop non-DIO-based data if the chain is not modified.
958          *
959          * Do NOT NULL out chain->data (e.g. inode data), it might be
960          * dirty.
961          */
962         if (chain->dio == NULL) {
963                 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
964                         hammer2_chain_drop_data(chain, 0);
965                 hammer2_mtx_unlock(&chain->lock);
966                 return;
967         }
968
969         /*
970          * Statistics
971          */
972         if (hammer2_io_isdirty(chain->dio)) {
973                 switch(chain->bref.type) {
974                 case HAMMER2_BREF_TYPE_DATA:
975                         counterp = &hammer2_iod_file_write;
976                         break;
977                 case HAMMER2_BREF_TYPE_INODE:
978                         counterp = &hammer2_iod_meta_write;
979                         break;
980                 case HAMMER2_BREF_TYPE_INDIRECT:
981                         counterp = &hammer2_iod_indr_write;
982                         break;
983                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
984                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
985                         counterp = &hammer2_iod_fmap_write;
986                         break;
987                 default:
988                         counterp = &hammer2_iod_volu_write;
989                         break;
990                 }
991                 *counterp += chain->bytes;
992         }
993
994         /*
995          * Clean out the dio.
996          *
997          * NOTE: Freemap leaf's use reserved blocks and thus no aliasing
998          *       is possible.
999          */
1000         chain->data = NULL;
1001         hammer2_io_bqrelse(&chain->dio);
1002         hammer2_mtx_unlock(&chain->lock);
1003 }
1004
1005 /*
1006  * Helper to obtain the blockref[] array base and count for a chain.
1007  *
1008  * XXX Not widely used yet, various use cases need to be validated and
1009  *     converted to use this function.
1010  */
1011 static
1012 hammer2_blockref_t *
1013 hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp)
1014 {
1015         hammer2_blockref_t *base;
1016         int count;
1017
1018         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1019                 base = NULL;
1020
1021                 switch(parent->bref.type) {
1022                 case HAMMER2_BREF_TYPE_INODE:
1023                         count = HAMMER2_SET_COUNT;
1024                         break;
1025                 case HAMMER2_BREF_TYPE_INDIRECT:
1026                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1027                         count = parent->bytes / sizeof(hammer2_blockref_t);
1028                         break;
1029                 case HAMMER2_BREF_TYPE_VOLUME:
1030                         count = HAMMER2_SET_COUNT;
1031                         break;
1032                 case HAMMER2_BREF_TYPE_FREEMAP:
1033                         count = HAMMER2_SET_COUNT;
1034                         break;
1035                 default:
1036                         panic("hammer2_chain_create_indirect: "
1037                               "unrecognized blockref type: %d",
1038                               parent->bref.type);
1039                         count = 0;
1040                         break;
1041                 }
1042         } else {
1043                 switch(parent->bref.type) {
1044                 case HAMMER2_BREF_TYPE_INODE:
1045                         base = &parent->data->ipdata.u.blockset.blockref[0];
1046                         count = HAMMER2_SET_COUNT;
1047                         break;
1048                 case HAMMER2_BREF_TYPE_INDIRECT:
1049                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1050                         base = &parent->data->npdata[0];
1051                         count = parent->bytes / sizeof(hammer2_blockref_t);
1052                         break;
1053                 case HAMMER2_BREF_TYPE_VOLUME:
1054                         base = &parent->data->voldata.
1055                                         sroot_blockset.blockref[0];
1056                         count = HAMMER2_SET_COUNT;
1057                         break;
1058                 case HAMMER2_BREF_TYPE_FREEMAP:
1059                         base = &parent->data->blkset.blockref[0];
1060                         count = HAMMER2_SET_COUNT;
1061                         break;
1062                 default:
1063                         panic("hammer2_chain_create_indirect: "
1064                               "unrecognized blockref type: %d",
1065                               parent->bref.type);
1066                         count = 0;
1067                         break;
1068                 }
1069         }
1070         *countp = count;
1071
1072         return base;
1073 }
1074
1075 /*
1076  * This counts the number of live blockrefs in a block array and
1077  * also calculates the point at which all remaining blockrefs are empty.
1078  * This routine can only be called on a live chain (DUPLICATED flag not set).
1079  *
1080  * NOTE: Flag is not set until after the count is complete, allowing
1081  *       callers to test the flag without holding the spinlock.
1082  *
1083  * NOTE: If base is NULL the related chain is still in the INITIAL
1084  *       state and there are no blockrefs to count.
1085  *
1086  * NOTE: live_count may already have some counts accumulated due to
1087  *       creation and deletion and could even be initially negative.
1088  */
1089 void
1090 hammer2_chain_countbrefs(hammer2_chain_t *chain,
1091                          hammer2_blockref_t *base, int count)
1092 {
1093         hammer2_spin_ex(&chain->core.spin);
1094         if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) {
1095                 if (base) {
1096                         while (--count >= 0) {
1097                                 if (base[count].type)
1098                                         break;
1099                         }
1100                         chain->core.live_zero = count + 1;
1101                         while (count >= 0) {
1102                                 if (base[count].type)
1103                                         atomic_add_int(&chain->core.live_count,
1104                                                        1);
1105                                 --count;
1106                         }
1107                 } else {
1108                         chain->core.live_zero = 0;
1109                 }
1110                 /* else do not modify live_count */
1111                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS);
1112         }
1113         hammer2_spin_unex(&chain->core.spin);
1114 }
1115
1116 /*
1117  * Resize the chain's physical storage allocation in-place.  This function does
1118  * not adjust the data pointer and must be followed by (typically) a
1119  * hammer2_chain_modify() call to copy any old data over and adjust the
1120  * data pointer.
1121  *
1122  * Chains can be resized smaller without reallocating the storage.  Resizing
1123  * larger will reallocate the storage.  Excess or prior storage is reclaimed
1124  * asynchronously at a later time.
1125  *
1126  * Must be passed an exclusively locked parent and chain.
1127  *
1128  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
1129  * to avoid instantiating a device buffer that conflicts with the vnode data
1130  * buffer.  However, because H2 can compress or encrypt data, the chain may
1131  * have a dio assigned to it in those situations, and they do not conflict.
1132  *
1133  * XXX return error if cannot resize.
1134  */
1135 void
1136 hammer2_chain_resize(hammer2_inode_t *ip,
1137                      hammer2_chain_t *parent, hammer2_chain_t *chain,
1138                      hammer2_tid_t mtid, hammer2_off_t dedup_off,
1139                      int nradix, int flags)
1140 {
1141         hammer2_dev_t *hmp;
1142         size_t obytes;
1143         size_t nbytes;
1144
1145         hmp = chain->hmp;
1146
1147         /*
1148          * Only data and indirect blocks can be resized for now.
1149          * (The volu root, inodes, and freemap elements use a fixed size).
1150          */
1151         KKASSERT(chain != &hmp->vchain);
1152         KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
1153                  chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
1154         KKASSERT(chain->parent == parent);
1155
1156         /*
1157          * Nothing to do if the element is already the proper size
1158          */
1159         obytes = chain->bytes;
1160         nbytes = 1U << nradix;
1161         if (obytes == nbytes)
1162                 return;
1163
1164         /*
1165          * Make sure the old data is instantiated so we can copy it.  If this
1166          * is a data block, the device data may be superfluous since the data
1167          * might be in a logical block, but compressed or encrypted data is
1168          * another matter.
1169          *
1170          * NOTE: The modify will set BMAPUPD for us if BMAPPED is set.
1171          */
1172         hammer2_chain_modify(chain, mtid, dedup_off, 0);
1173
1174         /*
1175          * Relocate the block, even if making it smaller (because different
1176          * block sizes may be in different regions).
1177          *
1178          * (data blocks only, we aren't copying the storage here).
1179          */
1180         hammer2_freemap_alloc(chain, nbytes);
1181         chain->bytes = nbytes;
1182         /*ip->delta_dcount += (ssize_t)(nbytes - obytes);*/ /* XXX atomic */
1183
1184         /*
1185          * We don't want the followup chain_modify() to try to copy data
1186          * from the old (wrong-sized) buffer.  It won't know how much to
1187          * copy.  This case should only occur during writes when the
1188          * originator already has the data to write in-hand.
1189          */
1190         if (chain->dio) {
1191                 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA);
1192                 hammer2_io_brelse(&chain->dio);
1193                 chain->data = NULL;
1194         }
1195 }
1196
1197 /*
1198  * Set the chain modified so its data can be changed by the caller.
1199  *
1200  * Sets bref.modify_tid to mtid only if mtid != 0.  Note that bref.modify_tid
1201  * is a CLC (cluster level change) field and is not updated by parent
1202  * propagation during a flush.
1203  *
1204  * If the caller passes a non-zero dedup_off we assign data_off to that
1205  * instead of allocating a ne block.  Caller must not modify the data already
1206  * present at the target offset.
1207  */
1208 void
1209 hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
1210                      hammer2_off_t dedup_off, int flags)
1211 {
1212         hammer2_blockref_t obref;
1213         hammer2_dev_t *hmp;
1214         hammer2_io_t *dio;
1215         int error;
1216         int wasinitial;
1217         int newmod;
1218         char *bdata;
1219
1220         hmp = chain->hmp;
1221         obref = chain->bref;
1222         KKASSERT((chain->flags & HAMMER2_CHAIN_FICTITIOUS) == 0);
1223
1224         /*
1225          * Data is not optional for freemap chains (we must always be sure
1226          * to copy the data on COW storage allocations).
1227          */
1228         if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
1229             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
1230                 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
1231                          (flags & HAMMER2_MODIFY_OPTDATA) == 0);
1232         }
1233
1234         /*
1235          * Data must be resolved if already assigned, unless explicitly
1236          * flagged otherwise.
1237          */
1238         if (chain->data == NULL && (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
1239             (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
1240                 hammer2_chain_load_data(chain);
1241         }
1242
1243         /*
1244          * Set MODIFIED to indicate that the chain has been modified.
1245          * Set UPDATE to ensure that the blockref is updated in the parent.
1246          */
1247         if ((chain->flags & (HAMMER2_CHAIN_DEDUP | HAMMER2_CHAIN_MODIFIED)) ==
1248             (HAMMER2_CHAIN_DEDUP | HAMMER2_CHAIN_MODIFIED)) {
1249                 /*
1250                  * Modified already set but a new allocation is needed
1251                  * anyway because we recorded this data_off for possible
1252                  * dedup operation.
1253                  */
1254                 newmod = 1;
1255         } else if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
1256                 /*
1257                  * Must set modified bit.
1258                  */
1259                 atomic_add_long(&hammer2_count_modified_chains, 1);
1260                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
1261                 hammer2_pfs_memory_inc(chain->pmp);     /* can be NULL */
1262                 newmod = 1;
1263         } else {
1264                 /*
1265                  * Already flagged modified, no new allocation is needed.
1266                  */
1267                 newmod = 0;
1268         }
1269
1270         /*
1271          * Flag parent update required, clear DEDUP flag (already processed
1272          * above).
1273          */
1274         if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0)
1275                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
1276         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DEDUP);
1277
1278         /*
1279          * The modification or re-modification requires an allocation and
1280          * possible COW.
1281          *
1282          * If dedup_off is non-zero, caller already has a data offset
1283          * containing the caller's desired data.  The dedup offset is
1284          * allowed to be in a partially free state and we must be sure
1285          * to reset it to a fully allocated state to force two bulkfree
1286          * passes to free it again.
1287          *
1288          * XXX can a chain already be marked MODIFIED without a data
1289          * assignment?  If not, assert here instead of testing the case.
1290          */
1291         if (chain != &hmp->vchain && chain != &hmp->fchain) {
1292                 if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
1293                      newmod
1294                 ) {
1295                         if (dedup_off) {
1296                                 chain->bref.data_off = dedup_off;
1297                                 chain->bytes = 1 << (dedup_off &
1298                                                      HAMMER2_OFF_MASK_RADIX);
1299                                 atomic_set_int(&chain->flags,
1300                                                HAMMER2_CHAIN_DEDUP);
1301                                 hammer2_freemap_adjust(hmp, &chain->bref,
1302                                                 HAMMER2_FREEMAP_DORECOVER);
1303                         } else {
1304                                 hammer2_freemap_alloc(chain, chain->bytes);
1305                         }
1306                         /* XXX failed allocation */
1307                 }
1308         }
1309
1310         /*
1311          * Update mirror_tid and modify_tid.  modify_tid is only updated
1312          * if not passed as zero (during flushes, parent propagation passes
1313          * the value 0).
1314          *
1315          * NOTE: chain->pmp could be the device spmp.
1316          */
1317         chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1;
1318         if (mtid)
1319                 chain->bref.modify_tid = mtid;
1320
1321         /*
1322          * Set BMAPUPD to tell the flush code that an existing blockmap entry
1323          * requires updating as well as to tell the delete code that the
1324          * chain's blockref might not exactly match (in terms of physical size
1325          * or block offset) the one in the parent's blocktable.  The base key
1326          * of course will still match.
1327          */
1328         if (chain->flags & HAMMER2_CHAIN_BMAPPED)
1329                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD);
1330
1331         /*
1332          * Short-cut data blocks which the caller does not need an actual
1333          * data reference to (aka OPTDATA), as long as the chain does not
1334          * already have a data pointer to the data.  This generally means
1335          * that the modifications are being done via the logical buffer cache.
1336          * The INITIAL flag relates only to the device data buffer and thus
1337          * remains unchange in this situation.
1338          */
1339         if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
1340             (flags & HAMMER2_MODIFY_OPTDATA) &&
1341             chain->data == NULL) {
1342                 goto skip2;
1343         }
1344
1345         /*
1346          * Clearing the INITIAL flag (for indirect blocks) indicates that
1347          * we've processed the uninitialized storage allocation.
1348          *
1349          * If this flag is already clear we are likely in a copy-on-write
1350          * situation but we have to be sure NOT to bzero the storage if
1351          * no data is present.
1352          */
1353         if (chain->flags & HAMMER2_CHAIN_INITIAL) {
1354                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1355                 wasinitial = 1;
1356         } else {
1357                 wasinitial = 0;
1358         }
1359
1360         /*
1361          * Instantiate data buffer and possibly execute COW operation
1362          */
1363         switch(chain->bref.type) {
1364         case HAMMER2_BREF_TYPE_VOLUME:
1365         case HAMMER2_BREF_TYPE_FREEMAP:
1366                 /*
1367                  * The data is embedded, no copy-on-write operation is
1368                  * needed.
1369                  */
1370                 KKASSERT(chain->dio == NULL);
1371                 break;
1372         case HAMMER2_BREF_TYPE_INODE:
1373         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1374         case HAMMER2_BREF_TYPE_DATA:
1375         case HAMMER2_BREF_TYPE_INDIRECT:
1376         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1377                 /*
1378                  * Perform the copy-on-write operation
1379                  *
1380                  * zero-fill or copy-on-write depending on whether
1381                  * chain->data exists or not and set the dirty state for
1382                  * the new buffer.  hammer2_io_new() will handle the
1383                  * zero-fill.
1384                  *
1385                  * If a dedup_off was supplied this is an existing block
1386                  * and no COW, copy, or further modification is required.
1387                  */
1388                 KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);
1389
1390                 if (wasinitial && dedup_off == 0) {
1391                         error = hammer2_io_new(hmp, chain->bref.data_off,
1392                                                chain->bytes, &dio);
1393                 } else {
1394                         error = hammer2_io_bread(hmp, chain->bref.data_off,
1395                                                  chain->bytes, &dio);
1396                 }
1397                 hammer2_adjreadcounter(&chain->bref, chain->bytes);
1398
1399                 /*
1400                  * If an I/O error occurs make sure callers cannot accidently
1401                  * modify the old buffer's contents and corrupt the filesystem.
1402                  */
1403                 if (error) {
1404                         kprintf("hammer2_chain_modify: hmp=%p I/O error\n",
1405                                 hmp);
1406                         chain->error = HAMMER2_ERROR_IO;
1407                         hammer2_io_brelse(&dio);
1408                         hammer2_io_brelse(&chain->dio);
1409                         chain->data = NULL;
1410                         break;
1411                 }
1412                 chain->error = 0;
1413                 bdata = hammer2_io_data(dio, chain->bref.data_off);
1414
1415                 if (chain->data) {
1416                         /*
1417                          * COW (unless a dedup).
1418                          */
1419                         KKASSERT(chain->dio != NULL);
1420                         if (chain->data != (void *)bdata && dedup_off == 0) {
1421                                 bcopy(chain->data, bdata, chain->bytes);
1422                         }
1423                 } else if (wasinitial == 0) {
1424                         /*
1425                          * We have a problem.  We were asked to COW but
1426                          * we don't have any data to COW with!
1427                          */
1428                         panic("hammer2_chain_modify: having a COW %p\n",
1429                               chain);
1430                 }
1431
1432                 /*
1433                  * Retire the old buffer, replace with the new.  Dirty or
1434                  * redirty the new buffer.
1435                  *
1436                  * WARNING! The system buffer cache may have already flushed
1437                  *          the buffer, so we must be sure to [re]dirty it
1438                  *          for further modification.
1439                  *
1440                  *          If dedup_off was supplied, the caller is not
1441                  *          expected to make any further modification to the
1442                  *          buffer.
1443                  */
1444                 if (chain->dio)
1445                         hammer2_io_bqrelse(&chain->dio);
1446                 chain->data = (void *)bdata;
1447                 chain->dio = dio;
1448                 if (dedup_off == 0)
1449                         hammer2_io_setdirty(dio);
1450                 break;
1451         default:
1452                 panic("hammer2_chain_modify: illegal non-embedded type %d",
1453                       chain->bref.type);
1454                 break;
1455
1456         }
1457 skip2:
1458         /*
1459          * setflush on parent indicating that the parent must recurse down
1460          * to us.  Do not call on chain itself which might already have it
1461          * set.
1462          */
1463         if (chain->parent)
1464                 hammer2_chain_setflush(chain->parent);
1465 }
1466
1467 /*
1468  * Modify the chain associated with an inode.
1469  */
1470 void
1471 hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain,
1472                         hammer2_tid_t mtid, int flags)
1473 {
1474         hammer2_inode_modify(ip);
1475         hammer2_chain_modify(chain, mtid, 0, flags);
1476 }
1477
1478 /*
1479  * Volume header data locks
1480  */
1481 void
1482 hammer2_voldata_lock(hammer2_dev_t *hmp)
1483 {
1484         lockmgr(&hmp->vollk, LK_EXCLUSIVE);
1485 }
1486
1487 void
1488 hammer2_voldata_unlock(hammer2_dev_t *hmp)
1489 {
1490         lockmgr(&hmp->vollk, LK_RELEASE);
1491 }
1492
1493 void
1494 hammer2_voldata_modify(hammer2_dev_t *hmp)
1495 {
1496         if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) {
1497                 atomic_add_long(&hammer2_count_modified_chains, 1);
1498                 atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
1499                 hammer2_pfs_memory_inc(hmp->vchain.pmp);
1500         }
1501 }
1502
1503 /*
1504  * This function returns the chain at the nearest key within the specified
1505  * range.  The returned chain will be referenced but not locked.
1506  *
1507  * This function will recurse through chain->rbtree as necessary and will
1508  * return a *key_nextp suitable for iteration.  *key_nextp is only set if
1509  * the iteration value is less than the current value of *key_nextp.
1510  *
1511  * The caller should use (*key_nextp) to calculate the actual range of
1512  * the returned element, which will be (key_beg to *key_nextp - 1), because
1513  * there might be another element which is superior to the returned element
1514  * and overlaps it.
1515  *
1516  * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
1517  * chains continue to be returned.  On EOF (*key_nextp) may overflow since
1518  * it will wind up being (key_end + 1).
1519  *
1520  * WARNING!  Must be called with child's spinlock held.  Spinlock remains
1521  *           held through the operation.
1522  */
1523 struct hammer2_chain_find_info {
1524         hammer2_chain_t         *best;
1525         hammer2_key_t           key_beg;
1526         hammer2_key_t           key_end;
1527         hammer2_key_t           key_next;
1528 };
1529
1530 static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
1531 static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);
1532
1533 static
1534 hammer2_chain_t *
1535 hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
1536                           hammer2_key_t key_beg, hammer2_key_t key_end)
1537 {
1538         struct hammer2_chain_find_info info;
1539
1540         info.best = NULL;
1541         info.key_beg = key_beg;
1542         info.key_end = key_end;
1543         info.key_next = *key_nextp;
1544
1545         RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
1546                 hammer2_chain_find_cmp, hammer2_chain_find_callback,
1547                 &info);
1548         *key_nextp = info.key_next;
1549 #if 0
1550         kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
1551                 parent, key_beg, key_end, *key_nextp);
1552 #endif
1553
1554         return (info.best);
1555 }
1556
1557 static
1558 int
1559 hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
1560 {
1561         struct hammer2_chain_find_info *info = data;
1562         hammer2_key_t child_beg;
1563         hammer2_key_t child_end;
1564
1565         child_beg = child->bref.key;
1566         child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;
1567
1568         if (child_end < info->key_beg)
1569                 return(-1);
1570         if (child_beg > info->key_end)
1571                 return(1);
1572         return(0);
1573 }
1574
1575 static
1576 int
1577 hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
1578 {
1579         struct hammer2_chain_find_info *info = data;
1580         hammer2_chain_t *best;
1581         hammer2_key_t child_end;
1582
1583         /*
1584          * WARNING! Do not discard DUPLICATED chains, it is possible that
1585          *          we are catching an insertion half-way done.  If a
1586          *          duplicated chain turns out to be the best choice the
1587          *          caller will re-check its flags after locking it.
1588          *
1589          * WARNING! Layerq is scanned forwards, exact matches should keep
1590          *          the existing info->best.
1591          */
1592         if ((best = info->best) == NULL) {
1593                 /*
1594                  * No previous best.  Assign best
1595                  */
1596                 info->best = child;
1597         } else if (best->bref.key <= info->key_beg &&
1598                    child->bref.key <= info->key_beg) {
1599                 /*
1600                  * Illegal overlap.
1601                  */
1602                 KKASSERT(0);
1603                 /*info->best = child;*/
1604         } else if (child->bref.key < best->bref.key) {
1605                 /*
1606                  * Child has a nearer key and best is not flush with key_beg.
1607                  * Set best to child.  Truncate key_next to the old best key.
1608                  */
1609                 info->best = child;
1610                 if (info->key_next > best->bref.key || info->key_next == 0)
1611                         info->key_next = best->bref.key;
1612         } else if (child->bref.key == best->bref.key) {
1613                 /*
1614                  * If our current best is flush with the child then this
1615                  * is an illegal overlap.
1616                  *
1617                  * key_next will automatically be limited to the smaller of
1618                  * the two end-points.
1619                  */
1620                 KKASSERT(0);
1621                 info->best = child;
1622         } else {
1623                 /*
1624                  * Keep the current best but truncate key_next to the child's
1625                  * base.
1626                  *
1627                  * key_next will also automatically be limited to the smaller
1628                  * of the two end-points (probably not necessary for this case
1629                  * but we do it anyway).
1630                  */
1631                 if (info->key_next > child->bref.key || info->key_next == 0)
1632                         info->key_next = child->bref.key;
1633         }
1634
1635         /*
1636          * Always truncate key_next based on child's end-of-range.
1637          */
1638         child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
1639         if (child_end && (info->key_next > child_end || info->key_next == 0))
1640                 info->key_next = child_end;
1641
1642         return(0);
1643 }
1644
1645 /*
1646  * Retrieve the specified chain from a media blockref, creating the
1647  * in-memory chain structure which reflects it.
1648  *
1649  * To handle insertion races pass the INSERT_RACE flag along with the
1650  * generation number of the core.  NULL will be returned if the generation
1651  * number changes before we have a chance to insert the chain.  Insert
1652  * races can occur because the parent might be held shared.
1653  *
1654  * Caller must hold the parent locked shared or exclusive since we may
1655  * need the parent's bref array to find our block.
1656  *
1657  * WARNING! chain->pmp is always set to NULL for any chain representing
1658  *          part of the super-root topology.
1659  */
1660 hammer2_chain_t *
1661 hammer2_chain_get(hammer2_chain_t *parent, int generation,
1662                   hammer2_blockref_t *bref)
1663 {
1664         hammer2_dev_t *hmp = parent->hmp;
1665         hammer2_chain_t *chain;
1666         int error;
1667
1668         /*
1669          * Allocate a chain structure representing the existing media
1670          * entry.  Resulting chain has one ref and is not locked.
1671          */
1672         if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
1673                 chain = hammer2_chain_alloc(hmp, NULL, bref);
1674         else
1675                 chain = hammer2_chain_alloc(hmp, parent->pmp, bref);
1676         /* ref'd chain returned */
1677
1678         /*
1679          * Flag that the chain is in the parent's blockmap so delete/flush
1680          * knows what to do with it.
1681          */
1682         atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);
1683
1684         /*
1685          * Link the chain into its parent.  A spinlock is required to safely
1686          * access the RBTREE, and it is possible to collide with another
1687          * hammer2_chain_get() operation because the caller might only hold
1688          * a shared lock on the parent.
1689          */
1690         KKASSERT(parent->refs > 0);
1691         error = hammer2_chain_insert(parent, chain,
1692                                      HAMMER2_CHAIN_INSERT_SPIN |
1693                                      HAMMER2_CHAIN_INSERT_RACE,
1694                                      generation);
1695         if (error) {
1696                 KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
1697                 kprintf("chain %p get race\n", chain);
1698                 hammer2_chain_drop(chain);
1699                 chain = NULL;
1700         } else {
1701                 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
1702         }
1703
1704         /*
1705          * Return our new chain referenced but not locked, or NULL if
1706          * a race occurred.
1707          */
1708         return (chain);
1709 }
1710
1711 /*
1712  * Lookup initialization/completion API
1713  */
1714 hammer2_chain_t *
1715 hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
1716 {
1717         hammer2_chain_ref(parent);
1718         if (flags & HAMMER2_LOOKUP_SHARED) {
1719                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
1720                                            HAMMER2_RESOLVE_SHARED);
1721         } else {
1722                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
1723         }
1724         return (parent);
1725 }
1726
1727 void
1728 hammer2_chain_lookup_done(hammer2_chain_t *parent)
1729 {
1730         if (parent) {
1731                 hammer2_chain_unlock(parent);
1732                 hammer2_chain_drop(parent);
1733         }
1734 }
1735
1736 hammer2_chain_t *
1737 hammer2_chain_getparent(hammer2_chain_t **parentp, int how)
1738 {
1739         hammer2_chain_t *oparent;
1740         hammer2_chain_t *nparent;
1741
1742         /*
1743          * Be careful of order, oparent must be unlocked before nparent
1744          * is locked below to avoid a deadlock.
1745          */
1746         oparent = *parentp;
1747         hammer2_spin_ex(&oparent->core.spin);
1748         nparent = oparent->parent;
1749         hammer2_chain_ref(nparent);
1750         hammer2_spin_unex(&oparent->core.spin);
1751         if (oparent) {
1752                 hammer2_chain_unlock(oparent);
1753                 hammer2_chain_drop(oparent);
1754                 oparent = NULL;
1755         }
1756
1757         hammer2_chain_lock(nparent, how);
1758         *parentp = nparent;
1759
1760         return (nparent);
1761 }
1762
1763 /*
1764  * Locate the first chain whos key range overlaps (key_beg, key_end) inclusive.
1765  * (*parentp) typically points to an inode but can also point to a related
1766  * indirect block and this function will recurse upwards and find the inode
1767  * again.
1768  *
1769  * (*parentp) must be exclusively locked and referenced and can be an inode
1770  * or an existing indirect block within the inode.
1771  *
1772  * On return (*parentp) will be modified to point at the deepest parent chain
1773  * element encountered during the search, as a helper for an insertion or
1774  * deletion.   The new (*parentp) will be locked and referenced and the old
1775  * will be unlocked and dereferenced (no change if they are both the same).
1776  *
1777  * The matching chain will be returned exclusively locked.  If NOLOCK is
1778  * requested the chain will be returned only referenced.  Note that the
1779  * parent chain must always be locked shared or exclusive, matching the
1780  * HAMMER2_LOOKUP_SHARED flag.  We can conceivably lock it SHARED temporarily
1781  * when NOLOCK is specified but that complicates matters if *parentp must
1782  * inherit the chain.
1783  *
1784  * NOLOCK also implies NODATA, since an unlocked chain usually has a NULL
1785  * data pointer or can otherwise be in flux.
1786  *
1787  * NULL is returned if no match was found, but (*parentp) will still
1788  * potentially be adjusted.
1789  *
1790  * If a fatal error occurs (typically an I/O error), a dummy chain is
1791  * returned with chain->error and error-identifying information set.  This
1792  * chain will assert if you try to do anything fancy with it.
1793  *
1794  * XXX Depending on where the error occurs we should allow continued iteration.
1795  *
1796  * On return (*key_nextp) will point to an iterative value for key_beg.
1797  * (If NULL is returned (*key_nextp) is set to (key_end + 1)).
1798  *
1799  * This function will also recurse up the chain if the key is not within the
1800  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
1801  * can simply allow (*parentp) to float inside the loop.
1802  *
1803  * NOTE!  chain->data is not always resolved.  By default it will not be
1804  *        resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF.  Use
1805  *        HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/
1806  *        BREF_TYPE_DATA as the device buffer can alias the logical file
1807  *        buffer).
1808  */
1809 hammer2_chain_t *
1810 hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
1811                      hammer2_key_t key_beg, hammer2_key_t key_end,
1812                      int *cache_indexp, int flags)
1813 {
1814         hammer2_dev_t *hmp;
1815         hammer2_chain_t *parent;
1816         hammer2_chain_t *chain;
1817         hammer2_blockref_t *base;
1818         hammer2_blockref_t *bref;
1819         hammer2_blockref_t bcopy;
1820         hammer2_key_t scan_beg;
1821         hammer2_key_t scan_end;
1822         int count = 0;
1823         int how_always = HAMMER2_RESOLVE_ALWAYS;
1824         int how_maybe = HAMMER2_RESOLVE_MAYBE;
1825         int how;
1826         int generation;
1827         int maxloops = 300000;
1828
1829         if (flags & HAMMER2_LOOKUP_ALWAYS) {
1830                 how_maybe = how_always;
1831                 how = HAMMER2_RESOLVE_ALWAYS;
1832         } else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
1833                 how = HAMMER2_RESOLVE_NEVER;
1834         } else {
1835                 how = HAMMER2_RESOLVE_MAYBE;
1836         }
1837         if (flags & HAMMER2_LOOKUP_SHARED) {
1838                 how_maybe |= HAMMER2_RESOLVE_SHARED;
1839                 how_always |= HAMMER2_RESOLVE_SHARED;
1840                 how |= HAMMER2_RESOLVE_SHARED;
1841         }
1842
1843         /*
1844          * Recurse (*parentp) upward if necessary until the parent completely
1845          * encloses the key range or we hit the inode.
1846          *
1847          * This function handles races against the flusher doing a delete-
1848          * duplicate above us and re-homes the parent to the duplicate in
1849          * that case, otherwise we'd wind up recursing down a stale chain.
1850          */
1851         parent = *parentp;
1852         hmp = parent->hmp;
1853
1854         while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
1855                parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1856                 scan_beg = parent->bref.key;
1857                 scan_end = scan_beg +
1858                            ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1859                 if (key_beg >= scan_beg && key_end <= scan_end)
1860                         break;
1861                 parent = hammer2_chain_getparent(parentp, how_maybe);
1862         }
1863
1864 again:
1865         if (--maxloops == 0)
1866                 panic("hammer2_chain_lookup: maxloops");
1867         /*
1868          * Locate the blockref array.  Currently we do a fully associative
1869          * search through the array.
1870          */
1871         switch(parent->bref.type) {
1872         case HAMMER2_BREF_TYPE_INODE:
1873                 /*
1874                  * Special shortcut for embedded data returns the inode
1875                  * itself.  Callers must detect this condition and access
1876                  * the embedded data (the strategy code does this for us).
1877                  *
1878                  * This is only applicable to regular files and softlinks.
1879                  */
1880                 if (parent->data->ipdata.meta.op_flags &
1881                     HAMMER2_OPFLAG_DIRECTDATA) {
1882                         if (flags & HAMMER2_LOOKUP_NODIRECT) {
1883                                 chain = NULL;
1884                                 *key_nextp = key_end + 1;
1885                                 goto done;
1886                         }
1887                         hammer2_chain_ref(parent);
1888                         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1889                                 hammer2_chain_lock(parent, how_always);
1890                         *key_nextp = key_end + 1;
1891                         return (parent);
1892                 }
1893                 base = &parent->data->ipdata.u.blockset.blockref[0];
1894                 count = HAMMER2_SET_COUNT;
1895                 break;
1896         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1897         case HAMMER2_BREF_TYPE_INDIRECT:
1898                 /*
1899                  * Handle MATCHIND on the parent
1900                  */
1901                 if (flags & HAMMER2_LOOKUP_MATCHIND) {
1902                         scan_beg = parent->bref.key;
1903                         scan_end = scan_beg +
1904                                ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1905                         if (key_beg == scan_beg && key_end == scan_end) {
1906                                 chain = parent;
1907                                 hammer2_chain_ref(chain);
1908                                 hammer2_chain_lock(chain, how_maybe);
1909                                 *key_nextp = scan_end + 1;
1910                                 goto done;
1911                         }
1912                 }
1913                 /*
1914                  * Optimize indirect blocks in the INITIAL state to avoid
1915                  * I/O.
1916                  */
1917                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1918                         base = NULL;
1919                 } else {
1920                         if (parent->data == NULL)
1921                                 panic("parent->data is NULL");
1922                         base = &parent->data->npdata[0];
1923                 }
1924                 count = parent->bytes / sizeof(hammer2_blockref_t);
1925                 break;
1926         case HAMMER2_BREF_TYPE_VOLUME:
1927                 base = &parent->data->voldata.sroot_blockset.blockref[0];
1928                 count = HAMMER2_SET_COUNT;
1929                 break;
1930         case HAMMER2_BREF_TYPE_FREEMAP:
1931                 base = &parent->data->blkset.blockref[0];
1932                 count = HAMMER2_SET_COUNT;
1933                 break;
1934         default:
1935                 kprintf("hammer2_chain_lookup: unrecognized "
1936                         "blockref(B) type: %d",
1937                         parent->bref.type);
1938                 while (1)
1939                         tsleep(&base, 0, "dead", 0);
1940                 panic("hammer2_chain_lookup: unrecognized "
1941                       "blockref(B) type: %d",
1942                       parent->bref.type);
1943                 base = NULL;    /* safety */
1944                 count = 0;      /* safety */
1945         }
1946
1947         /*
1948          * Merged scan to find next candidate.
1949          *
1950          * hammer2_base_*() functions require the parent->core.live_* fields
1951          * to be synchronized.
1952          *
1953          * We need to hold the spinlock to access the block array and RB tree
1954          * and to interlock chain creation.
1955          */
1956         if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
1957                 hammer2_chain_countbrefs(parent, base, count);
1958
1959         /*
1960          * Combined search
1961          */
1962         hammer2_spin_ex(&parent->core.spin);
1963         chain = hammer2_combined_find(parent, base, count,
1964                                       cache_indexp, key_nextp,
1965                                       key_beg, key_end,
1966                                       &bref);
1967         generation = parent->core.generation;
1968
1969         /*
1970          * Exhausted parent chain, iterate.
1971          */
1972         if (bref == NULL) {
1973                 hammer2_spin_unex(&parent->core.spin);
1974                 if (key_beg == key_end) /* short cut single-key case */
1975                         return (NULL);
1976
1977                 /*
1978                  * Stop if we reached the end of the iteration.
1979                  */
1980                 if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
1981                     parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1982                         return (NULL);
1983                 }
1984
1985                 /*
1986                  * Calculate next key, stop if we reached the end of the
1987                  * iteration, otherwise go up one level and loop.
1988                  */
1989                 key_beg = parent->bref.key +
1990                           ((hammer2_key_t)1 << parent->bref.keybits);
1991                 if (key_beg == 0 || key_beg > key_end)
1992                         return (NULL);
1993                 parent = hammer2_chain_getparent(parentp, how_maybe);
1994                 goto again;
1995         }
1996
1997         /*
1998          * Selected from blockref or in-memory chain.
1999          */
2000         if (chain == NULL) {
2001                 bcopy = *bref;
2002                 hammer2_spin_unex(&parent->core.spin);
2003                 chain = hammer2_chain_get(parent, generation,
2004                                           &bcopy);
2005                 if (chain == NULL) {
2006                         kprintf("retry lookup parent %p keys %016jx:%016jx\n",
2007                                 parent, key_beg, key_end);
2008                         goto again;
2009                 }
2010                 if (bcmp(&bcopy, bref, sizeof(bcopy))) {
2011                         hammer2_chain_drop(chain);
2012                         goto again;
2013                 }
2014         } else {
2015                 hammer2_chain_ref(chain);
2016                 hammer2_spin_unex(&parent->core.spin);
2017         }
2018
2019         /*
2020          * chain is referenced but not locked.  We must lock the chain
2021          * to obtain definitive DUPLICATED/DELETED state
2022          */
2023         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
2024             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2025                 hammer2_chain_lock(chain, how_maybe);
2026         } else {
2027                 hammer2_chain_lock(chain, how);
2028         }
2029
2030         /*
2031          * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
2032          *
2033          * NOTE: Chain's key range is not relevant as there might be
2034          *       one-offs within the range that are not deleted.
2035          *
2036          * NOTE: Lookups can race delete-duplicate because
2037          *       delete-duplicate does not lock the parent's core
2038          *       (they just use the spinlock on the core).  We must
2039          *       check for races by comparing the DUPLICATED flag before
2040          *       releasing the spinlock with the flag after locking the
2041          *       chain.
2042          */
2043         if (chain->flags & HAMMER2_CHAIN_DELETED) {
2044                 hammer2_chain_unlock(chain);
2045                 hammer2_chain_drop(chain);
2046                 key_beg = *key_nextp;
2047                 if (key_beg == 0 || key_beg > key_end)
2048                         return(NULL);
2049                 goto again;
2050         }
2051
2052         /*
2053          * If the chain element is an indirect block it becomes the new
2054          * parent and we loop on it.  We must maintain our top-down locks
2055          * to prevent the flusher from interfering (i.e. doing a
2056          * delete-duplicate and leaving us recursing down a deleted chain).
2057          *
2058          * The parent always has to be locked with at least RESOLVE_MAYBE
2059          * so we can access its data.  It might need a fixup if the caller
2060          * passed incompatible flags.  Be careful not to cause a deadlock
2061          * as a data-load requires an exclusive lock.
2062          *
2063          * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
2064          * range is within the requested key range we return the indirect
2065          * block and do NOT loop.  This is usually only used to acquire
2066          * freemap nodes.
2067          */
2068         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
2069             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2070                 hammer2_chain_unlock(parent);
2071                 hammer2_chain_drop(parent);
2072                 *parentp = parent = chain;
2073                 goto again;
2074         }
2075 done:
2076         /*
2077          * All done, return the chain.
2078          *
2079          * If the caller does not want a locked chain, replace the lock with
2080          * a ref.  Perhaps this can eventually be optimized to not obtain the
2081          * lock in the first place for situations where the data does not
2082          * need to be resolved.
2083          */
2084         if (chain) {
2085                 if (flags & HAMMER2_LOOKUP_NOLOCK)
2086                         hammer2_chain_unlock(chain);
2087         }
2088
2089         return (chain);
2090 }
2091
2092 /*
2093  * After having issued a lookup we can iterate all matching keys.
2094  *
2095  * If chain is non-NULL we continue the iteration from just after it's index.
2096  *
2097  * If chain is NULL we assume the parent was exhausted and continue the
2098  * iteration at the next parent.
2099  *
2100  * If a fatal error occurs (typically an I/O error), a dummy chain is
2101  * returned with chain->error and error-identifying information set.  This
2102  * chain will assert if you try to do anything fancy with it.
2103  *
2104  * XXX Depending on where the error occurs we should allow continued iteration.
2105  *
2106  * parent must be locked on entry and remains locked throughout.  chain's
2107  * lock status must match flags.  Chain is always at least referenced.
2108  *
2109  * WARNING!  The MATCHIND flag does not apply to this function.
2110  */
2111 hammer2_chain_t *
2112 hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
2113                    hammer2_key_t *key_nextp,
2114                    hammer2_key_t key_beg, hammer2_key_t key_end,
2115                    int *cache_indexp, int flags)
2116 {
2117         hammer2_chain_t *parent;
2118         int how_maybe;
2119
2120         /*
2121          * Calculate locking flags for upward recursion.
2122          */
2123         how_maybe = HAMMER2_RESOLVE_MAYBE;
2124         if (flags & HAMMER2_LOOKUP_SHARED)
2125                 how_maybe |= HAMMER2_RESOLVE_SHARED;
2126
2127         parent = *parentp;
2128
2129         /*
2130          * Calculate the next index and recalculate the parent if necessary.
2131          */
2132         if (chain) {
2133                 key_beg = chain->bref.key +
2134                           ((hammer2_key_t)1 << chain->bref.keybits);
2135                 if ((flags & (HAMMER2_LOOKUP_NOLOCK |
2136                               HAMMER2_LOOKUP_NOUNLOCK)) == 0) {
2137                         hammer2_chain_unlock(chain);
2138                 }
2139                 hammer2_chain_drop(chain);
2140
2141                 /*
2142                  * chain invalid past this point, but we can still do a
2143                  * pointer comparison w/parent.
2144                  *
2145                  * Any scan where the lookup returned degenerate data embedded
2146                  * in the inode has an invalid index and must terminate.
2147                  */
2148                 if (chain == parent)
2149                         return(NULL);
2150                 if (key_beg == 0 || key_beg > key_end)
2151                         return(NULL);
2152                 chain = NULL;
2153         } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
2154                    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2155                 /*
2156                  * We reached the end of the iteration.
2157                  */
2158                 return (NULL);
2159         } else {
2160                 /*
2161                  * Continue iteration with next parent unless the current
2162                  * parent covers the range.
2163                  */
2164                 key_beg = parent->bref.key +
2165                           ((hammer2_key_t)1 << parent->bref.keybits);
2166                 if (key_beg == 0 || key_beg > key_end)
2167                         return (NULL);
2168                 parent = hammer2_chain_getparent(parentp, how_maybe);
2169         }
2170
2171         /*
2172          * And execute
2173          */
2174         return (hammer2_chain_lookup(parentp, key_nextp,
2175                                      key_beg, key_end,
2176                                      cache_indexp, flags));
2177 }
2178
2179 /*
2180  * The raw scan function is similar to lookup/next but does not seek to a key.
2181  * Blockrefs are iterated via first_bref = (parent, NULL) and
2182  * next_chain = (parent, bref).
2183  *
2184  * The passed-in parent must be locked and its data resolved.  The function
2185  * nominally returns a locked and referenced *chainp != NULL for chains
2186  * the caller might need to recurse on (and will dipose of any *chainp passed
2187  * in).  The caller must check the chain->bref.type either way.
2188  *
2189  * *chainp is not set for leaf elements.
2190  *
2191  * This function takes a pointer to a stack-based bref structure whos
2192  * contents is updated for each iteration.  The same pointer is returned,
2193  * or NULL when the iteration is complete.  *firstp must be set to 1 for
2194  * the first ieration.  This function will set it to 0.
2195  */
2196 hammer2_blockref_t *
2197 hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
2198                    hammer2_blockref_t *bref, int *firstp,
2199                    int *cache_indexp, int flags)
2200 {
2201         hammer2_dev_t *hmp;
2202         hammer2_blockref_t *base;
2203         hammer2_blockref_t *bref_ptr;
2204         hammer2_key_t key;
2205         hammer2_key_t next_key;
2206         hammer2_chain_t *chain = NULL;
2207         int count = 0;
2208         int how_always = HAMMER2_RESOLVE_ALWAYS;
2209         int how_maybe = HAMMER2_RESOLVE_MAYBE;
2210         int how;
2211         int generation;
2212         int maxloops = 300000;
2213
2214         hmp = parent->hmp;
2215
2216         /*
2217          * Scan flags borrowed from lookup.
2218          */
2219         if (flags & HAMMER2_LOOKUP_ALWAYS) {
2220                 how_maybe = how_always;
2221                 how = HAMMER2_RESOLVE_ALWAYS;
2222         } else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
2223                 how = HAMMER2_RESOLVE_NEVER;
2224         } else {
2225                 how = HAMMER2_RESOLVE_MAYBE;
2226         }
2227         if (flags & HAMMER2_LOOKUP_SHARED) {
2228                 how_maybe |= HAMMER2_RESOLVE_SHARED;
2229                 how_always |= HAMMER2_RESOLVE_SHARED;
2230                 how |= HAMMER2_RESOLVE_SHARED;
2231         }
2232
2233         /*
2234          * Calculate key to locate first/next element, unlocking the previous
2235          * element as we go.  Be careful, the key calculation can overflow.
2236          *
2237          * (also reset bref to NULL)
2238          */
2239         if (*firstp) {
2240                 key = 0;
2241                 *firstp = 0;
2242         } else {
2243                 key = bref->key + ((hammer2_key_t)1 << bref->keybits);
2244                 if ((chain = *chainp) != NULL) {
2245                         *chainp = NULL;
2246                         hammer2_chain_unlock(chain);
2247                         hammer2_chain_drop(chain);
2248                         chain = NULL;
2249                 }
2250                 if (key == 0) {
2251                         bref = NULL;
2252                         goto done;
2253                 }
2254         }
2255
2256 again:
2257         KKASSERT(parent->error == 0);   /* XXX case not handled yet */
2258         if (--maxloops == 0)
2259                 panic("hammer2_chain_scan: maxloops");
2260         /*
2261          * Locate the blockref array.  Currently we do a fully associative
2262          * search through the array.
2263          */
2264         switch(parent->bref.type) {
2265         case HAMMER2_BREF_TYPE_INODE:
2266                 /*
2267                  * An inode with embedded data has no sub-chains.
2268                  *
2269                  * WARNING! Bulk scan code may pass a static chain marked
2270                  *          as BREF_TYPE_INODE with a copy of the volume
2271                  *          root blockset to snapshot the volume.
2272                  */
2273                 if (parent->data->ipdata.meta.op_flags &
2274                     HAMMER2_OPFLAG_DIRECTDATA) {
2275                         bref = NULL;
2276                         goto done;
2277                 }
2278                 base = &parent->data->ipdata.u.blockset.blockref[0];
2279                 count = HAMMER2_SET_COUNT;
2280                 break;
2281         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2282         case HAMMER2_BREF_TYPE_INDIRECT:
2283                 /*
2284                  * Optimize indirect blocks in the INITIAL state to avoid
2285                  * I/O.
2286                  */
2287                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2288                         base = NULL;
2289                 } else {
2290                         if (parent->data == NULL)
2291                                 panic("parent->data is NULL");
2292                         base = &parent->data->npdata[0];
2293                 }
2294                 count = parent->bytes / sizeof(hammer2_blockref_t);
2295                 break;
2296         case HAMMER2_BREF_TYPE_VOLUME:
2297                 base = &parent->data->voldata.sroot_blockset.blockref[0];
2298                 count = HAMMER2_SET_COUNT;
2299                 break;
2300         case HAMMER2_BREF_TYPE_FREEMAP:
2301                 base = &parent->data->blkset.blockref[0];
2302                 count = HAMMER2_SET_COUNT;
2303                 break;
2304         default:
2305                 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
2306                       parent->bref.type);
2307                 base = NULL;    /* safety */
2308                 count = 0;      /* safety */
2309         }
2310
2311         /*
2312          * Merged scan to find next candidate.
2313          *
2314          * hammer2_base_*() functions require the parent->core.live_* fields
2315          * to be synchronized.
2316          *
2317          * We need to hold the spinlock to access the block array and RB tree
2318          * and to interlock chain creation.
2319          */
2320         if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
2321                 hammer2_chain_countbrefs(parent, base, count);
2322
2323         next_key = 0;
2324         bref_ptr = NULL;
2325         hammer2_spin_ex(&parent->core.spin);
2326         chain = hammer2_combined_find(parent, base, count,
2327                                       cache_indexp, &next_key,
2328                                       key, HAMMER2_KEY_MAX,
2329                                       &bref_ptr);
2330         generation = parent->core.generation;
2331
2332         /*
2333          * Exhausted parent chain, we're done.
2334          */
2335         if (bref_ptr == NULL) {
2336                 hammer2_spin_unex(&parent->core.spin);
2337                 KKASSERT(chain == NULL);
2338                 bref = NULL;
2339                 goto done;
2340         }
2341
2342         /*
2343          * Copy into the supplied stack-based blockref.
2344          */
2345         *bref = *bref_ptr;
2346
2347         /*
2348          * Selected from blockref or in-memory chain.
2349          */
2350         if (chain == NULL) {
2351                 switch(bref->type) {
2352                 case HAMMER2_BREF_TYPE_INODE:
2353                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2354                 case HAMMER2_BREF_TYPE_INDIRECT:
2355                 case HAMMER2_BREF_TYPE_VOLUME:
2356                 case HAMMER2_BREF_TYPE_FREEMAP:
2357                         /*
2358                          * Recursion, always get the chain
2359                          */
2360                         hammer2_spin_unex(&parent->core.spin);
2361                         chain = hammer2_chain_get(parent, generation, bref);
2362                         if (chain == NULL) {
2363                                 kprintf("retry scan parent %p keys %016jx\n",
2364                                         parent, key);
2365                                 goto again;
2366                         }
2367                         if (bcmp(bref, bref_ptr, sizeof(*bref))) {
2368                                 hammer2_chain_drop(chain);
2369                                 chain = NULL;
2370                                 goto again;
2371                         }
2372                         break;
2373                 default:
2374                         /*
2375                          * No recursion, do not waste time instantiating
2376                          * a chain, just iterate using the bref.
2377                          */
2378                         hammer2_spin_unex(&parent->core.spin);
2379                         break;
2380                 }
2381         } else {
2382                 /*
2383                  * Recursion or not we need the chain in order to supply
2384                  * the bref.
2385                  */
2386                 hammer2_chain_ref(chain);
2387                 hammer2_spin_unex(&parent->core.spin);
2388         }
2389
2390         /*
2391          * chain is referenced but not locked.  We must lock the chain
2392          * to obtain definitive DUPLICATED/DELETED state
2393          */
2394         if (chain)
2395                 hammer2_chain_lock(chain, how);
2396
2397         /*
2398          * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
2399          *
2400          * NOTE: chain's key range is not relevant as there might be
2401          *       one-offs within the range that are not deleted.
2402          *
2403          * NOTE: XXX this could create problems with scans used in
2404          *       situations other than mount-time recovery.
2405          *
2406          * NOTE: Lookups can race delete-duplicate because
2407          *       delete-duplicate does not lock the parent's core
2408          *       (they just use the spinlock on the core).  We must
2409          *       check for races by comparing the DUPLICATED flag before
2410          *       releasing the spinlock with the flag after locking the
2411          *       chain.
2412          */
2413         if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
2414                 hammer2_chain_unlock(chain);
2415                 hammer2_chain_drop(chain);
2416                 chain = NULL;
2417
2418                 key = next_key;
2419                 if (key == 0) {
2420                         bref = NULL;
2421                         goto done;
2422                 }
2423                 goto again;
2424         }
2425
2426 done:
2427         /*
2428          * All done, return the bref or NULL, supply chain if necessary.
2429          */
2430         if (chain)
2431                 *chainp = chain;
2432         return (bref);
2433 }
2434
2435 /*
2436  * Create and return a new hammer2 system memory structure of the specified
2437  * key, type and size and insert it under (*parentp).  This is a full
2438  * insertion, based on the supplied key/keybits, and may involve creating
2439  * indirect blocks and moving other chains around via delete/duplicate.
2440  *
2441  * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
2442  * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
2443  * FULL.  This typically means that the caller is creating the chain after
2444  * doing a hammer2_chain_lookup().
2445  *
2446  * (*parentp) must be exclusive locked and may be replaced on return
2447  * depending on how much work the function had to do.
2448  *
2449  * (*parentp) must not be errored or this function will assert.
2450  *
2451  * (*chainp) usually starts out NULL and returns the newly created chain,
2452  * but if the caller desires the caller may allocate a disconnected chain
2453  * and pass it in instead.
2454  *
2455  * This function should NOT be used to insert INDIRECT blocks.  It is
2456  * typically used to create/insert inodes and data blocks.
2457  *
2458  * Caller must pass-in an exclusively locked parent the new chain is to
2459  * be inserted under, and optionally pass-in a disconnected, exclusively
2460  * locked chain to insert (else we create a new chain).  The function will
2461  * adjust (*parentp) as necessary, create or connect the chain, and
2462  * return an exclusively locked chain in *chainp.
2463  *
2464  * When creating a PFSROOT inode under the super-root, pmp is typically NULL
2465  * and will be reassigned.
2466  */
2467 int
2468 hammer2_chain_create(hammer2_chain_t **parentp,
2469                      hammer2_chain_t **chainp, hammer2_pfs_t *pmp,
2470                      hammer2_key_t key, int keybits, int type, size_t bytes,
2471                      hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags)
2472 {
2473         hammer2_dev_t *hmp;
2474         hammer2_chain_t *chain;
2475         hammer2_chain_t *parent;
2476         hammer2_blockref_t *base;
2477         hammer2_blockref_t dummy;
2478         int allocated = 0;
2479         int error = 0;
2480         int count;
2481         int maxloops = 300000;
2482
2483         /*
2484          * Topology may be crossing a PFS boundary.
2485          */
2486         parent = *parentp;
2487         KKASSERT(hammer2_mtx_owned(&parent->lock));
2488         KKASSERT(parent->error == 0);
2489         hmp = parent->hmp;
2490         chain = *chainp;
2491
2492         if (chain == NULL) {
2493                 /*
2494                  * First allocate media space and construct the dummy bref,
2495                  * then allocate the in-memory chain structure.  Set the
2496                  * INITIAL flag for fresh chains which do not have embedded
2497                  * data.
2498                  *
2499                  * XXX for now set the check mode of the child based on
2500                  *     the parent or, if the parent is an inode, the
2501                  *     specification in the inode.
2502                  */
2503                 bzero(&dummy, sizeof(dummy));
2504                 dummy.type = type;
2505                 dummy.key = key;
2506                 dummy.keybits = keybits;
2507                 dummy.data_off = hammer2_getradix(bytes);
2508                 dummy.methods = parent->bref.methods;
2509                 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE &&
2510                     parent->data) {
2511                         dummy.methods &= ~HAMMER2_ENC_CHECK(-1);
2512                         dummy.methods |= HAMMER2_ENC_CHECK(
2513                                           parent->data->ipdata.meta.check_algo);
2514                 }
2515
2516                 chain = hammer2_chain_alloc(hmp, pmp, &dummy);
2517
2518                 /*
2519                  * Lock the chain manually, chain_lock will load the chain
2520                  * which we do NOT want to do.  (note: chain->refs is set
2521                  * to 1 by chain_alloc() for us, but lockcnt is not).
2522                  */
2523                 chain->lockcnt = 1;
2524                 hammer2_mtx_ex(&chain->lock);
2525                 allocated = 1;
2526                 ++curthread->td_tracker;
2527
2528                 /*
2529                  * Set INITIAL to optimize I/O.  The flag will generally be
2530                  * processed when we call hammer2_chain_modify().
2531                  *
2532                  * Recalculate bytes to reflect the actual media block
2533                  * allocation.
2534                  */
2535                 bytes = (hammer2_off_t)1 <<
2536                         (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
2537                 chain->bytes = bytes;
2538
2539                 switch(type) {
2540                 case HAMMER2_BREF_TYPE_VOLUME:
2541                 case HAMMER2_BREF_TYPE_FREEMAP:
2542                         panic("hammer2_chain_create: called with volume type");
2543                         break;
2544                 case HAMMER2_BREF_TYPE_INDIRECT:
2545                         panic("hammer2_chain_create: cannot be used to"
2546                               "create indirect block");
2547                         break;
2548                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2549                         panic("hammer2_chain_create: cannot be used to"
2550                               "create freemap root or node");
2551                         break;
2552                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2553                         KKASSERT(bytes == sizeof(chain->data->bmdata));
2554                         /* fall through */
2555                 case HAMMER2_BREF_TYPE_INODE:
2556                 case HAMMER2_BREF_TYPE_DATA:
2557                 default:
2558                         /*
2559                          * leave chain->data NULL, set INITIAL
2560                          */
2561                         KKASSERT(chain->data == NULL);
2562                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
2563                         break;
2564                 }
2565         } else {
2566                 /*
2567                  * We are reattaching a previously deleted chain, possibly
2568                  * under a new parent and possibly with a new key/keybits.
2569                  * The chain does not have to be in a modified state.  The
2570                  * UPDATE flag will be set later on in this routine.
2571                  *
2572                  * Do NOT mess with the current state of the INITIAL flag.
2573                  */
2574                 chain->bref.key = key;
2575                 chain->bref.keybits = keybits;
2576                 if (chain->flags & HAMMER2_CHAIN_DELETED)
2577                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
2578                 KKASSERT(chain->parent == NULL);
2579         }
2580         if (flags & HAMMER2_INSERT_PFSROOT)
2581                 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
2582         else
2583                 chain->bref.flags &= ~HAMMER2_BREF_FLAG_PFSROOT;
2584
2585         /*
2586          * Calculate how many entries we have in the blockref array and
2587          * determine if an indirect block is required.
2588          */
2589 again:
2590         if (--maxloops == 0)
2591                 panic("hammer2_chain_create: maxloops");
2592
2593         switch(parent->bref.type) {
2594         case HAMMER2_BREF_TYPE_INODE:
2595                 KKASSERT((parent->data->ipdata.meta.op_flags &
2596                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
2597                 KKASSERT(parent->data != NULL);
2598                 base = &parent->data->ipdata.u.blockset.blockref[0];
2599                 count = HAMMER2_SET_COUNT;
2600                 break;
2601         case HAMMER2_BREF_TYPE_INDIRECT:
2602         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2603                 if (parent->flags & HAMMER2_CHAIN_INITIAL)
2604                         base = NULL;
2605                 else
2606                         base = &parent->data->npdata[0];
2607                 count = parent->bytes / sizeof(hammer2_blockref_t);
2608                 break;
2609         case HAMMER2_BREF_TYPE_VOLUME:
2610                 KKASSERT(parent->data != NULL);
2611                 base = &parent->data->voldata.sroot_blockset.blockref[0];
2612                 count = HAMMER2_SET_COUNT;
2613                 break;
2614         case HAMMER2_BREF_TYPE_FREEMAP:
2615                 KKASSERT(parent->data != NULL);
2616                 base = &parent->data->blkset.blockref[0];
2617                 count = HAMMER2_SET_COUNT;
2618                 break;
2619         default:
2620                 panic("hammer2_chain_create: unrecognized blockref type: %d",
2621                       parent->bref.type);
2622                 base = NULL;
2623                 count = 0;
2624                 break;
2625         }
2626
2627         /*
2628          * Make sure we've counted the brefs
2629          */
2630         if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
2631                 hammer2_chain_countbrefs(parent, base, count);
2632
2633         KKASSERT(parent->core.live_count >= 0 &&
2634                  parent->core.live_count <= count);
2635
2636         /*
2637          * If no free blockref could be found we must create an indirect
2638          * block and move a number of blockrefs into it.  With the parent
2639          * locked we can safely lock each child in order to delete+duplicate
2640          * it without causing a deadlock.
2641          *
2642          * This may return the new indirect block or the old parent depending
2643          * on where the key falls.  NULL is returned on error.
2644          */
2645         if (parent->core.live_count == count) {
2646                 hammer2_chain_t *nparent;
2647
2648                 nparent = hammer2_chain_create_indirect(parent, key, keybits,
2649                                                         mtid, type, &error);
2650                 if (nparent == NULL) {
2651                         if (allocated)
2652                                 hammer2_chain_drop(chain);
2653                         chain = NULL;
2654                         goto done;
2655                 }
2656                 if (parent != nparent) {
2657                         hammer2_chain_unlock(parent);
2658                         hammer2_chain_drop(parent);
2659                         parent = *parentp = nparent;
2660                 }
2661                 goto again;
2662         }
2663
2664         /*
2665          * Link the chain into its parent.
2666          */
2667         if (chain->parent != NULL)
2668                 panic("hammer2: hammer2_chain_create: chain already connected");
2669         KKASSERT(chain->parent == NULL);
2670         hammer2_chain_insert(parent, chain,
2671                              HAMMER2_CHAIN_INSERT_SPIN |
2672                              HAMMER2_CHAIN_INSERT_LIVE,
2673                              0);
2674
2675         if (allocated) {
2676                 /*
2677                  * Mark the newly created chain modified.  This will cause
2678                  * UPDATE to be set and process the INITIAL flag.
2679                  *
2680                  * Device buffers are not instantiated for DATA elements
2681                  * as these are handled by logical buffers.
2682                  *
2683                  * Indirect and freemap node indirect blocks are handled
2684                  * by hammer2_chain_create_indirect() and not by this
2685                  * function.
2686                  *
2687                  * Data for all other bref types is expected to be
2688                  * instantiated (INODE, LEAF).
2689                  */
2690                 switch(chain->bref.type) {
2691                 case HAMMER2_BREF_TYPE_DATA:
2692                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2693                 case HAMMER2_BREF_TYPE_INODE:
2694                         hammer2_chain_modify(chain, mtid, dedup_off,
2695                                              HAMMER2_MODIFY_OPTDATA);
2696                         break;
2697                 default:
2698                         /*
2699                          * Remaining types are not supported by this function.
2700                          * In particular, INDIRECT and LEAF_NODE types are
2701                          * handled by create_indirect().
2702                          */
2703                         panic("hammer2_chain_create: bad type: %d",
2704                               chain->bref.type);
2705                         /* NOT REACHED */
2706                         break;
2707                 }
2708         } else {
2709                 /*
2710                  * When reconnecting a chain we must set UPDATE and
2711                  * setflush so the flush recognizes that it must update
2712                  * the bref in the parent.
2713                  */
2714                 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0)
2715                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
2716         }
2717
2718         /*
2719          * We must setflush(parent) to ensure that it recurses through to
2720          * chain.  setflush(chain) might not work because ONFLUSH is possibly
2721          * already set in the chain (so it won't recurse up to set it in the
2722          * parent).
2723          */
2724         hammer2_chain_setflush(parent);
2725
2726 done:
2727         *chainp = chain;
2728
2729         return (error);
2730 }
2731
2732 /*
2733  * Move the chain from its old parent to a new parent.  The chain must have
2734  * already been deleted or already disconnected (or never associated) with
2735  * a parent.  The chain is reassociated with the new parent and the deleted
2736  * flag will be cleared (no longer deleted).  The chain's modification state
2737  * is not altered.
2738  *
2739  * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION
2740  * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
2741  * FULL.  This typically means that the caller is creating the chain after
2742  * doing a hammer2_chain_lookup().
2743  *
2744  * A non-NULL bref is typically passed when key and keybits must be overridden.
2745  * Note that hammer2_cluster_duplicate() *ONLY* uses the key and keybits fields
2746  * from a passed-in bref and uses the old chain's bref for everything else.
2747  *
2748  * Neither (parent) or (chain) can be errored.
2749  *
2750  * If (parent) is non-NULL then the new duplicated chain is inserted under
2751  * the parent.
2752  *
2753  * If (parent) is NULL then the newly duplicated chain is not inserted
2754  * anywhere, similar to if it had just been chain_alloc()'d (suitable for
2755  * passing into hammer2_chain_create() after this function returns).
2756  *
2757  * WARNING! This function calls create which means it can insert indirect
2758  *          blocks.  This can cause other unrelated chains in the parent to
2759  *          be moved to a newly inserted indirect block in addition to the
2760  *          specific chain.
2761  */
2762 void
2763 hammer2_chain_rename(hammer2_blockref_t *bref,
2764                      hammer2_chain_t **parentp, hammer2_chain_t *chain,
2765                      hammer2_tid_t mtid, int flags)
2766 {
2767         hammer2_dev_t *hmp;
2768         hammer2_chain_t *parent;
2769         size_t bytes;
2770
2771         /*
2772          * WARNING!  We should never resolve DATA to device buffers
2773          *           (XXX allow it if the caller did?), and since
2774          *           we currently do not have the logical buffer cache
2775          *           buffer in-hand to fix its cached physical offset
2776          *           we also force the modify code to not COW it. XXX
2777          */
2778         hmp = chain->hmp;
2779         KKASSERT(chain->parent == NULL);
2780         KKASSERT(chain->error == 0);
2781
2782         /*
2783          * Now create a duplicate of the chain structure, associating
2784          * it with the same core, making it the same size, pointing it
2785          * to the same bref (the same media block).
2786          */
2787         if (bref == NULL)
2788                 bref = &chain->bref;
2789         bytes = (hammer2_off_t)1 <<
2790                 (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
2791
2792         /*
2793          * If parent is not NULL the duplicated chain will be entered under
2794          * the parent and the UPDATE bit set to tell flush to update
2795          * the blockref.
2796          *
2797          * We must setflush(parent) to ensure that it recurses through to
2798          * chain.  setflush(chain) might not work because ONFLUSH is possibly
2799          * already set in the chain (so it won't recurse up to set it in the
2800          * parent).
2801          *
2802          * Having both chains locked is extremely important for atomicy.
2803          */
2804         if (parentp && (parent = *parentp) != NULL) {
2805                 KKASSERT(hammer2_mtx_owned(&parent->lock));
2806                 KKASSERT(parent->refs > 0);
2807                 KKASSERT(parent->error == 0);
2808
2809                 hammer2_chain_create(parentp, &chain, chain->pmp,
2810                                      bref->key, bref->keybits, bref->type,
2811                                      chain->bytes, mtid, 0, flags);
2812                 KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
2813                 hammer2_chain_setflush(*parentp);
2814         }
2815 }
2816
2817 /*
2818  * Helper function for deleting chains.
2819  *
2820  * The chain is removed from the live view (the RBTREE) as well as the parent's
2821  * blockmap.  Both chain and its parent must be locked.
2822  *
2823  * parent may not be errored.  chain can be errored.
2824  */
2825 static void
2826 _hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain,
2827                              hammer2_tid_t mtid, int flags)
2828 {
2829         hammer2_dev_t *hmp;
2830
2831         KKASSERT((chain->flags & (HAMMER2_CHAIN_DELETED |
2832                                   HAMMER2_CHAIN_FICTITIOUS)) == 0);
2833         KKASSERT(chain->parent == parent);
2834         hmp = chain->hmp;
2835
2836         if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
2837                 /*
2838                  * Chain is blockmapped, so there must be a parent.
2839                  * Atomically remove the chain from the parent and remove
2840                  * the blockmap entry.  The parent must be set modified
2841                  * to remove the blockmap entry.
2842                  */
2843                 hammer2_blockref_t *base;
2844                 int count;
2845
2846                 KKASSERT(parent != NULL);
2847                 KKASSERT(parent->error == 0);
2848                 KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
2849                 hammer2_chain_modify(parent, mtid, 0, HAMMER2_MODIFY_OPTDATA);
2850
2851                 /*
2852                  * Calculate blockmap pointer
2853                  */
2854                 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
2855                 hammer2_spin_ex(&parent->core.spin);
2856
2857                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
2858                 atomic_add_int(&parent->core.live_count, -1);
2859                 ++parent->core.generation;
2860                 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
2861                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
2862                 --parent->core.chain_count;
2863                 chain->parent = NULL;
2864
2865                 switch(parent->bref.type) {
2866                 case HAMMER2_BREF_TYPE_INODE:
2867                         /*
2868                          * Access the inode's block array.  However, there
2869                          * is no block array if the inode is flagged
2870                          * DIRECTDATA.  The DIRECTDATA case typicaly only
2871                          * occurs when a hardlink has been shifted up the
2872                          * tree and the original inode gets replaced with
2873                          * an OBJTYPE_HARDLINK placeholding inode.
2874                          */
2875                         if (parent->data &&
2876                             (parent->data->ipdata.meta.op_flags &
2877                              HAMMER2_OPFLAG_DIRECTDATA) == 0) {
2878                                 base =
2879                                    &parent->data->ipdata.u.blockset.blockref[0];
2880                         } else {
2881                                 base = NULL;
2882                         }
2883                         count = HAMMER2_SET_COUNT;
2884                         break;
2885                 case HAMMER2_BREF_TYPE_INDIRECT:
2886                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2887                         if (parent->data)
2888                                 base = &parent->data->npdata[0];
2889                         else
2890                                 base = NULL;
2891                         count = parent->bytes / sizeof(hammer2_blockref_t);
2892                         break;
2893                 case HAMMER2_BREF_TYPE_VOLUME:
2894                         base = &parent->data->voldata.
2895                                         sroot_blockset.blockref[0];
2896                         count = HAMMER2_SET_COUNT;
2897                         break;
2898                 case HAMMER2_BREF_TYPE_FREEMAP:
2899                         base = &parent->data->blkset.blockref[0];
2900                         count = HAMMER2_SET_COUNT;
2901                         break;
2902                 default:
2903                         base = NULL;
2904                         count = 0;
2905                         panic("hammer2_flush_pass2: "
2906                               "unrecognized blockref type: %d",
2907                               parent->bref.type);
2908                 }
2909
2910                 /*
2911                  * delete blockmapped chain from its parent.
2912                  *
2913                  * The parent is not affected by any statistics in chain
2914                  * which are pending synchronization.  That is, there is
2915                  * nothing to undo in the parent since they have not yet
2916                  * been incorporated into the parent.
2917                  *
2918                  * The parent is affected by statistics stored in inodes.
2919                  * Those have already been synchronized, so they must be
2920                  * undone.  XXX split update possible w/delete in middle?
2921                  */
2922                 if (base) {
2923                         int cache_index = -1;
2924                         hammer2_base_delete(parent, base, count,
2925                                             &cache_index, chain);
2926                 }
2927                 hammer2_spin_unex(&parent->core.spin);
2928         } else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
2929                 /*
2930                  * Chain is not blockmapped but a parent is present.
2931                  * Atomically remove the chain from the parent.  There is
2932                  * no blockmap entry to remove.
2933                  *
2934                  * Because chain was associated with a parent but not
2935                  * synchronized, the chain's *_count_up fields contain
2936                  * inode adjustment statistics which must be undone.
2937                  */
2938                 hammer2_spin_ex(&parent->core.spin);
2939                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
2940                 atomic_add_int(&parent->core.live_count, -1);
2941                 ++parent->core.generation;
2942                 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
2943                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
2944                 --parent->core.chain_count;
2945                 chain->parent = NULL;
2946                 hammer2_spin_unex(&parent->core.spin);
2947         } else {
2948                 /*
2949                  * Chain is not blockmapped and has no parent.  This
2950                  * is a degenerate case.
2951                  */
2952                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
2953         }
2954 }
2955
2956 /*
2957  * Create an indirect block that covers one or more of the elements in the
2958  * current parent.  Either returns the existing parent with no locking or
2959  * ref changes or returns the new indirect block locked and referenced
2960  * and leaving the original parent lock/ref intact as well.
2961  *
2962  * If an error occurs, NULL is returned and *errorp is set to the error.
2963  *
2964  * The returned chain depends on where the specified key falls.
2965  *
2966  * The key/keybits for the indirect mode only needs to follow three rules:
2967  *
2968  * (1) That all elements underneath it fit within its key space and
2969  *
2970  * (2) That all elements outside it are outside its key space.
2971  *
2972  * (3) When creating the new indirect block any elements in the current
2973  *     parent that fit within the new indirect block's keyspace must be
2974  *     moved into the new indirect block.
2975  *
2976  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
2977  *     keyspace the the current parent, but lookup/iteration rules will
2978  *     ensure (and must ensure) that rule (2) for all parents leading up
2979  *     to the nearest inode or the root volume header is adhered to.  This
2980  *     is accomplished by always recursing through matching keyspaces in
2981  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
2982  *
2983  * The current implementation calculates the current worst-case keyspace by
2984  * iterating the current parent and then divides it into two halves, choosing
2985  * whichever half has the most elements (not necessarily the half containing
2986  * the requested key).
2987  *
2988  * We can also opt to use the half with the least number of elements.  This
2989  * causes lower-numbered keys (aka logical file offsets) to recurse through
2990  * fewer indirect blocks and higher-numbered keys to recurse through more.
2991  * This also has the risk of not moving enough elements to the new indirect
2992  * block and being forced to create several indirect blocks before the element
2993  * can be inserted.
2994  *
2995  * Must be called with an exclusively locked parent.
2996  */
2997 static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
2998                                 hammer2_key_t *keyp, int keybits,
2999                                 hammer2_blockref_t *base, int count);
3000 static int hammer2_chain_indkey_normal(hammer2_chain_t *parent,
3001                                 hammer2_key_t *keyp, int keybits,
3002                                 hammer2_blockref_t *base, int count);
3003 static
3004 hammer2_chain_t *
3005 hammer2_chain_create_indirect(hammer2_chain_t *parent,
3006                               hammer2_key_t create_key, int create_bits,
3007                               hammer2_tid_t mtid, int for_type, int *errorp)
3008 {
3009         hammer2_dev_t *hmp;
3010         hammer2_blockref_t *base;
3011         hammer2_blockref_t *bref;
3012         hammer2_blockref_t bcopy;
3013         hammer2_chain_t *chain;
3014         hammer2_chain_t *ichain;
3015         hammer2_chain_t dummy;
3016         hammer2_key_t key = create_key;
3017         hammer2_key_t key_beg;
3018         hammer2_key_t key_end;
3019         hammer2_key_t key_next;
3020         int keybits = create_bits;
3021         int count;
3022         int nbytes;
3023         int cache_index;
3024         int loops;
3025         int reason;
3026         int generation;
3027         int maxloops = 300000;
3028
3029         /*
3030          * Calculate the base blockref pointer or NULL if the chain
3031          * is known to be empty.  We need to calculate the array count
3032          * for RB lookups either way.
3033          */
3034         hmp = parent->hmp;
3035         *errorp = 0;
3036         KKASSERT(hammer2_mtx_owned(&parent->lock));
3037
3038         /*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/
3039         base = hammer2_chain_base_and_count(parent, &count);
3040
3041         /*
3042          * dummy used in later chain allocation (no longer used for lookups).
3043          */
3044         bzero(&dummy, sizeof(dummy));
3045
3046         /*
3047          * When creating an indirect block for a freemap node or leaf
3048          * the key/keybits must be fitted to static radix levels because
3049          * particular radix levels use particular reserved blocks in the
3050          * related zone.
3051          *
3052          * This routine calculates the key/radix of the indirect block
3053          * we need to create, and whether it is on the high-side or the
3054          * low-side.
3055          */
3056         if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
3057             for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
3058                 keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
3059                                                        base, count);
3060         } else {
3061                 keybits = hammer2_chain_indkey_normal(parent, &key, keybits,
3062                                                       base, count);
3063         }
3064
3065         /*
3066          * Normalize the key for the radix being represented, keeping the
3067          * high bits and throwing away the low bits.
3068          */
3069         key &= ~(((hammer2_key_t)1 << keybits) - 1);
3070
3071         /*
3072          * How big should our new indirect block be?  It has to be at least
3073          * as large as its parent.
3074          *
3075          * The freemap uses a specific indirect block size.
3076          *
3077          * The first indirect block level down from an inode typically
3078          * uses LBUFSIZE (16384), else it uses PBUFSIZE (65536).
3079          */
3080         if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
3081             for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
3082                 nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE;
3083         } else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
3084                 nbytes = HAMMER2_IND_BYTES_MIN;
3085         } else {
3086                 nbytes = HAMMER2_IND_BYTES_MAX;
3087         }
3088         if (nbytes < count * sizeof(hammer2_blockref_t)) {
3089                 KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
3090                          for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF);
3091                 nbytes = count * sizeof(hammer2_blockref_t);
3092         }
3093
3094         /*
3095          * Ok, create our new indirect block
3096          */
3097         if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
3098             for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
3099                 dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
3100         } else {
3101                 dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
3102         }
3103         dummy.bref.key = key;
3104         dummy.bref.keybits = keybits;
3105         dummy.bref.data_off = hammer2_getradix(nbytes);
3106         dummy.bref.methods = parent->bref.methods;
3107
3108         ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy.bref);
3109         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
3110         hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
3111         /* ichain has one ref at this point */
3112
3113         /*
3114          * We have to mark it modified to allocate its block, but use
3115          * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
3116          * it won't be acted upon by the flush code.
3117          */
3118         hammer2_chain_modify(ichain, mtid, 0, HAMMER2_MODIFY_OPTDATA);
3119
3120         /*
3121          * Iterate the original parent and move the matching brefs into
3122          * the new indirect block.
3123          *
3124          * XXX handle flushes.
3125          */
3126         key_beg = 0;
3127         key_end = HAMMER2_KEY_MAX;
3128         cache_index = 0;
3129         hammer2_spin_ex(&parent->core.spin);
3130         loops = 0;
3131         reason = 0;
3132
3133         for (;;) {
3134                 /*
3135                  * Parent may have been modified, relocating its block array.
3136                  * Reload the base pointer.
3137                  */
3138                 base = hammer2_chain_base_and_count(parent, &count);
3139
3140                 if (++loops > 100000) {
3141                     hammer2_spin_unex(&parent->core.spin);
3142                     panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
3143                           reason, parent, base, count, key_next);
3144                 }
3145
3146                 /*
3147                  * NOTE: spinlock stays intact, returned chain (if not NULL)
3148                  *       is not referenced or locked which means that we
3149                  *       cannot safely check its flagged / deletion status
3150                  *       until we lock it.
3151                  */
3152                 chain = hammer2_combined_find(parent, base, count,
3153                                               &cache_index, &key_next,
3154                                               key_beg, key_end,
3155                                               &bref);
3156                 generation = parent->core.generation;
3157                 if (bref == NULL)
3158                         break;
3159                 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
3160
3161                 /*
3162                  * Skip keys that are not within the key/radix of the new
3163                  * indirect block.  They stay in the parent.
3164                  */
3165                 if ((~(((hammer2_key_t)1 << keybits) - 1) &
3166                     (key ^ bref->key)) != 0) {
3167                         goto next_key_spinlocked;
3168                 }
3169
3170                 /*
3171                  * Load the new indirect block by acquiring the related
3172                  * chains (potentially from media as it might not be
3173                  * in-memory).  Then move it to the new parent (ichain)
3174                  * via DELETE-DUPLICATE.
3175                  *
3176                  * chain is referenced but not locked.  We must lock the
3177                  * chain to obtain definitive DUPLICATED/DELETED state
3178                  */
3179                 if (chain) {
3180                         /*
3181                          * Use chain already present in the RBTREE
3182                          */
3183                         hammer2_chain_ref(chain);
3184                         hammer2_spin_unex(&parent->core.spin);
3185                         hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
3186                 } else {
3187                         /*
3188                          * Get chain for blockref element.  _get returns NULL
3189                          * on insertion race.
3190                          */
3191                         bcopy = *bref;
3192                         hammer2_spin_unex(&parent->core.spin);
3193                         chain = hammer2_chain_get(parent, generation, &bcopy);
3194                         if (chain == NULL) {
3195                                 reason = 1;
3196                                 hammer2_spin_ex(&parent->core.spin);
3197                                 continue;
3198                         }
3199                         if (bcmp(&bcopy, bref, sizeof(bcopy))) {
3200                                 kprintf("REASON 2\n");
3201                                 reason = 2;
3202                                 hammer2_chain_drop(chain);
3203                                 hammer2_spin_ex(&parent->core.spin);
3204                                 continue;
3205                         }
3206                         hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
3207                 }
3208
3209                 /*
3210                  * This is always live so if the chain has been deleted
3211                  * we raced someone and we have to retry.
3212                  *
3213                  * NOTE: Lookups can race delete-duplicate because
3214                  *       delete-duplicate does not lock the parent's core
3215                  *       (they just use the spinlock on the core).  We must
3216                  *       check for races by comparing the DUPLICATED flag before
3217                  *       releasing the spinlock with the flag after locking the
3218                  *       chain.
3219                  *
3220                  *       (note reversed logic for this one)
3221                  */
3222                 if (chain->flags & HAMMER2_CHAIN_DELETED) {
3223                         hammer2_chain_unlock(chain);
3224                         hammer2_chain_drop(chain);
3225                         goto next_key;
3226                 }
3227
3228                 /*
3229                  * Shift the chain to the indirect block.
3230                  *
3231                  * WARNING! No reason for us to load chain data, pass NOSTATS
3232                  *          to prevent delete/insert from trying to access
3233                  *          inode stats (and thus asserting if there is no
3234                  *          chain->data loaded).
3235                  *
3236                  * WARNING! The (parent, chain) deletion may modify the parent
3237                  *          and invalidate the base pointer.
3238                  */
3239                 hammer2_chain_delete(parent, chain, mtid, 0);
3240                 hammer2_chain_rename(NULL, &ichain, chain, mtid, 0);
3241                 hammer2_chain_unlock(chain);
3242                 hammer2_chain_drop(chain);
3243                 KKASSERT(parent->refs > 0);
3244                 chain = NULL;
3245                 base = NULL;    /* safety */
3246 next_key:
3247                 hammer2_spin_ex(&parent->core.spin);
3248 next_key_spinlocked:
3249                 if (--maxloops == 0)
3250                         panic("hammer2_chain_create_indirect: maxloops");
3251                 reason = 4;
3252                 if (key_next == 0 || key_next > key_end)
3253                         break;
3254                 key_beg = key_next;
3255                 /* loop */
3256         }
3257         hammer2_spin_unex(&parent->core.spin);
3258
3259         /*
3260          * Insert the new indirect block into the parent now that we've
3261          * cleared out some entries in the parent.  We calculated a good
3262          * insertion index in the loop above (ichain->index).
3263          *
3264          * We don't have to set UPDATE here because we mark ichain
3265          * modified down below (so the normal modified -> flush -> set-moved
3266          * sequence applies).
3267          *
3268          * The insertion shouldn't race as this is a completely new block
3269          * and the parent is locked.
3270          */
3271         base = NULL;    /* safety, parent modify may change address */
3272         KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
3273         hammer2_chain_insert(parent, ichain,
3274                              HAMMER2_CHAIN_INSERT_SPIN |
3275                              HAMMER2_CHAIN_INSERT_LIVE,
3276                              0);
3277
3278         /*
3279          * Make sure flushes propogate after our manual insertion.
3280          */
3281         hammer2_chain_setflush(ichain);
3282         hammer2_chain_setflush(parent);
3283
3284         /*
3285          * Figure out what to return.
3286          */
3287         if (~(((hammer2_key_t)1 << keybits) - 1) &
3288                    (create_key ^ key)) {
3289                 /*
3290                  * Key being created is outside the key range,
3291                  * return the original parent.
3292                  */
3293                 hammer2_chain_unlock(ichain);
3294                 hammer2_chain_drop(ichain);
3295         } else {
3296                 /*
3297                  * Otherwise its in the range, return the new parent.
3298                  * (leave both the new and old parent locked).
3299                  */
3300                 parent = ichain;
3301         }
3302
3303         return(parent);
3304 }
3305
3306 /*
3307  * Calculate the keybits and highside/lowside of the freemap node the
3308  * caller is creating.
3309  *
3310  * This routine will specify the next higher-level freemap key/radix
3311  * representing the lowest-ordered set.  By doing so, eventually all
3312  * low-ordered sets will be moved one level down.
3313  *
3314  * We have to be careful here because the freemap reserves a limited
3315  * number of blocks for a limited number of levels.  So we can't just
3316  * push indiscriminately.
3317  */
3318 int
3319 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
3320                              int keybits, hammer2_blockref_t *base, int count)
3321 {
3322         hammer2_chain_t *chain;
3323         hammer2_blockref_t *bref;
3324         hammer2_key_t key;
3325         hammer2_key_t key_beg;
3326         hammer2_key_t key_end;
3327         hammer2_key_t key_next;
3328         int cache_index;
3329         int locount;
3330         int hicount;
3331         int maxloops = 300000;
3332
3333         key = *keyp;
3334         locount = 0;
3335         hicount = 0;
3336         keybits = 64;
3337
3338         /*
3339          * Calculate the range of keys in the array being careful to skip
3340          * slots which are overridden with a deletion.
3341          */
3342         key_beg = 0;
3343         key_end = HAMMER2_KEY_MAX;
3344         cache_index = 0;
3345         hammer2_spin_ex(&parent->core.spin);
3346
3347         for (;;) {
3348                 if (--maxloops == 0) {
3349                         panic("indkey_freemap shit %p %p:%d\n",
3350                               parent, base, count);
3351                 }
3352                 chain = hammer2_combined_find(parent, base, count,
3353                                               &cache_index, &key_next,
3354                                               key_beg, key_end,
3355                                               &bref);
3356
3357                 /*
3358                  * Exhausted search
3359                  */
3360                 if (bref == NULL)
3361                         break;
3362
3363                 /*
3364                  * Skip deleted chains.
3365                  */
3366                 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
3367                         if (key_next == 0 || key_next > key_end)
3368                                 break;
3369                         key_beg = key_next;
3370                         continue;
3371                 }
3372
3373                 /*
3374                  * Use the full live (not deleted) element for the scan
3375                  * iteration.  HAMMER2 does not allow partial replacements.
3376                  *
3377                  * XXX should be built into hammer2_combined_find().
3378                  */
3379                 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
3380
3381                 if (keybits > bref->keybits) {
3382                         key = bref->key;
3383                         keybits = bref->keybits;
3384                 } else if (keybits == bref->keybits && bref->key < key) {
3385                         key = bref->key;
3386                 }
3387                 if (key_next == 0)
3388                         break;
3389                 key_beg = key_next;
3390         }
3391         hammer2_spin_unex(&parent->core.spin);
3392
3393         /*
3394          * Return the keybits for a higher-level FREEMAP_NODE covering
3395          * this node.
3396          */
3397         switch(keybits) {
3398         case HAMMER2_FREEMAP_LEVEL0_RADIX:
3399                 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
3400                 break;
3401         case HAMMER2_FREEMAP_LEVEL1_RADIX:
3402                 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
3403                 break;
3404         case HAMMER2_FREEMAP_LEVEL2_RADIX:
3405                 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
3406                 break;
3407         case HAMMER2_FREEMAP_LEVEL3_RADIX:
3408                 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
3409                 break;
3410         case HAMMER2_FREEMAP_LEVEL4_RADIX:
3411                 keybits = HAMMER2_FREEMAP_LEVEL5_RADIX;
3412                 break;
3413         case HAMMER2_FREEMAP_LEVEL5_RADIX:
3414                 panic("hammer2_chain_indkey_freemap: level too high");
3415                 break;
3416         default:
3417                 panic("hammer2_chain_indkey_freemap: bad radix");
3418                 break;
3419         }
3420         *keyp = key;
3421
3422         return (keybits);
3423 }
3424
3425 /*
3426  * Calculate the keybits and highside/lowside of the indirect block the
3427  * caller is creating.
3428  */
3429 static int
3430 hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp,
3431                             int keybits, hammer2_blockref_t *base, int count)
3432 {
3433         hammer2_blockref_t *bref;
3434         hammer2_chain_t *chain;
3435         hammer2_key_t key_beg;
3436         hammer2_key_t key_end;
3437         hammer2_key_t key_next;
3438         hammer2_key_t key;
3439         int nkeybits;
3440         int locount;
3441         int hicount;
3442         int cache_index;
3443         int maxloops = 300000;
3444
3445         key = *keyp;
3446         locount = 0;
3447         hicount = 0;
3448
3449         /*
3450          * Calculate the range of keys in the array being careful to skip
3451          * slots which are overridden with a deletion.  Once the scan
3452          * completes we will cut the key range in half and shift half the
3453          * range into the new indirect block.
3454          */
3455         key_beg = 0;
3456         key_end = HAMMER2_KEY_MAX;
3457         cache_index = 0;
3458         hammer2_spin_ex(&parent->core.spin);
3459
3460         for (;;) {
3461                 if (--maxloops == 0) {
3462                         panic("indkey_freemap shit %p %p:%d\n",
3463                               parent, base, count);
3464                 }
3465                 chain = hammer2_combined_find(parent, base, count,
3466                                               &cache_index, &key_next,
3467                                               key_beg, key_end,
3468                                               &bref);
3469
3470                 /*
3471                  * Exhausted search
3472                  */
3473                 if (bref == NULL)
3474                         break;
3475
3476                 /*
3477                  * NOTE: No need to check DUPLICATED here because we do
3478                  *       not release the spinlock.
3479                  */
3480                 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
3481                         if (key_next == 0 || key_next > key_end)
3482                                 break;
3483                         key_beg = key_next;
3484                         continue;
3485                 }
3486
3487                 /*
3488                  * Use the full live (not deleted) element for the scan
3489                  * iteration.  HAMMER2 does not allow partial replacements.
3490                  *
3491                  * XXX should be built into hammer2_combined_find().
3492                  */
3493                 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
3494
3495                 /*
3496                  * Expand our calculated key range (key, keybits) to fit
3497                  * the scanned key.  nkeybits represents the full range
3498                  * that we will later cut in half (two halves @ nkeybits - 1).
3499                  */
3500                 nkeybits = keybits;
3501                 if (nkeybits < bref->keybits) {
3502                         if (bref->keybits > 64) {
3503                                 kprintf("bad bref chain %p bref %p\n",
3504                                         chain, bref);
3505                                 Debugger("fubar");
3506                         }
3507                         nkeybits = bref->keybits;
3508                 }
3509                 while (nkeybits < 64 &&
3510                        (~(((hammer2_key_t)1 << nkeybits) - 1) &
3511                         (key ^ bref->key)) != 0) {
3512                         ++nkeybits;
3513                 }
3514
3515                 /*
3516                  * If the new key range is larger we have to determine
3517                  * which side of the new key range the existing keys fall
3518                  * under by checking the high bit, then collapsing the
3519                  * locount into the hicount or vise-versa.
3520                  */
3521                 if (keybits != nkeybits) {
3522                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
3523                                 hicount += locount;
3524                                 locount = 0;
3525                         } else {
3526                                 locount += hicount;
3527                                 hicount = 0;
3528                         }
3529                         keybits = nkeybits;
3530                 }
3531
3532                 /*
3533                  * The newly scanned key will be in the lower half or the
3534                  * upper half of the (new) key range.
3535                  */
3536                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
3537                         ++hicount;
3538                 else
3539                         ++locount;
3540
3541                 if (key_next == 0)
3542                         break;
3543                 key_beg = key_next;
3544         }
3545         hammer2_spin_unex(&parent->core.spin);
3546         bref = NULL;    /* now invalid (safety) */
3547
3548         /*
3549          * Adjust keybits to represent half of the full range calculated
3550          * above (radix 63 max)
3551          */
3552         --keybits;
3553
3554         /*
3555          * Select whichever half contains the most elements.  Theoretically
3556          * we can select either side as long as it contains at least one
3557          * element (in order to ensure that a free slot is present to hold
3558          * the indirect block).
3559          */
3560         if (hammer2_indirect_optimize) {
3561                 /*
3562                  * Insert node for least number of keys, this will arrange
3563                  * the first few blocks of a large file or the first few
3564                  * inodes in a directory with fewer indirect blocks when
3565                  * created linearly.
3566                  */
3567                 if (hicount < locount && hicount != 0)
3568                         key |= (hammer2_key_t)1 << keybits;
3569                 else
3570                         key &= ~(hammer2_key_t)1 << keybits;
3571         } else {
3572                 /*
3573                  * Insert node for most number of keys, best for heavily
3574                  * fragmented files.
3575                  */
3576                 if (hicount > locount)
3577                         key |= (hammer2_key_t)1 << keybits;
3578                 else
3579                         key &= ~(hammer2_key_t)1 << keybits;
3580         }
3581         *keyp = key;
3582
3583         return (keybits);
3584 }
3585
3586 /*
3587  * Sets CHAIN_DELETED and remove the chain's blockref from the parent if
3588  * it exists.
3589  *
3590  * Both parent and chain must be locked exclusively.
3591  *
3592  * This function will modify the parent if the blockref requires removal
3593  * from the parent's block table.
3594  *
3595  * This function is NOT recursive.  Any entity already pushed into the
3596  * chain (such as an inode) may still need visibility into its contents,
3597  * as well as the ability to read and modify the contents.  For example,
3598  * for an unlinked file which is still open.
3599  */
3600 void
3601 hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain,
3602                      hammer2_tid_t mtid, int flags)
3603 {
3604         KKASSERT(hammer2_mtx_owned(&chain->lock));
3605
3606         /*
3607          * Nothing to do if already marked.
3608          *
3609          * We need the spinlock on the core whos RBTREE contains chain
3610          * to protect against races.
3611          */
3612         if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
3613                 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
3614                          chain->parent == parent);
3615                 _hammer2_chain_delete_helper(parent, chain, mtid, flags);
3616         }
3617
3618         /*
3619          * Permanent deletions mark the chain as destroyed.  H
3620          */
3621         if (flags & HAMMER2_DELETE_PERMANENT) {
3622                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
3623         } else {
3624                 /* XXX might not be needed */
3625                 hammer2_chain_setflush(chain);
3626         }
3627 }
3628
3629 /*
3630  * Returns the index of the nearest element in the blockref array >= elm.
3631  * Returns (count) if no element could be found.
3632  *
3633  * Sets *key_nextp to the next key for loop purposes but does not modify
3634  * it if the next key would be higher than the current value of *key_nextp.
3635  * Note that *key_nexp can overflow to 0, which should be tested by the
3636  * caller.
3637  *
3638  * (*cache_indexp) is a heuristic and can be any value without effecting
3639  * the result.
3640  *
3641  * WARNING!  Must be called with parent's spinlock held.  Spinlock remains
3642  *           held through the operation.
3643  */
3644 static int
3645 hammer2_base_find(hammer2_chain_t *parent,
3646                   hammer2_blockref_t *base, int count,
3647                   int *cache_indexp, hammer2_key_t *key_nextp,
3648                   hammer2_key_t key_beg, hammer2_key_t key_end)
3649 {
3650         hammer2_blockref_t *scan;
3651         hammer2_key_t scan_end;
3652         int i;
3653         int limit;
3654
3655         /*
3656          * Require the live chain's already have their core's counted
3657          * so we can optimize operations.
3658          */
3659         KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS);
3660
3661         /*
3662          * Degenerate case
3663          */
3664         if (count == 0 || base == NULL)
3665                 return(count);
3666
3667         /*
3668          * Sequential optimization using *cache_indexp.  This is the most
3669          * likely scenario.
3670          *
3671          * We can avoid trailing empty entries on live chains, otherwise
3672          * we might have to check the whole block array.
3673          */
3674         i = *cache_indexp;
3675         cpu_ccfence();
3676         limit = parent->core.live_zero;
3677         if (i >= limit)
3678                 i = limit - 1;
3679         if (i < 0)
3680                 i = 0;
3681         KKASSERT(i < count);
3682
3683         /*
3684          * Search backwards
3685          */
3686         scan = &base[i];
3687         while (i > 0 && (scan->type == 0 || scan->key > key_beg)) {
3688                 --scan;
3689                 --i;
3690         }
3691         *cache_indexp = i;
3692
3693         /*
3694          * Search forwards, stop when we find a scan element which
3695          * encloses the key or until we know that there are no further
3696          * elements.
3697          */
3698         while (i < count) {
3699                 if (scan->type != 0) {
3700                         scan_end = scan->key +
3701                                    ((hammer2_key_t)1 << scan->keybits) - 1;
3702                         if (scan->key > key_beg || scan_end >= key_beg)
3703                                 break;
3704                 }
3705                 if (i >= limit)
3706                         return (count);
3707                 ++scan;
3708                 ++i;
3709         }
3710         if (i != count) {
3711                 *cache_indexp = i;
3712                 if (i >= limit) {
3713                         i = count;
3714                 } else {
3715                         scan_end = scan->key +
3716                                    ((hammer2_key_t)1 << scan->keybits);
3717                         if (scan_end && (*key_nextp > scan_end ||
3718                                          *key_nextp == 0)) {
3719                                 *key_nextp = scan_end;
3720                         }
3721                 }
3722         }
3723         return (i);
3724 }
3725
3726 /*
3727  * Do a combined search and return the next match either from the blockref
3728  * array or from the in-memory chain.  Sets *bresp to the returned bref in
3729  * both cases, or sets it to NULL if the search exhausted.  Only returns
3730  * a non-NULL chain if the search matched from the in-memory chain.
3731  *
3732  * When no in-memory chain has been found and a non-NULL bref is returned
3733  * in *bresp.
3734  *
3735  *
3736  * The returned chain is not locked or referenced.  Use the returned bref
3737  * to determine if the search exhausted or not.  Iterate if the base find
3738  * is chosen but matches a deleted chain.
3739  *
3740  * WARNING!  Must be called with parent's spinlock held.  Spinlock remains
3741  *           held through the operation.
3742  */
3743 static hammer2_chain_t *
3744 hammer2_combined_find(hammer2_chain_t *parent,
3745                       hammer2_blockref_t *base, int count,
3746                       int *cache_indexp, hammer2_key_t *key_nextp,
3747                       hammer2_key_t key_beg, hammer2_key_t key_end,
3748                       hammer2_blockref_t **bresp)
3749 {
3750         hammer2_blockref_t *bref;
3751         hammer2_chain_t *chain;
3752         int i;
3753
3754         /*
3755          * Lookup in block array and in rbtree.
3756          */
3757         *key_nextp = key_end + 1;
3758         i = hammer2_base_find(parent, base, count, cache_indexp,
3759                               key_nextp, key_beg, key_end);
3760         chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);
3761
3762         /*
3763          * Neither matched
3764          */
3765         if (i == count && chain == NULL) {
3766                 *bresp = NULL;
3767                 return(NULL);
3768         }
3769
3770         /*
3771          * Only chain matched.
3772          */
3773         if (i == count) {
3774                 bref = &chain->bref;
3775                 goto found;
3776         }
3777
3778         /*
3779          * Only blockref matched.
3780          */
3781         if (chain == NULL) {
3782                 bref = &base[i];
3783                 goto found;
3784         }
3785
3786         /*
3787          * Both in-memory and blockref matched, select the nearer element.
3788          *
3789          * If both are flush with the left-hand side or both are the
3790          * same distance away, select the chain.  In this situation the
3791          * chain must have been loaded from the matching blockmap.
3792          */
3793         if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
3794             chain->bref.key == base[i].key) {
3795                 KKASSERT(chain->bref.key == base[i].key);
3796                 bref = &chain->bref;
3797                 goto found;
3798         }
3799
3800         /*
3801          * Select the nearer key
3802          */
3803         if (chain->bref.key < base[i].key) {
3804                 bref = &chain->bref;
3805         } else {
3806                 bref = &base[i];
3807                 chain = NULL;
3808         }
3809
3810         /*
3811          * If the bref is out of bounds we've exhausted our search.
3812          */
3813 found:
3814         if (bref->key > key_end) {
3815                 *bresp = NULL;
3816                 chain = NULL;
3817         } else {
3818                 *bresp = bref;
3819         }
3820         return(chain);
3821 }
3822
3823 /*
3824  * Locate the specified block array element and delete it.  The element
3825  * must exist.
3826  *
3827  * The spin lock on the related chain must be held.
3828  *
3829  * NOTE: live_count was adjusted when the chain was deleted, so it does not
3830  *       need to be adjusted when we commit the media change.
3831  */
3832 void
3833 hammer2_base_delete(hammer2_chain_t *parent,
3834                     hammer2_blockref_t *base, int count,
3835                     int *cache_indexp, hammer2_chain_t *chain)
3836 {
3837         hammer2_blockref_t *elm = &chain->bref;
3838         hammer2_key_t key_next;
3839         int i;
3840
3841         /*
3842          * Delete element.  Expect the element to exist.
3843          *
3844          * XXX see caller, flush code not yet sophisticated enough to prevent
3845          *     re-flushed in some cases.
3846          */
3847         key_next = 0; /* max range */
3848         i = hammer2_base_find(parent, base, count, cache_indexp,
3849                               &key_next, elm->key, elm->key);
3850         if (i == count || base[i].type == 0 ||
3851             base[i].key != elm->key ||
3852             ((chain->flags & HAMMER2_CHAIN_BMAPUPD) == 0 &&
3853              base[i].keybits != elm->keybits)) {
3854                 hammer2_spin_unex(&parent->core.spin);
3855                 panic("delete base %p element not found at %d/%d elm %p\n",
3856                       base, i, count, elm);
3857                 return;
3858         }
3859
3860         /*
3861          * Update stats and zero the entry
3862          */
3863         parent->bref.data_count -= base[i].data_count;
3864         parent->bref.data_count -= (hammer2_off_t)1 <<
3865                         (int)(base[i].data_off & HAMMER2_OFF_MASK_RADIX);
3866         parent->bref.inode_count -= base[i].inode_count;
3867         if (base[i].type == HAMMER2_BREF_TYPE_INODE)
3868                 parent->bref.inode_count -= 1;
3869
3870         bzero(&base[i], sizeof(*base));
3871
3872         /*
3873          * We can only optimize parent->core.live_zero for live chains.
3874          */
3875         if (parent->core.live_zero == i + 1) {
3876                 while (--i >= 0 && base[i].type == 0)
3877                         ;
3878                 parent->core.live_zero = i + 1;
3879         }
3880
3881         /*
3882          * Clear appropriate blockmap flags in chain.
3883          */
3884         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED |
3885                                         HAMMER2_CHAIN_BMAPUPD);
3886 }
3887
3888 /*
3889  * Insert the specified element.  The block array must not already have the
3890  * element and must have space available for the insertion.
3891  *
3892  * The spin lock on the related chain must be held.
3893  *
3894  * NOTE: live_count was adjusted when the chain was deleted, so it does not
3895  *       need to be adjusted when we commit the media change.
3896  */
3897 void
3898 hammer2_base_insert(hammer2_chain_t *parent,
3899                     hammer2_blockref_t *base, int count,
3900                     int *cache_indexp, hammer2_chain_t *chain)
3901 {
3902         hammer2_blockref_t *elm = &chain->bref;
3903         hammer2_key_t key_next;
3904         hammer2_key_t xkey;
3905         int i;
3906         int j;
3907         int k;
3908         int l;
3909         int u = 1;
3910
3911         /*
3912          * Insert new element.  Expect the element to not already exist
3913          * unless we are replacing it.
3914          *
3915          * XXX see caller, flush code not yet sophisticated enough to prevent
3916          *     re-flushed in some cases.
3917          */
3918         key_next = 0; /* max range */
3919         i = hammer2_base_find(parent, base, count, cache_indexp,
3920                               &key_next, elm->key, elm->key);
3921
3922         /*
3923          * Shortcut fill optimization, typical ordered insertion(s) may not
3924          * require a search.
3925          */
3926         KKASSERT(i >= 0 && i <= count);
3927
3928         /*
3929          * Set appropriate blockmap flags in chain.
3930          */
3931         atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);
3932
3933         /*
3934          * Update stats and zero the entry
3935          */
3936         parent->bref.data_count += elm->data_count;
3937         parent->bref.data_count += (hammer2_off_t)1 <<
3938                         (int)(elm->data_off & HAMMER2_OFF_MASK_RADIX);
3939         parent->bref.inode_count += elm->inode_count;
3940         if (elm->type == HAMMER2_BREF_TYPE_INODE)
3941                 parent->bref.inode_count += 1;
3942
3943
3944         /*
3945          * We can only optimize parent->core.live_zero for live chains.
3946          */
3947         if (i == count && parent->core.live_zero < count) {
3948                 i = parent->core.live_zero++;
3949                 base[i] = *elm;
3950                 return;
3951         }
3952
3953         xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
3954         if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
3955                 hammer2_spin_unex(&parent->core.spin);
3956                 panic("insert base %p overlapping elements at %d elm %p\n",
3957                       base, i, elm);
3958         }
3959
3960         /*
3961          * Try to find an empty slot before or after.
3962          */
3963         j = i;
3964         k = i;
3965         while (j > 0 || k < count) {
3966                 --j;
3967                 if (j >= 0 && base[j].type == 0) {
3968                         if (j == i - 1) {
3969                                 base[j] = *elm;
3970                         } else {
3971                                 bcopy(&base[j+1], &base[j],
3972                                       (i - j - 1) * sizeof(*base));
3973                                 base[i - 1] = *elm;
3974                         }
3975                         goto validate;
3976                 }
3977                 ++k;
3978                 if (k < count && base[k].type == 0) {
3979                         bcopy(&base[i], &base[i+1],
3980                               (k - i) * sizeof(hammer2_blockref_t));
3981                         base[i] = *elm;
3982
3983                         /*
3984                          * We can only update parent->core.live_zero for live
3985                          * chains.
3986                          */
3987                         if (parent->core.live_zero <= k)
3988                                 parent->core.live_zero = k + 1;
3989                         u = 2;
3990                         goto validate;
3991                 }
3992         }
3993         panic("hammer2_base_insert: no room!");
3994
3995         /*
3996          * Debugging
3997          */
3998 validate:
3999         key_next = 0;
4000         for (l = 0; l < count; ++l) {
4001                 if (base[l].type) {
4002                         key_next = base[l].key +
4003                                    ((hammer2_key_t)1 << base[l].keybits) - 1;
4004                         break;
4005                 }
4006         }
4007         while (++l < count) {
4008                 if (base[l].type) {
4009                         if (base[l].key <= key_next)
4010                                 panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l);
4011                         key_next = base[l].key +
4012                                    ((hammer2_key_t)1 << base[l].keybits) - 1;
4013
4014                 }
4015         }
4016
4017 }
4018
4019 #if 0
4020
4021 /*
4022  * Sort the blockref array for the chain.  Used by the flush code to
4023  * sort the blockref[] array.
4024  *
4025  * The chain must be exclusively locked AND spin-locked.
4026  */
4027 typedef hammer2_blockref_t *hammer2_blockref_p;
4028
4029 static
4030 int
4031 hammer2_base_sort_callback(const void *v1, const void *v2)
4032 {
4033         hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1;
4034         hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2;
4035
4036         /*
4037          * Make sure empty elements are placed at the end of the array
4038          */
4039         if (bref1->type == 0) {
4040                 if (bref2->type == 0)
4041                         return(0);
4042                 return(1);
4043         } else if (bref2->type == 0) {
4044                 return(-1);
4045         }
4046
4047         /*
4048          * Sort by key
4049          */
4050         if (bref1->key < bref2->key)
4051                 return(-1);
4052         if (bref1->key > bref2->key)
4053                 return(1);
4054         return(0);
4055 }
4056
4057 void
4058 hammer2_base_sort(hammer2_chain_t *chain)
4059 {
4060         hammer2_blockref_t *base;
4061         int count;
4062
4063         switch(chain->bref.type) {
4064         case HAMMER2_BREF_TYPE_INODE:
4065                 /*
4066                  * Special shortcut for embedded data returns the inode
4067                  * itself.  Callers must detect this condition and access
4068                  * the embedded data (the strategy code does this for us).
4069                  *
4070                  * This is only applicable to regular files and softlinks.
4071                  */
4072                 if (chain->data->ipdata.meta.op_flags &
4073                     HAMMER2_OPFLAG_DIRECTDATA) {
4074                         return;
4075                 }
4076                 base = &chain->data->ipdata.u.blockset.blockref[0];
4077                 count = HAMMER2_SET_COUNT;
4078                 break;
4079         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
4080         case HAMMER2_BREF_TYPE_INDIRECT:
4081                 /*
4082                  * Optimize indirect blocks in the INITIAL state to avoid
4083                  * I/O.
4084                  */
4085                 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0);
4086                 base = &chain->data->npdata[0];
4087                 count = chain->bytes / sizeof(hammer2_blockref_t);
4088                 break;
4089         case HAMMER2_BREF_TYPE_VOLUME:
4090                 base = &chain->data->voldata.sroot_blockset.blockref[0];
4091                 count = HAMMER2_SET_COUNT;
4092                 break;
4093         case HAMMER2_BREF_TYPE_FREEMAP:
4094                 base = &chain->data->blkset.blockref[0];
4095                 count = HAMMER2_SET_COUNT;
4096                 break;
4097         default:
4098                 kprintf("hammer2_chain_lookup: unrecognized "
4099                         "blockref(A) type: %d",
4100                         chain->bref.type);
4101                 while (1)
4102                         tsleep(&base, 0, "dead", 0);
4103                 panic("hammer2_chain_lookup: unrecognized "
4104                       "blockref(A) type: %d",
4105                       chain->bref.type);
4106                 base = NULL;    /* safety */
4107                 count = 0;      /* safety */
4108         }
4109         kqsort(base, count, sizeof(*base), hammer2_base_sort_callback);
4110 }
4111
4112 #endif
4113
4114 /*
4115  * Chain memory management
4116  */
4117 void
4118 hammer2_chain_wait(hammer2_chain_t *chain)
4119 {
4120         tsleep(chain, 0, "chnflw", 1);
4121 }
4122
4123 const hammer2_media_data_t *
4124 hammer2_chain_rdata(hammer2_chain_t *chain)
4125 {
4126         KKASSERT(chain->data != NULL);
4127         return (chain->data);
4128 }
4129
4130 hammer2_media_data_t *
4131 hammer2_chain_wdata(hammer2_chain_t *chain)
4132 {
4133         KKASSERT(chain->data != NULL);
4134         return (chain->data);
4135 }
4136
4137 /*
4138  * Set the check data for a chain.  This can be a heavy-weight operation
4139  * and typically only runs on-flush.  For file data check data is calculated
4140  * when the logical buffers are flushed.
4141  */
4142 void
4143 hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata)
4144 {
4145         chain->bref.flags &= ~HAMMER2_BREF_FLAG_ZERO;
4146
4147         switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
4148         case HAMMER2_CHECK_NONE:
4149                 break;
4150         case HAMMER2_CHECK_DISABLED:
4151                 break;
4152         case HAMMER2_CHECK_ISCSI32:
4153                 chain->bref.check.iscsi32.value =
4154                         hammer2_icrc32(bdata, chain->bytes);
4155                 break;
4156         case HAMMER2_CHECK_CRC64:
4157                 chain->bref.check.crc64.value = 0;
4158                 /* XXX */
4159                 break;
4160         case HAMMER2_CHECK_SHA192:
4161                 {
4162                         SHA256_CTX hash_ctx;
4163                         union {
4164                                 uint8_t digest[SHA256_DIGEST_LENGTH];
4165                                 uint64_t digest64[SHA256_DIGEST_LENGTH/8];
4166                         } u;
4167
4168                         SHA256_Init(&hash_ctx);
4169                         SHA256_Update(&hash_ctx, bdata, chain->bytes);
4170                         SHA256_Final(u.digest, &hash_ctx);
4171                         u.digest64[2] ^= u.digest64[3];
4172                         bcopy(u.digest,
4173                               chain->bref.check.sha192.data,
4174                               sizeof(chain->bref.check.sha192.data));
4175                 }
4176                 break;
4177         case HAMMER2_CHECK_FREEMAP:
4178                 chain->bref.check.freemap.icrc32 =
4179                         hammer2_icrc32(bdata, chain->bytes);
4180                 break;
4181         default:
4182                 kprintf("hammer2_chain_setcheck: unknown check type %02x\n",
4183                         chain->bref.methods);
4184                 break;
4185         }
4186 }
4187
4188 int
4189 hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata)
4190 {
4191         int r;
4192
4193         if (chain->bref.flags & HAMMER2_BREF_FLAG_ZERO)
4194                 return 1;
4195
4196         switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
4197         case HAMMER2_CHECK_NONE:
4198                 r = 1;
4199                 break;
4200         case HAMMER2_CHECK_DISABLED:
4201                 r = 1;
4202                 break;
4203         case HAMMER2_CHECK_ISCSI32:
4204                 r = (chain->bref.check.iscsi32.value ==
4205                      hammer2_icrc32(bdata, chain->bytes));
4206                 break;
4207         case HAMMER2_CHECK_CRC64:
4208                 r = (chain->bref.check.crc64.value == 0);
4209                 /* XXX */
4210                 break;
4211         case HAMMER2_CHECK_SHA192:
4212                 {
4213                         SHA256_CTX hash_ctx;
4214                         union {
4215                                 uint8_t digest[SHA256_DIGEST_LENGTH];
4216                                 uint64_t digest64[SHA256_DIGEST_LENGTH/8];
4217                         } u;
4218
4219                         SHA256_Init(&hash_ctx);
4220                         SHA256_Update(&hash_ctx, bdata, chain->bytes);
4221                         SHA256_Final(u.digest, &hash_ctx);
4222                         u.digest64[2] ^= u.digest64[3];
4223                         if (bcmp(u.digest,
4224                                  chain->bref.check.sha192.data,
4225                                  sizeof(chain->bref.check.sha192.data)) == 0) {
4226                                 r = 1;
4227                         } else {
4228                                 r = 0;
4229                         }
4230                 }
4231                 break;
4232         case HAMMER2_CHECK_FREEMAP:
4233                 r = (chain->bref.check.freemap.icrc32 ==
4234                      hammer2_icrc32(bdata, chain->bytes));
4235                 if (r == 0) {
4236                         kprintf("freemap.icrc %08x icrc32 %08x (%d)\n",
4237                                 chain->bref.check.freemap.icrc32,
4238                                 hammer2_icrc32(bdata, chain->bytes), chain->bytes);
4239                         if (chain->dio)
4240                                 kprintf("dio %p buf %016jx,%d bdata %p/%p\n",
4241                                         chain->dio, chain->dio->bp->b_loffset, chain->dio->bp->b_bufsize, bdata, chain->dio->bp->b_data);
4242                 }
4243
4244                 break;
4245         default:
4246                 kprintf("hammer2_chain_setcheck: unknown check type %02x\n",
4247                         chain->bref.methods);
4248                 r = 1;
4249                 break;
4250         }
4251         return r;
4252 }
4253
4254 /*
4255  * The caller presents a shared-locked (parent, chain) where the chain
4256  * is of type HAMMER2_OBJTYPE_HARDLINK.  The caller must hold the ip
4257  * structure representing the inode locked to prevent
4258  * consolidation/deconsolidation races.
4259  *
4260  * The flags passed in are LOOKUP flags, not RESOLVE flags.  Only
4261  * HAMMER2_LOOKUP_SHARED is supported.
4262  *
4263  * We locate the hardlink in the current or a common parent directory.
4264  *
4265  * If we are unable to locate the hardlink, EIO is returned and
4266  * (*chainp) is unlocked and dropped.
4267  */
4268 int
4269 hammer2_chain_hardlink_find(hammer2_inode_t *dip,
4270                         hammer2_chain_t **parentp,
4271                         hammer2_chain_t **chainp,
4272                         int flags)
4273 {
4274         hammer2_chain_t *parent;
4275         hammer2_chain_t *rchain;
4276         hammer2_key_t key_dummy;
4277         hammer2_key_t lhc;
4278         int cache_index = -1;
4279         int resolve_flags;
4280
4281         resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ?
4282                         HAMMER2_RESOLVE_SHARED : 0;
4283
4284         /*
4285          * Obtain the key for the hardlink from *chainp.
4286          */
4287         rchain = *chainp;
4288         lhc = rchain->data->ipdata.meta.inum;
4289         hammer2_chain_unlock(rchain);
4290         hammer2_chain_drop(rchain);
4291         rchain = NULL;
4292
4293         for (;;) {
4294                 int again;
4295
4296                 rchain = hammer2_chain_lookup(parentp, &key_dummy,
4297                                               lhc, lhc,
4298                                               &cache_index, flags);
4299                 if (rchain)
4300                         break;
4301
4302                 /*
4303                  * Iterate parents, handle parent rename races by retrying
4304                  * the operation.
4305                  */
4306                 again = 0;
4307                 for (;;) {
4308                         parent = *parentp;
4309                         if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
4310                                 if (again == 1)
4311                                         break;
4312                                 ++again;
4313                         }
4314                         if (parent->bref.flags & HAMMER2_BREF_FLAG_PFSROOT)
4315                                 goto done;
4316                         for (;;) {
4317                                 if (parent->parent == NULL)
4318                                         goto done;
4319                                 parent = parent->parent;
4320                                 hammer2_chain_ref(parent);
4321                                 hammer2_chain_unlock(*parentp);
4322                                 hammer2_chain_lock(parent,
4323                                                    HAMMER2_RESOLVE_ALWAYS |
4324                                                    resolve_flags);
4325                                 if ((*parentp)->parent == parent) {
4326                                         hammer2_chain_drop(*parentp);
4327                                         *parentp = parent;
4328                                         break;
4329                                 }
4330                                 hammer2_chain_unlock(parent);
4331                                 hammer2_chain_drop(parent);
4332                                 hammer2_chain_lock(*parentp,
4333                                                    HAMMER2_RESOLVE_ALWAYS |
4334                                                    resolve_flags);
4335                                 parent = *parentp;
4336                         }
4337                 }
4338         }
4339 done:
4340
4341         *chainp = rchain;
4342         return (rchain ? 0 : EINVAL);
4343 }
4344
4345 /*
4346  * Used by the bulkscan code to snapshot the synchronized storage for
4347  * a volume, allowing it to be scanned concurrently against normal
4348  * operation.
4349  */
4350 hammer2_chain_t *
4351 hammer2_chain_bulksnap(hammer2_chain_t *chain)
4352 {
4353         hammer2_chain_t *copy;
4354
4355         copy = hammer2_chain_alloc(chain->hmp, chain->pmp, &chain->bref);
4356         switch(chain->bref.type) {
4357         case HAMMER2_BREF_TYPE_VOLUME:
4358                 copy->data = kmalloc(sizeof(copy->data->voldata),
4359                                      chain->hmp->mchain,
4360                                      M_WAITOK | M_ZERO);
4361                 hammer2_spin_ex(&chain->core.spin);
4362                 copy->data->voldata = chain->data->voldata;
4363                 hammer2_spin_unex(&chain->core.spin);
4364                 break;
4365         case HAMMER2_BREF_TYPE_FREEMAP:
4366                 copy->data = kmalloc(sizeof(hammer2_blockset_t),
4367                                      chain->hmp->mchain,
4368                                      M_WAITOK | M_ZERO);
4369                 hammer2_spin_ex(&chain->core.spin);
4370                 copy->data->blkset = chain->data->blkset;
4371                 hammer2_spin_unex(&chain->core.spin);
4372                 break;
4373         default:
4374                 break;
4375         }
4376         return copy;
4377 }
4378
4379 void
4380 hammer2_chain_bulkdrop(hammer2_chain_t *copy)
4381 {
4382         switch(copy->bref.type) {
4383         case HAMMER2_BREF_TYPE_VOLUME:
4384         case HAMMER2_BREF_TYPE_FREEMAP:
4385                 KKASSERT(copy->data);
4386                 kfree(copy->data, copy->hmp->mchain);
4387                 copy->data = NULL;
4388         default:
4389                 break;
4390         }
4391         hammer2_chain_drop(copy);
4392 }
4393
4394 /*
4395  * Create a snapshot of the specified {parent, ochain} with the specified
4396  * label.  The originating hammer2_inode must be exclusively locked for
4397  * safety.
4398  *
4399  * The ioctl code has already synced the filesystem.
4400  */
4401 int
4402 hammer2_chain_snapshot(hammer2_chain_t *chain, hammer2_ioc_pfs_t *pmp,
4403                        hammer2_tid_t mtid)
4404 {
4405         hammer2_dev_t *hmp;
4406         const hammer2_inode_data_t *ripdata;
4407         hammer2_inode_data_t *wipdata;
4408         hammer2_chain_t *nchain;
4409         hammer2_inode_t *nip;
4410         size_t name_len;
4411         hammer2_key_t lhc;
4412         struct vattr vat;
4413 #if 0
4414         uuid_t opfs_clid;
4415 #endif
4416         int error;
4417
4418         kprintf("snapshot %s\n", pmp->name);
4419
4420         name_len = strlen(pmp->name);
4421         lhc = hammer2_dirhash(pmp->name, name_len);
4422
4423         /*
4424          * Get the clid
4425          */
4426         ripdata = &chain->data->ipdata;
4427 #if 0
4428         opfs_clid = ripdata->meta.pfs_clid;
4429 #endif
4430         hmp = chain->hmp;
4431
4432         /*
4433          * Create the snapshot directory under the super-root
4434          *
4435          * Set PFS type, generate a unique filesystem id, and generate
4436          * a cluster id.  Use the same clid when snapshotting a PFS root,
4437          * which theoretically allows the snapshot to be used as part of
4438          * the same cluster (perhaps as a cache).
4439          *
4440          * Copy the (flushed) blockref array.  Theoretically we could use
4441          * chain_duplicate() but it becomes difficult to disentangle
4442          * the shared core so for now just brute-force it.
4443          */
4444         VATTR_NULL(&vat);
4445         vat.va_type = VDIR;
4446         vat.va_mode = 0755;
4447         nip = hammer2_inode_create(hmp->spmp->iroot, &vat, proc0.p_ucred,
4448                                    pmp->name, name_len, 0,
4449                                    1, 0, 0,
4450                                    HAMMER2_INSERT_PFSROOT, &error);
4451
4452         if (nip) {
4453                 hammer2_inode_modify(nip);
4454                 nchain = hammer2_inode_chain(nip, 0, HAMMER2_RESOLVE_ALWAYS);
4455                 hammer2_chain_modify(nchain, mtid, 0, 0);
4456                 wipdata = &nchain->data->ipdata;
4457
4458                 nip->meta.pfs_type = HAMMER2_PFSTYPE_MASTER;
4459                 nip->meta.pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
4460                 nip->meta.op_flags |= HAMMER2_OPFLAG_PFSROOT;
4461                 kern_uuidgen(&nip->meta.pfs_fsid, 1);
4462
4463                 /*
4464                  * Give the snapshot its own private cluster id.  As a
4465                  * snapshot no further synchronization with the original
4466                  * cluster will be done.
4467                  */
4468 #if 0
4469                 if (chain->flags & HAMMER2_CHAIN_PFSBOUNDARY)
4470                         nip->meta.pfs_clid = opfs_clid;
4471                 else
4472                         kern_uuidgen(&nip->meta.pfs_clid, 1);
4473 #endif
4474                 kern_uuidgen(&nip->meta.pfs_clid, 1);
4475                 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
4476
4477                 /* XXX hack blockset copy */
4478                 /* XXX doesn't work with real cluster */
4479                 wipdata->meta = nip->meta;
4480                 wipdata->u.blockset = ripdata->u.blockset;
4481                 hammer2_flush(nchain, 1);
4482                 hammer2_chain_unlock(nchain);
4483                 hammer2_chain_drop(nchain);
4484                 hammer2_inode_unlock(nip);
4485         }
4486         return (error);
4487 }