hammer2 - Rewrite internal chain algorithms
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem implements most of the core support functions for
37  * the hammer2_chain structure.
38  *
39  * Chains are the in-memory version of media objects (volume header, inodes,
40  * indirect blocks, data blocks, etc).  Chains represent a portion of the
41  * HAMMER2 topology.
42  *
43  * A chain is topologically stable once it has been inserted into the
44  * in-memory topology.  Modifications which copy, move, or resize the chain
45  * are handled via the DELETE-DUPLICATE mechanic where the original chain
46  * stays intact but is marked deleted and a new chain is allocated which
47  * shares the old chain's children.
48  *
49  * This sharing is handled via the hammer2_chain_core structure.
50  *
51  * The DELETE-DUPLICATE mechanism allows the same topological level to contain
52  * many overloadings.  However, our RBTREE mechanics require that there be
53  * no overlaps so we accomplish the overloading by moving conflicting chains
54  * with smaller or equal radii into a sub-RBTREE under the chain being
55  * overloaded.
56  *
57  * DELETE-DUPLICATE is also used when a modification to a chain crosses a
58  * flush synchronization boundary, allowing the flush code to continue flushing
59  * the older version of the topology and not be disrupted by new frontend
60  * operations.
61  */
62 #include <sys/cdefs.h>
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/types.h>
66 #include <sys/lock.h>
67 #include <sys/kern_syscall.h>
68 #include <sys/uuid.h>
69
70 #include "hammer2.h"
71
72 static int hammer2_indirect_optimize;   /* XXX SYSCTL */
73
74 static hammer2_chain_t *hammer2_chain_create_indirect(
75                 hammer2_trans_t *trans, hammer2_chain_t *parent,
76                 hammer2_key_t key, int keybits, int for_type, int *errorp);
77 static void hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop);
78 static void adjreadcounter(hammer2_blockref_t *bref, size_t bytes);
79
80 /*
81  * Basic RBTree for chains.  Chains cannot overlap within any given
82  * core->rbtree without recursing through chain->rbtree.  We effectively
83  * guarantee this by checking the full range rather than just the first
84  * key element.  By matching on the full range callers can detect when
85  * recursion through chain->rbtree is needed.
86  *
87  * NOTE: This also means that a delete-duplicate on the same key will
88  *       overload by placing the deleted element in the new element's
89  *       chain->rbtree (when doing a direct replacement).
90  */
91 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
92
93 int
94 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
95 {
96         hammer2_key_t c1_beg;
97         hammer2_key_t c1_end;
98         hammer2_key_t c2_beg;
99         hammer2_key_t c2_end;
100
101         c1_beg = chain1->bref.key;
102         c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1;
103         c2_beg = chain2->bref.key;
104         c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1;
105
106         if (c1_end < c2_beg)    /* fully to the left */
107                 return(-1);
108         if (c1_beg > c2_end)    /* fully to the right */
109                 return(1);
110         return(0);              /* overlap (must not cross edge boundary) */
111 }
112
113 static __inline
114 int
115 hammer2_isclusterable(hammer2_chain_t *chain)
116 {
117         if (hammer2_cluster_enable) {
118                 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
119                     chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
120                     chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
121                         return(1);
122                 }
123         }
124         return(0);
125 }
126
127 /*
128  * Recursively set the SUBMODIFIED flag up to the root starting at chain's
129  * parent.  SUBMODIFIED is not set in chain itself.
130  *
131  * This function only operates on current-time transactions and is not
132  * used during flushes.  Instead, the flush code manages the flag itself.
133  */
134 void
135 hammer2_chain_setsubmod(hammer2_trans_t *trans, hammer2_chain_t *chain)
136 {
137         hammer2_chain_core_t *above;
138
139         if (trans->flags & HAMMER2_TRANS_ISFLUSH)
140                 return;
141         while ((above = chain->above) != NULL) {
142                 spin_lock(&above->cst.spin);
143                 chain = TAILQ_FIRST(&above->ownerq);
144                 while (hammer2_chain_refactor_test(chain, 1))
145                         chain = TAILQ_NEXT(chain, core_entry);
146                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
147                 spin_unlock(&above->cst.spin);
148         }
149 }
150
151 /*
152  * Allocate a new disconnected chain element representing the specified
153  * bref.  chain->refs is set to 1 and the passed bref is copied to
154  * chain->bref.  chain->bytes is derived from the bref.
155  *
156  * chain->core is NOT allocated and the media data and bp pointers are left
157  * NULL.  The caller must call chain_core_alloc() to allocate or associate
158  * a core with the chain.
159  *
160  * NOTE: Returns a referenced but unlocked (because there is no core) chain.
161  */
162 hammer2_chain_t *
163 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_pfsmount_t *pmp,
164                     hammer2_trans_t *trans, hammer2_blockref_t *bref)
165 {
166         hammer2_chain_t *chain;
167         u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
168
169         /*
170          * Construct the appropriate system structure.
171          */
172         switch(bref->type) {
173         case HAMMER2_BREF_TYPE_INODE:
174         case HAMMER2_BREF_TYPE_INDIRECT:
175         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
176         case HAMMER2_BREF_TYPE_DATA:
177         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
178                 /*
179                  * Chain's are really only associated with the hmp but we
180                  * maintain a pmp association for per-mount memory tracking
181                  * purposes.  The pmp can be NULL.
182                  */
183                 chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
184                 if (pmp) {
185                         chain->pmp = pmp;
186                         atomic_add_long(&pmp->inmem_chains, 1);
187                 }
188                 break;
189         case HAMMER2_BREF_TYPE_VOLUME:
190         case HAMMER2_BREF_TYPE_FREEMAP:
191                 chain = NULL;
192                 panic("hammer2_chain_alloc volume type illegal for op");
193         default:
194                 chain = NULL;
195                 panic("hammer2_chain_alloc: unrecognized blockref type: %d",
196                       bref->type);
197         }
198
199         chain->hmp = hmp;
200         chain->bref = *bref;
201         chain->bytes = bytes;
202         chain->refs = 1;
203         chain->flags = HAMMER2_CHAIN_ALLOCATED;
204         chain->delete_tid = HAMMER2_MAX_TID;
205         if (trans)
206                 chain->modify_tid = trans->sync_tid;
207
208         return (chain);
209 }
210
211 /*
212  * Associate an existing core with the chain or allocate a new core.
213  *
214  * The core is not locked.  No additional refs on the chain are made.
215  * (trans) must not be NULL if (core) is not NULL.
216  *
217  * When chains are delete-duplicated during flushes we insert nchain on
218  * the ownerq after ochain instead of at the end in order to give the
219  * drop code visibility in the correct order, otherwise drops can be missed.
220  */
221 void
222 hammer2_chain_core_alloc(hammer2_trans_t *trans,
223                          hammer2_chain_t *nchain, hammer2_chain_t *ochain)
224 {
225         hammer2_chain_core_t *core;
226
227         KKASSERT(nchain->core == NULL);
228
229         if (ochain == NULL) {
230                 core = kmalloc(sizeof(*core), nchain->hmp->mchain,
231                                M_WAITOK | M_ZERO);
232                 TAILQ_INIT(&core->layerq);
233                 TAILQ_INIT(&core->ownerq);
234                 core->sharecnt = 1;
235                 nchain->core = core;
236                 ccms_cst_init(&core->cst, nchain);
237                 TAILQ_INSERT_TAIL(&core->ownerq, nchain, core_entry);
238         } else {
239                 core = ochain->core;
240                 atomic_add_int(&core->sharecnt, 1);
241
242                 spin_lock(&core->cst.spin);
243                 nchain->core = core;
244 #if 1
245                 TAILQ_INSERT_TAIL(&core->ownerq, nchain, core_entry);
246 #else
247                 if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
248                         TAILQ_INSERT_AFTER(&core->ownerq, ochain, nchain,
249                                            core_entry);
250                 } else {
251                         TAILQ_INSERT_TAIL(&core->ownerq, nchain, core_entry);
252                 }
253 #endif
254                 spin_unlock(&core->cst.spin);
255         }
256 }
257
258 /*
259  * Add a reference to a chain element, preventing its destruction.
260  */
261 void
262 hammer2_chain_ref(hammer2_chain_t *chain)
263 {
264         atomic_add_int(&chain->refs, 1);
265 }
266
267 /*
268  * Insert the chain in the core rbtree at the first layer
269  * which accepts it (for now we don't sort layers by the transaction tid)
270  */
271 #define HAMMER2_CHAIN_INSERT_SPIN       0x0001
272 #define HAMMER2_CHAIN_INSERT_LIVE       0x0002
273 #define HAMMER2_CHAIN_INSERT_RACE       0x0004
274
275 static
276 void
277 hammer2_chain_insert(hammer2_chain_core_t *above, hammer2_chain_t *chain,
278                      int flags)
279 {
280         hammer2_chain_layer_t *layer;
281         hammer2_chain_t *xchain;
282
283         if (flags & HAMMER2_CHAIN_INSERT_SPIN)
284                 spin_lock(&above->cst.spin);
285         chain->above = above;
286         layer = TAILQ_FIRST(&above->layerq);
287         xchain = NULL;
288
289         /*
290          * Try to insert
291          */
292         if (layer == NULL ||
293             (xchain = RB_INSERT(hammer2_chain_tree,
294                                 &layer->rbtree, chain)) != NULL) {
295                 /*
296                  * Either no layers have been allocated or the insertion
297                  * failed.  This is fatal if the conflicted xchain is not
298                  * flagged as deleted.  Caller may or may allow the failure.
299                  */
300                 if (xchain && (xchain->flags & HAMMER2_CHAIN_DELETED) == 0) {
301                         if (flags & HAMMER2_CHAIN_INSERT_RACE) {
302                                 chain->above = NULL;
303                                 chain->inlayer = NULL;
304                                 goto failed;
305                         }
306                         panic("hammer2_chain_insert: collision2 %p", xchain);
307                 }
308
309                 /*
310                  * Allocate a new layer to resolve the issue.
311                  */
312                 spin_unlock(&above->cst.spin);
313                 layer = kmalloc(sizeof(*layer), chain->hmp->mchain,
314                                 M_WAITOK | M_ZERO);
315                 spin_lock(&above->cst.spin);
316                 RB_INIT(&layer->rbtree);
317                 TAILQ_INSERT_HEAD(&above->layerq, layer, entry);
318                 RB_INSERT(hammer2_chain_tree, &layer->rbtree, chain);
319         }
320         chain->inlayer = layer;
321         ++above->chain_count;
322
323         if (flags & HAMMER2_CHAIN_INSERT_LIVE)
324                 atomic_add_int(&above->live_count, 1);
325         atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
326 failed:
327         if (flags & HAMMER2_CHAIN_INSERT_SPIN)
328                 spin_unlock(&above->cst.spin);
329 }
330
331 /*
332  * Drop the caller's reference to the chain.  When the ref count drops to
333  * zero this function will disassociate the chain from its parent and
334  * deallocate it, then recursely drop the parent using the implied ref
335  * from the chain's chain->parent.
336  *
337  * WARNING! Just because we are able to deallocate a chain doesn't mean
338  *          that chain->core->rbtree is empty.  There can still be a sharecnt
339  *          on chain->core and RBTREE entries that refer to different parents.
340  */
341 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain);
342
343 void
344 hammer2_chain_drop(hammer2_chain_t *chain)
345 {
346         u_int refs;
347         u_int need = 0;
348
349 #if 1
350         if (chain->flags & HAMMER2_CHAIN_MOVED)
351                 ++need;
352         if (chain->flags & HAMMER2_CHAIN_MODIFIED)
353                 ++need;
354         KKASSERT(chain->refs > need);
355 #endif
356         while (chain) {
357                 refs = chain->refs;
358                 cpu_ccfence();
359                 KKASSERT(refs > 0);
360
361                 if (refs == 1) {
362                         chain = hammer2_chain_lastdrop(chain);
363                 } else {
364                         if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
365                                 break;
366                         /* retry the same chain */
367                 }
368         }
369 }
370
371 /*
372  * Safe handling of the 1->0 transition on chain.  Returns a chain for
373  * recursive drop or NULL, possibly returning the same chain if the atomic
374  * op fails.
375  *
376  * The cst spinlock is allowed nest child-to-parent (not parent-to-child).
377  */
378 static
379 hammer2_chain_t *
380 hammer2_chain_lastdrop(hammer2_chain_t *chain)
381 {
382         hammer2_pfsmount_t *pmp;
383         hammer2_mount_t *hmp;
384         hammer2_chain_core_t *above;
385         hammer2_chain_core_t *core;
386         hammer2_chain_layer_t *layer;
387         hammer2_chain_t *rdrop1;
388         hammer2_chain_t *rdrop2;
389
390         /*
391          * Spinlock the core and check to see if it is empty.  If it is
392          * not empty we leave chain intact with refs == 0.  The elements
393          * in core->rbtree are associated with other chains contemporary
394          * with ours but not with our chain directly.
395          */
396         if ((core = chain->core) != NULL) {
397                 spin_lock(&core->cst.spin);
398
399                 /*
400                  * We can't drop any chains if they have children because
401                  * there might be a flush dependency.
402                  *
403                  * NOTE: We return (chain) on failure to retry.
404                  */
405                 if (core->chain_count) {
406                         if (atomic_cmpset_int(&chain->refs, 1, 0))
407                                 chain = NULL;   /* success */
408                         spin_unlock(&core->cst.spin);
409                         return(chain);
410                 }
411                 /* no chains left under us */
412
413                 /*
414                  * We can't drop a live chain unless it is a the head
415                  * of its ownerq.  If we were to then the go-to chain
416                  * would revert to the prior deleted chain.
417                  */
418                 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
419                     TAILQ_FIRST(&core->ownerq) != chain) {
420                         if (atomic_cmpset_int(&chain->refs, 1, 0))
421                                 chain = NULL;   /* success */
422                         spin_unlock(&core->cst.spin);
423                         return(chain);
424                 }
425         }
426
427
428         hmp = chain->hmp;
429         pmp = chain->pmp;       /* can be NULL */
430         rdrop1 = NULL;
431         rdrop2 = NULL;
432         layer = NULL;
433
434         /*
435          * Spinlock the parent and try to drop the last ref.  On success
436          * remove chain from its parent, otherwise return NULL.
437          *
438          * (multiple spinlocks on core's are allowed in a bottom-up fashion).
439          */
440         if ((above = chain->above) != NULL) {
441                 spin_lock(&above->cst.spin);
442                 if (!atomic_cmpset_int(&chain->refs, 1, 0)) {
443                         /* 1->0 transition failed */
444                         spin_unlock(&above->cst.spin);
445                         if (core)
446                                 spin_unlock(&core->cst.spin);
447                         return(chain);  /* retry */
448                 }
449
450                 /*
451                  * 1->0 transition successful, remove chain from its
452                  * above core.  Track layer for removal/freeing.
453                  */
454                 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
455                 layer = chain->inlayer;
456                 RB_REMOVE(hammer2_chain_tree, &layer->rbtree, chain);
457                 --above->chain_count;
458                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
459                 chain->above = NULL;
460                 chain->inlayer = NULL;
461
462                 if (RB_EMPTY(&layer->rbtree) && layer->refs == 0) {
463                         TAILQ_REMOVE(&above->layerq, layer, entry);
464                 } else {
465                         layer = NULL;
466                 }
467
468                 /*
469                  * Removal of our chain might allow the parent chains
470                  * owning the above core to be freed if they are also
471                  * sitting at 0 refs.  Multi-homed above cores are flushed
472                  * in forward order (insertions during flushes are made
473                  * carefully).
474                  */
475                 if (above->chain_count == 0) {
476                         rdrop1 = TAILQ_FIRST(&above->ownerq);
477                         if (rdrop1 &&
478                             atomic_cmpset_int(&rdrop1->refs, 0, 1) == 0) {
479                                 rdrop1 = NULL;
480                         }
481                 }
482                 spin_unlock(&above->cst.spin);
483                 above = NULL;   /* safety */
484         }
485
486         /*
487          * We still have the core spinlock (if core is non-NULL).  The
488          * above spinlock is gone.
489          *
490          * Remove chain from ownerq.  This may change the first element of
491          * ownerq to something we can remove.
492          */
493         if (core) {
494                 chain->core = NULL;
495
496                 TAILQ_REMOVE(&core->ownerq, chain, core_entry);
497                 rdrop2 = TAILQ_FIRST(&core->ownerq);
498                 if (rdrop2 &&
499                     atomic_cmpset_int(&rdrop2->refs, 0, 1) == 0) {
500                         rdrop2 = NULL;
501                 }
502
503                 if (atomic_fetchadd_int(&core->sharecnt, -1) == 1) {
504                         /*
505                          * On the 1->0 transition of core we can destroy
506                          * it.
507                          */
508                         spin_unlock(&core->cst.spin);
509                         KKASSERT(core->cst.count == 0);
510                         KKASSERT(core->cst.upgrade == 0);
511                         kfree(core, hmp->mchain);
512                 } else {
513                         spin_unlock(&core->cst.spin);
514                 }
515                 core = NULL;    /* safety */
516         }
517
518         /*
519          * All spin locks are gone, finish freeing stuff.
520          */
521         KKASSERT((chain->flags & (HAMMER2_CHAIN_MOVED |
522                                   HAMMER2_CHAIN_MODIFIED)) == 0);
523         hammer2_chain_drop_data(chain, 1);
524
525         KKASSERT(chain->bp == NULL);
526         chain->hmp = NULL;
527
528         if (chain->flags & HAMMER2_CHAIN_ALLOCATED) {
529                 chain->flags &= ~HAMMER2_CHAIN_ALLOCATED;
530                 kfree(chain, hmp->mchain);
531                 if (pmp) {
532                         atomic_add_long(&pmp->inmem_chains, -1);
533                         hammer2_chain_memory_wakeup(pmp);
534                 }
535         }
536
537         /*
538          * Free saved empty layer and return chained drop.
539          */
540         if (layer)
541                 kfree(layer, hmp->mchain);
542         if (rdrop2)
543                 hammer2_chain_drop(rdrop2);
544         return (rdrop1);
545 }
546
547 /*
548  * On either last lock release or last drop
549  */
550 static void
551 hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop)
552 {
553         hammer2_mount_t *hmp = chain->hmp;
554
555         switch(chain->bref.type) {
556         case HAMMER2_BREF_TYPE_VOLUME:
557         case HAMMER2_BREF_TYPE_FREEMAP:
558                 if (lastdrop)
559                         chain->data = NULL;
560                 break;
561         case HAMMER2_BREF_TYPE_INODE:
562                 if (chain->data) {
563                         kfree(chain->data, hmp->mchain);
564                         chain->data = NULL;
565                 }
566                 break;
567         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
568                 if (chain->data) {
569                         kfree(chain->data, hmp->mchain);
570                         chain->data = NULL;
571                 }
572                 break;
573         default:
574                 KKASSERT(chain->data == NULL);
575                 break;
576         }
577 }
578
579 /*
580  * Ref and lock a chain element, acquiring its data with I/O if necessary,
581  * and specify how you would like the data to be resolved.
582  *
583  * Returns 0 on success or an error code if the data could not be acquired.
584  * The chain element is locked on return regardless of whether an error
585  * occurred or not.
586  *
587  * The lock is allowed to recurse, multiple locking ops will aggregate
588  * the requested resolve types.  Once data is assigned it will not be
589  * removed until the last unlock.
590  *
591  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
592  *                         (typically used to avoid device/logical buffer
593  *                          aliasing for data)
594  *
595  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
596  *                         the INITIAL-create state (indirect blocks only).
597  *
598  *                         Do not resolve data elements for DATA chains.
599  *                         (typically used to avoid device/logical buffer
600  *                          aliasing for data)
601  *
602  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
603  *
604  * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
605  *                         it will be locked exclusive.
606  *
607  * NOTE: Embedded elements (volume header, inodes) are always resolved
608  *       regardless.
609  *
610  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
611  *       element will instantiate and zero its buffer, and flush it on
612  *       release.
613  *
614  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
615  *       so as not to instantiate a device buffer, which could alias against
616  *       a logical file buffer.  However, if ALWAYS is specified the
617  *       device buffer will be instantiated anyway.
618  *
619  * WARNING! If data must be fetched a shared lock will temporarily be
620  *          upgraded to exclusive.  However, a deadlock can occur if
621  *          the caller owns more than one shared lock.
622  */
623 int
624 hammer2_chain_lock(hammer2_chain_t *chain, int how)
625 {
626         hammer2_mount_t *hmp;
627         hammer2_chain_core_t *core;
628         hammer2_blockref_t *bref;
629         hammer2_off_t pbase;
630         hammer2_off_t pmask;
631         hammer2_off_t peof;
632         ccms_state_t ostate;
633         size_t boff;
634         size_t psize;
635         int error;
636         char *bdata;
637
638         /*
639          * Ref and lock the element.  Recursive locks are allowed.
640          */
641         if ((how & HAMMER2_RESOLVE_NOREF) == 0)
642                 hammer2_chain_ref(chain);
643         atomic_add_int(&chain->lockcnt, 1);
644
645         hmp = chain->hmp;
646         KKASSERT(hmp != NULL);
647
648         /*
649          * Get the appropriate lock.
650          */
651         core = chain->core;
652         if (how & HAMMER2_RESOLVE_SHARED)
653                 ccms_thread_lock(&core->cst, CCMS_STATE_SHARED);
654         else
655                 ccms_thread_lock(&core->cst, CCMS_STATE_EXCLUSIVE);
656
657         /*
658          * If we already have a valid data pointer no further action is
659          * necessary.
660          */
661         if (chain->data)
662                 return (0);
663
664         /*
665          * Do we have to resolve the data?
666          */
667         switch(how & HAMMER2_RESOLVE_MASK) {
668         case HAMMER2_RESOLVE_NEVER:
669                 return(0);
670         case HAMMER2_RESOLVE_MAYBE:
671                 if (chain->flags & HAMMER2_CHAIN_INITIAL)
672                         return(0);
673                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
674                         return(0);
675 #if 0
676                 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
677                         return(0);
678 #endif
679                 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
680                         return(0);
681                 /* fall through */
682         case HAMMER2_RESOLVE_ALWAYS:
683                 break;
684         }
685
686         /*
687          * Upgrade to an exclusive lock so we can safely manipulate the
688          * buffer cache.  If another thread got to it before us we
689          * can just return.
690          */
691         ostate = ccms_thread_lock_upgrade(&core->cst);
692         if (chain->data) {
693                 ccms_thread_lock_downgrade(&core->cst, ostate);
694                 return (0);
695         }
696
697         /*
698          * We must resolve to a device buffer, either by issuing I/O or
699          * by creating a zero-fill element.  We do not mark the buffer
700          * dirty when creating a zero-fill element (the hammer2_chain_modify()
701          * API must still be used to do that).
702          *
703          * The device buffer is variable-sized in powers of 2 down
704          * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
705          * chunk always contains buffers of the same size. (XXX)
706          *
707          * The minimum physical IO size may be larger than the variable
708          * block size.
709          */
710         bref = &chain->bref;
711
712         psize = hammer2_devblksize(chain->bytes);
713         pmask = (hammer2_off_t)psize - 1;
714         pbase = bref->data_off & ~pmask;
715         boff = bref->data_off & (HAMMER2_OFF_MASK & pmask);
716         KKASSERT(pbase != 0);
717         peof = (pbase + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64;
718
719         /*
720          * The getblk() optimization can only be used on newly created
721          * elements if the physical block size matches the request.
722          */
723         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
724             chain->bytes == psize) {
725                 chain->bp = getblk(hmp->devvp, pbase, psize, 0, 0);
726                 error = 0;
727         } else if (hammer2_isclusterable(chain)) {
728                 error = cluster_read(hmp->devvp, peof, pbase, psize,
729                                      psize, HAMMER2_PBUFSIZE*4,
730                                      &chain->bp);
731                 adjreadcounter(&chain->bref, chain->bytes);
732         } else {
733                 error = bread(hmp->devvp, pbase, psize, &chain->bp);
734                 adjreadcounter(&chain->bref, chain->bytes);
735         }
736
737         if (error) {
738                 kprintf("hammer2_chain_lock: I/O error %016jx: %d\n",
739                         (intmax_t)pbase, error);
740                 bqrelse(chain->bp);
741                 chain->bp = NULL;
742                 ccms_thread_lock_downgrade(&core->cst, ostate);
743                 return (error);
744         }
745
746         /*
747          * Zero the data area if the chain is in the INITIAL-create state.
748          * Mark the buffer for bdwrite().  This clears the INITIAL state
749          * but does not mark the chain modified.
750          */
751         bdata = (char *)chain->bp->b_data + boff;
752         if (chain->flags & HAMMER2_CHAIN_INITIAL) {
753                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
754                 bzero(bdata, chain->bytes);
755                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
756         }
757
758         /*
759          * Setup the data pointer, either pointing it to an embedded data
760          * structure and copying the data from the buffer, or pointing it
761          * into the buffer.
762          *
763          * The buffer is not retained when copying to an embedded data
764          * structure in order to avoid potential deadlocks or recursions
765          * on the same physical buffer.
766          */
767         switch (bref->type) {
768         case HAMMER2_BREF_TYPE_VOLUME:
769         case HAMMER2_BREF_TYPE_FREEMAP:
770                 /*
771                  * Copy data from bp to embedded buffer
772                  */
773                 panic("hammer2_chain_lock: called on unresolved volume header");
774 #if 0
775                 /* NOT YET */
776                 KKASSERT(pbase == 0);
777                 KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
778                 bcopy(bdata, &hmp->voldata, chain->bytes);
779                 chain->data = (void *)&hmp->voldata;
780                 bqrelse(chain->bp);
781                 chain->bp = NULL;
782 #endif
783                 break;
784         case HAMMER2_BREF_TYPE_INODE:
785                 /*
786                  * Copy data from bp to embedded buffer, do not retain the
787                  * device buffer.
788                  */
789                 KKASSERT(chain->bytes == sizeof(chain->data->ipdata));
790                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED);
791                 chain->data = kmalloc(sizeof(chain->data->ipdata),
792                                       hmp->mchain, M_WAITOK | M_ZERO);
793                 bcopy(bdata, &chain->data->ipdata, chain->bytes);
794                 bqrelse(chain->bp);
795                 chain->bp = NULL;
796                 break;
797         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
798                 KKASSERT(chain->bytes == sizeof(chain->data->bmdata));
799                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED);
800                 chain->data = kmalloc(sizeof(chain->data->bmdata),
801                                       hmp->mchain, M_WAITOK | M_ZERO);
802                 bcopy(bdata, &chain->data->bmdata, chain->bytes);
803                 bqrelse(chain->bp);
804                 chain->bp = NULL;
805                 break;
806         case HAMMER2_BREF_TYPE_INDIRECT:
807         case HAMMER2_BREF_TYPE_DATA:
808         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
809         default:
810                 /*
811                  * Point data at the device buffer and leave bp intact.
812                  */
813                 chain->data = (void *)bdata;
814                 break;
815         }
816
817         /*
818          * Make sure the bp is not specifically owned by this thread before
819          * restoring to a possibly shared lock, so another hammer2 thread
820          * can release it.
821          */
822         if (chain->bp)
823                 BUF_KERNPROC(chain->bp);
824         ccms_thread_lock_downgrade(&core->cst, ostate);
825         return (0);
826 }
827
828 /*
829  * Asynchronously read the device buffer (dbp) and execute the specified
830  * callback.  The caller should pass-in a locked chain (shared lock is ok).
831  * The function is responsible for unlocking the chain and for disposing
832  * of dbp.
833  *
834  * NOTE!  A NULL dbp (but non-NULL data) will be passed to the function
835  *        if the dbp is integrated into the chain, because we do not want
836  *        the caller to dispose of dbp in that situation.
837  */
838 static void hammer2_chain_load_async_callback(struct bio *bio);
839
void
hammer2_chain_load_async(hammer2_chain_t *chain,
	void (*func)(hammer2_chain_t *, struct buf *, char *, void *),
	void *arg)
{
	hammer2_cbinfo_t *cbinfo;
	hammer2_mount_t *hmp;
	hammer2_blockref_t *bref;
	hammer2_off_t pbase;
	hammer2_off_t pmask;
	hammer2_off_t peof;
	struct buf *dbp;
	size_t boff;
	size_t psize;
	char *bdata;

	/*
	 * Fast path: the chain's data is already resolved.  Invoke the
	 * callback synchronously with a NULL dbp so the callback knows
	 * there is no device buffer for it to dispose of.
	 */
	if (chain->data) {
		func(chain, NULL, (char *)chain->data, arg);
		return;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	/*
	 * Compute the device-buffer base (pbase), the in-buffer offset
	 * of this chain's data (boff), and the clustering end-of-segment
	 * boundary (peof) from the blockref's data offset.
	 */
	psize = hammer2_devblksize(chain->bytes);
	pmask = (hammer2_off_t)psize - 1;
	pbase = bref->data_off & ~pmask;
	boff = bref->data_off & (HAMMER2_OFF_MASK & pmask);
	KKASSERT(pbase != 0);
	peof = (pbase + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64;

	hmp = chain->hmp;

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request.
	 * INITIAL chains have no backing media data, so the buffer is
	 * zero-filled instead of read.
	 *
	 * NOTE(review): dbp is bqrelse()'d here after the callback
	 * returns, whereas the function header says the callback is
	 * responsible for disposing of dbp — confirm the callback must
	 * not itself release dbp in this path.
	 */
	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
	    chain->bytes == psize) {
		dbp = getblk(hmp->devvp, pbase, psize, 0, 0);
		/*atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);*/
		bdata = (char *)dbp->b_data + boff;
		bzero(bdata, chain->bytes);
		/*atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);*/
		func(chain, dbp, bdata, arg);
		bqrelse(dbp);
		return;
	}

	/*
	 * Issue the asynchronous clustered read.  The cbinfo carries the
	 * chain/callback context to hammer2_chain_load_async_callback(),
	 * which frees it after invoking the callback.  M_INTWAIT because
	 * this path may be reached from an interrupt-sensitive context.
	 */
	adjreadcounter(&chain->bref, chain->bytes);
	cbinfo = kmalloc(sizeof(*cbinfo), hmp->mchain, M_INTWAIT | M_ZERO);
	cbinfo->chain = chain;
	cbinfo->func = func;
	cbinfo->arg = arg;
	cbinfo->boff = boff;

	cluster_readcb(hmp->devvp, peof, pbase, psize,
		HAMMER2_PBUFSIZE*4, HAMMER2_PBUFSIZE*4,
		hammer2_chain_load_async_callback, cbinfo);
}
912
913 static void
914 hammer2_chain_load_async_callback(struct bio *bio)
915 {
916         hammer2_cbinfo_t *cbinfo;
917         hammer2_mount_t *hmp;
918         struct buf *dbp;
919         char *data;
920
921         /*
922          * Nobody is waiting for bio/dbp to complete, we are
923          * responsible for handling the biowait() equivalent
924          * on dbp which means clearing BIO_DONE and BIO_SYNC
925          * and calling bpdone() if it hasn't already been called
926          * to restore any covered holes in the buffer's backing
927          * store.
928          */
929         dbp = bio->bio_buf;
930         if ((bio->bio_flags & BIO_DONE) == 0)
931                 bpdone(dbp, 0);
932         bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
933
934         /*
935          * Extract the auxillary info and issue the callback.
936          * Finish up with the dbp after it returns.
937          */
938         cbinfo = bio->bio_caller_info1.ptr;
939         /*ccms_thread_lock_setown(cbinfo->chain->core);*/
940         data = dbp->b_data + cbinfo->boff;
941         hmp = cbinfo->chain->hmp;
942
943         cbinfo = bio->bio_caller_info1.ptr;
944         if (cbinfo->chain->flags & HAMMER2_CHAIN_INITIAL)
945                 bzero(data, cbinfo->chain->bytes);
946         cbinfo->func(cbinfo->chain, dbp, data, cbinfo->arg);
947         /* cbinfo->chain is stale now */
948         bqrelse(dbp);
949         kfree(cbinfo, hmp->mchain);
950 }
951
952 /*
953  * Unlock and deref a chain element.
954  *
955  * On the last lock release any non-embedded data (chain->bp) will be
956  * retired.
957  */
void
hammer2_chain_unlock(hammer2_chain_t *chain)
{
	hammer2_chain_core_t *core = chain->core;
	ccms_state_t ostate;
	long *counterp;
	u_int lockcnt;

	/*
	 * The core->cst lock can be shared across several chains so we
	 * need to track the per-chain lockcnt separately.
	 *
	 * If multiple locks are present (or being attempted) on this
	 * particular chain we can just unlock, drop refs, and return.
	 *
	 * Otherwise fall-through on the 1->0 transition.
	 */
	for (;;) {
		lockcnt = chain->lockcnt;
		KKASSERT(lockcnt > 0);
		cpu_ccfence();
		if (lockcnt > 1) {
			/*
			 * Not the last lock; decrement and return.  The
			 * cmpset fails (and we retry) if another thread
			 * raced the counter.
			 */
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				ccms_thread_unlock(&core->cst);
				hammer2_chain_drop(chain);
				return;
			}
		} else {
			/* attempt the terminal 1->0 transition */
			if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
				break;
		}
		/* retry */
	}

	/*
	 * On the 1->0 transition we upgrade the core lock (if necessary)
	 * to exclusive for terminal processing.  If after upgrading we find
	 * that lockcnt is non-zero, another thread is racing us and will
	 * handle the unload for us later on, so just cleanup and return
	 * leaving the data/bp intact
	 *
	 * Otherwise if lockcnt is still 0 it is possible for it to become
	 * non-zero and race, but since we hold the core->cst lock
	 * exclusively all that will happen is that the chain will be
	 * reloaded after we unload it.
	 */
	ostate = ccms_thread_lock_upgrade(&core->cst);
	if (chain->lockcnt) {
		ccms_thread_unlock_upgraded(&core->cst, ostate);
		hammer2_chain_drop(chain);
		return;
	}

	/*
	 * Shortcut the case if the data is embedded or not resolved.
	 *
	 * Do NOT NULL out chain->data (e.g. inode data), it might be
	 * dirty.
	 *
	 * The DIRTYBP flag is non-applicable in this situation and can
	 * be cleared to keep the flags state clean.
	 */
	if (chain->bp == NULL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
			hammer2_chain_drop_data(chain, 0);
		ccms_thread_unlock_upgraded(&core->cst, ostate);
		hammer2_chain_drop(chain);
		return;
	}

	/*
	 * Statistics: account the bytes being retired against the
	 * appropriate global write counter, selected by bref type and
	 * by whether this is a normal (iod) or IOFLUSH (ioa) retirement.
	 * A clean bp (DIRTYBP not set) is not counted.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
		;
	} else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
			counterp = &hammer2_ioa_file_write;
			break;
		case HAMMER2_BREF_TYPE_INODE:
			counterp = &hammer2_ioa_meta_write;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			counterp = &hammer2_ioa_indr_write;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			counterp = &hammer2_ioa_fmap_write;
			break;
		default:
			counterp = &hammer2_ioa_volu_write;
			break;
		}
		*counterp += chain->bytes;
	} else {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
			counterp = &hammer2_iod_file_write;
			break;
		case HAMMER2_BREF_TYPE_INODE:
			counterp = &hammer2_iod_meta_write;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			counterp = &hammer2_iod_indr_write;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			counterp = &hammer2_iod_fmap_write;
			break;
		default:
			counterp = &hammer2_iod_volu_write;
			break;
		}
		*counterp += chain->bytes;
	}

	/*
	 * Clean out the bp.
	 *
	 * If a device buffer was used for data be sure to destroy the
	 * buffer when we are done to avoid aliases (XXX what about the
	 * underlying VM pages?).
	 *
	 * NOTE: Freemap leaf's use reserved blocks and thus no aliasing
	 *       is possible.
	 */
#if 0
	/*
	 * XXX our primary cache is now the block device, not
	 * the logical file. don't release the buffer.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
		chain->bp->b_flags |= B_RELBUF;
#endif

	/*
	 * The DIRTYBP flag tracks whether we have to bdwrite() the buffer
	 * or not.  The flag will get re-set when chain_modify() is called,
	 * even if MODIFIED is already set, allowing the OS to retire the
	 * buffer independent of a hammer2 flush.
	 *
	 * Retirement matrix:
	 *   DIRTYBP + IOFLUSH  -> immediate async write, release buffer
	 *   DIRTYBP only       -> delayed write, clusterable
	 *   IOFLUSH only       -> discard buffer (B_RELBUF + brelse)
	 *   neither            -> bqrelse (bp might still be dirty)
	 */
	chain->data = NULL;
	if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_IOFLUSH);
			chain->bp->b_flags |= B_RELBUF;
			cluster_awrite(chain->bp);
		} else {
			chain->bp->b_flags |= B_CLUSTEROK;
			bdwrite(chain->bp);
		}
	} else {
		if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_IOFLUSH);
			chain->bp->b_flags |= B_RELBUF;
			brelse(chain->bp);
		} else {
			/* bp might still be dirty */
			bqrelse(chain->bp);
		}
	}
	chain->bp = NULL;
	ccms_thread_unlock_upgraded(&core->cst, ostate);
	hammer2_chain_drop(chain);
}
1129
1130 /*
1131  * This counts the number of live blockrefs in a block array and
1132  * also calculates the point at which all remaining blockrefs are empty.
1133  *
1134  * NOTE: Flag is not set until after the count is complete, allowing
1135  *       callers to test the flag without holding the spinlock.
1136  *
1137  * NOTE: If base is NULL the related chain is still in the INITIAL
1138  *       state and there are no blockrefs to count.
1139  *
1140  * NOTE: live_count may already have some counts accumulated due to
1141  *       creation and deletion and could even be initially negative.
1142  */
1143 void
1144 hammer2_chain_countbrefs(hammer2_chain_core_t *above,
1145                          hammer2_blockref_t *base, int count)
1146 {
1147         spin_lock(&above->cst.spin);
1148         if ((above->flags & HAMMER2_CORE_COUNTEDBREFS) == 0) {
1149                 if (base) {
1150                         while (--count >= 0) {
1151                                 if (base[count].type)
1152                                         break;
1153                         }
1154                         above->live_zero = count + 1;
1155                         while (count >= 0) {
1156                                 if (base[count].type)
1157                                         atomic_add_int(&above->live_count, 1);
1158                                 --count;
1159                         }
1160                 }
1161                 /* else do not modify live_count */
1162                 atomic_set_int(&above->flags, HAMMER2_CORE_COUNTEDBREFS);
1163         }
1164         spin_unlock(&above->cst.spin);
1165 }
1166
1167 /*
1168  * Resize the chain's physical storage allocation in-place.  This may
1169  * replace the passed-in chain with a new chain.
1170  *
1171  * Chains can be resized smaller without reallocating the storage.
1172  * Resizing larger will reallocate the storage.
1173  *
1174  * Must be passed an exclusively locked parent and chain, returns a new
1175  * exclusively locked chain at the same index and unlocks the old chain.
1176  * Flushes the buffer if necessary.
1177  *
1178  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
1179  * to avoid instantiating a device buffer that conflicts with the vnode
1180  * data buffer.  That is, the passed-in bp is a logical buffer, whereas
1181  * any chain-oriented bp would be a device buffer.
1182  *
1183  * XXX flags currently ignored, uses chain->bp to detect data/no-data.
1184  * XXX return error if cannot resize.
1185  */
void
hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
		     hammer2_chain_t *parent, hammer2_chain_t **chainp,
		     int nradix, int flags)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *chain;
	hammer2_off_t pbase;
	size_t obytes;
	size_t nbytes;
	size_t bbytes;
	int boff;

	chain = *chainp;
	hmp = chain->hmp;

	/*
	 * Only data and indirect blocks can be resized for now.
	 * (The volu root, inodes, and freemap elements use a fixed size).
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);

	/*
	 * Nothing to do if the element is already the proper size.
	 * nbytes = 2^nradix; assumes 0 <= nradix < 32 — TODO confirm
	 * callers only pass valid block radices.
	 */
	obytes = chain->bytes;
	nbytes = 1U << nradix;
	if (obytes == nbytes)
		return;

	/*
	 * Delete the old chain and duplicate it at the same (parent, index),
	 * returning a new chain.  This allows the old chain to still be
	 * used by the flush code.  Duplication occurs in-place.
	 *
	 * The parent does not have to be locked for the delete/duplicate call,
	 * but is in this particular code path.
	 *
	 * NOTE: If we are not crossing a synchronization point the
	 *       duplication code will simply reuse the existing chain
	 *       structure.
	 */
	hammer2_chain_delete_duplicate(trans, &chain, 0);

	/*
	 * Set MODIFIED and add a chain ref to prevent destruction.  Both
	 * modified flags share the same ref.  (duplicated chains do not
	 * start out MODIFIED unless possibly if the duplication code
	 * decided to reuse the existing chain as-is).
	 *
	 * If the chain is already marked MODIFIED then we can safely
	 * return the previous allocation to the pool without having to
	 * worry about snapshots.  XXX check flush synchronization.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_chain_ref(chain);
	}

	/*
	 * Relocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 */
	hammer2_freemap_alloc(trans, chain->hmp, &chain->bref, nbytes);
	chain->bytes = nbytes;
	/*ip->delta_dcount += (ssize_t)(nbytes - obytes);*/ /* XXX atomic */

	/*
	 * The device buffer may be larger than the allocation size.
	 * pbase/boff locate the new allocation within its device buffer;
	 * they are computed here but (per the KKASSERT below) no device
	 * buffer is expected to be attached in this path.
	 */
	bbytes = hammer2_devblksize(chain->bytes);
	pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
	boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);

	/*
	 * For now just support it on DATA chains (and not on indirect
	 * blocks).
	 */
	KKASSERT(chain->bp == NULL);

	/*
	 * Make sure the chain is marked MOVED and SUBMOD is set in the
	 * parent(s) so the adjustments are picked up by flush.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
	hammer2_chain_setsubmod(trans, chain);
	*chainp = chain;
}
1279
1280 /*
1281  * Set a chain modified, making it read-write and duplicating it if necessary.
1282  * This function will assign a new physical block to the chain if necessary
1283  *
1284  * Duplication of already-modified chains is possible when the modification
1285  * crosses a flush synchronization boundary.
1286  *
1287  * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
1288  *                   level or the COW operation will not work.
1289  *
1290  * Data blocks     - The chain is usually locked RESOLVE_NEVER so as not to
1291  *                   run the data through the device buffers.
1292  *
1293  * This function may return a different chain than was passed, in which case
1294  * the old chain will be unlocked and the new chain will be locked.
1295  *
1296  * ip->chain may be adjusted by hammer2_chain_modify_ip().
1297  */
1298 hammer2_inode_data_t *
1299 hammer2_chain_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
1300                         hammer2_chain_t **chainp, int flags)
1301 {
1302         atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
1303         hammer2_chain_modify(trans, chainp, flags);
1304         if (ip->chain != *chainp)
1305                 hammer2_inode_repoint(ip, NULL, *chainp);
1306         return(&ip->chain->data->ipdata);
1307 }
1308
1309 void
1310 hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
1311                      int flags)
1312 {
1313         hammer2_mount_t *hmp;
1314         hammer2_chain_t *chain;
1315         hammer2_off_t pbase;
1316         hammer2_off_t pmask;
1317         hammer2_off_t peof;
1318         hammer2_tid_t flush_tid;
1319         struct buf *nbp;
1320         int error;
1321         int wasinitial;
1322         size_t psize;
1323         size_t boff;
1324         void *bdata;
1325
1326         chain = *chainp;
1327         hmp = chain->hmp;
1328
1329         /*
1330          * Data must be resolved if already assigned unless explicitly
1331          * flagged otherwise.
1332          */
1333         if (chain->data == NULL && (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
1334             (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
1335                 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1336                 hammer2_chain_unlock(chain);
1337         }
1338
1339         /*
1340          * data is not optional for freemap chains (we must always be sure
1341          * to copy the data on COW storage allocations).
1342          */
1343         if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
1344             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
1345                 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
1346                          (flags & HAMMER2_MODIFY_OPTDATA) == 0);
1347         }
1348
1349         /*
1350          * If the chain is already marked MODIFIED we can usually just
1351          * return.  However, if a modified chain is modified again in
1352          * a synchronization-point-crossing manner we have to issue a
1353          * delete/duplicate on the chain to avoid flush interference.
1354          */
1355         if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
1356                 /*
1357                  * Which flush_tid do we need to check?  If the chain is
1358                  * related to the freemap we have to use the freemap flush
1359                  * tid (free_flush_tid), otherwise we use the normal filesystem
1360                  * flush tid (topo_flush_tid).  The two flush domains are
1361                  * almost completely independent of each other.
1362                  */
1363                 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
1364                     chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
1365                         flush_tid = hmp->topo_flush_tid; /* XXX */
1366                         goto skipxx;    /* XXX */
1367                 } else {
1368                         flush_tid = hmp->topo_flush_tid;
1369                 }
1370
1371                 /*
1372                  * Main tests
1373                  */
1374                 if (chain->modify_tid <= flush_tid &&
1375                     trans->sync_tid > flush_tid) {
1376                         /*
1377                          * Modifications cross synchronization point,
1378                          * requires delete-duplicate.
1379                          */
1380                         KKASSERT((flags & HAMMER2_MODIFY_ASSERTNOCOPY) == 0);
1381                         hammer2_chain_delete_duplicate(trans, chainp, 0);
1382                         chain = *chainp;
1383                         /* fall through using duplicate */
1384                 }
1385 skipxx: /* XXX */
1386                 /*
1387                  * Quick return path, set DIRTYBP to ensure that
1388                  * the later retirement of bp will write it out.
1389                  *
1390                  * quick return path also needs the modify_tid
1391                  * logic.
1392                  */
1393                 if (chain->bp)
1394                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
1395                 if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
1396                         chain->bref.modify_tid = trans->sync_tid;
1397                 chain->modify_tid = trans->sync_tid;
1398                 return;
1399         }
1400
1401         /*
1402          * modify_tid is only update for primary modifications, not for
1403          * propagated brefs.  mirror_tid will be updated regardless during
1404          * the flush, no need to set it here.
1405          */
1406         if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
1407                 chain->bref.modify_tid = trans->sync_tid;
1408
1409         /*
1410          * Set MODIFIED and add a chain ref to prevent destruction.  Both
1411          * modified flags share the same ref.
1412          */
1413         if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
1414                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
1415                 hammer2_chain_ref(chain);
1416         }
1417
1418         /*
1419          * Adjust chain->modify_tid so the flusher knows when the
1420          * modification occurred.
1421          */
1422         chain->modify_tid = trans->sync_tid;
1423
1424         /*
1425          * The modification or re-modification requires an allocation and
1426          * possible COW.
1427          *
1428          * We normally always allocate new storage here.  If storage exists
1429          * and MODIFY_NOREALLOC is passed in, we do not allocate new storage.
1430          */
1431         if (chain != &hmp->vchain &&
1432             chain != &hmp->fchain &&
1433             ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
1434              (flags & HAMMER2_MODIFY_NOREALLOC) == 0)
1435         ) {
1436                 hammer2_freemap_alloc(trans, chain->hmp,
1437                                       &chain->bref, chain->bytes);
1438                 /* XXX failed allocation */
1439         }
1440
1441         /*
1442          * Do not COW if OPTDATA is set.  INITIAL flag remains unchanged.
1443          * (OPTDATA does not prevent [re]allocation of storage, only the
1444          * related copy-on-write op).
1445          */
1446         if (flags & HAMMER2_MODIFY_OPTDATA)
1447                 goto skip2;
1448
1449         /*
1450          * Clearing the INITIAL flag (for indirect blocks) indicates that
1451          * we've processed the uninitialized storage allocation.
1452          *
1453          * If this flag is already clear we are likely in a copy-on-write
1454          * situation but we have to be sure NOT to bzero the storage if
1455          * no data is present.
1456          */
1457         if (chain->flags & HAMMER2_CHAIN_INITIAL) {
1458                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1459                 wasinitial = 1;
1460         } else {
1461                 wasinitial = 0;
1462         }
1463
1464 #if 0
1465         /*
1466          * We currently should never instantiate a device buffer for a
1467          * file data chain.  (We definitely can for a freemap chain).
1468          *
1469          * XXX we can now do this
1470          */
1471         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
1472 #endif
1473
1474         /*
1475          * Instantiate data buffer and possibly execute COW operation
1476          */
1477         switch(chain->bref.type) {
1478         case HAMMER2_BREF_TYPE_VOLUME:
1479         case HAMMER2_BREF_TYPE_FREEMAP:
1480         case HAMMER2_BREF_TYPE_INODE:
1481         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1482                 /*
1483                  * The data is embedded, no copy-on-write operation is
1484                  * needed.
1485                  */
1486                 KKASSERT(chain->bp == NULL);
1487                 break;
1488         case HAMMER2_BREF_TYPE_DATA:
1489         case HAMMER2_BREF_TYPE_INDIRECT:
1490         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1491                 /*
1492                  * Perform the copy-on-write operation
1493                  */
1494                 KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);
1495
1496                 psize = hammer2_devblksize(chain->bytes);
1497                 pmask = (hammer2_off_t)psize - 1;
1498                 pbase = chain->bref.data_off & ~pmask;
1499                 boff = chain->bref.data_off & (HAMMER2_OFF_MASK & pmask);
1500                 KKASSERT(pbase != 0);
1501                 peof = (pbase + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64;
1502
1503                 /*
1504                  * The getblk() optimization can only be used if the
1505                  * chain element size matches the physical block size.
1506                  */
1507                 if (chain->bp && chain->bp->b_loffset == pbase) {
1508                         nbp = chain->bp;
1509                         error = 0;
1510                 } else if (chain->bytes == psize) {
1511                         nbp = getblk(hmp->devvp, pbase, psize, 0, 0);
1512                         error = 0;
1513                 } else if (hammer2_isclusterable(chain)) {
1514                         error = cluster_read(hmp->devvp, peof, pbase, psize,
1515                                              psize, HAMMER2_PBUFSIZE*4,
1516                                              &nbp);
1517                         adjreadcounter(&chain->bref, chain->bytes);
1518                 } else {
1519                         error = bread(hmp->devvp, pbase, psize, &nbp);
1520                         adjreadcounter(&chain->bref, chain->bytes);
1521                 }
1522                 KKASSERT(error == 0);
1523                 bdata = (char *)nbp->b_data + boff;
1524
1525                 /*
1526                  * Copy or zero-fill on write depending on whether
1527                  * chain->data exists or not.  Retire the existing bp
1528                  * based on the DIRTYBP flag.  Set the DIRTYBP flag to
1529                  * indicate that retirement of nbp should use bdwrite().
1530                  */
1531                 if (chain->data) {
1532                         KKASSERT(chain->bp != NULL);
1533                         if (chain->data != bdata) {
1534                                 bcopy(chain->data, bdata, chain->bytes);
1535                         }
1536                 } else if (wasinitial) {
1537                         bzero(bdata, chain->bytes);
1538                 } else {
1539                         /*
1540                          * We have a problem.  We were asked to COW but
1541                          * we don't have any data to COW with!
1542                          */
1543                         panic("hammer2_chain_modify: having a COW %p\n",
1544                               chain);
1545                 }
1546                 if (chain->bp != nbp) {
1547                         if (chain->bp) {
1548                                 if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
1549                                         chain->bp->b_flags |= B_CLUSTEROK;
1550                                         bdwrite(chain->bp);
1551                                 } else {
1552                                         chain->bp->b_flags |= B_RELBUF;
1553                                         brelse(chain->bp);
1554                                 }
1555                         }
1556                         chain->bp = nbp;
1557                         BUF_KERNPROC(chain->bp);
1558                 }
1559                 chain->data = bdata;
1560                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
1561                 break;
1562         default:
1563                 panic("hammer2_chain_modify: illegal non-embedded type %d",
1564                       chain->bref.type);
1565                 break;
1566
1567         }
1568 skip2:
1569         hammer2_chain_setsubmod(trans, chain);
1570 }
1571
1572 /*
1573  * Mark the volume as having been modified.  This short-cut version
1574  * does not have to lock the volume's chain, which allows the ioctl
1575  * code to make adjustments to connections without deadlocking.  XXX
1576  *
1577  * No ref is made on vchain when flagging it MODIFIED.
1578  */
void
hammer2_modify_volume(hammer2_mount_t *hmp)
{
	/*
	 * Lock/unlock round-trip; the non-zero argument to the unlock
	 * presumably flags the volume data as modified — confirm against
	 * hammer2_voldata_unlock()'s contract.
	 */
	hammer2_voldata_lock(hmp);
	hammer2_voldata_unlock(hmp, 1);
}
1585
1586 /*
1587  * This function returns the chain at the nearest key within the specified
1588  * range with the highest delete_tid.  The core spinlock must be held on
1589  * call and the returned chain will be referenced but not locked.
1590  *
1591  * The returned chain may or may not be in a deleted state.  Note that
1592  * live chains have a delete_tid = MAX_TID.
1593  *
1594  * This function will recurse through chain->rbtree as necessary and will
1595  * return a *key_nextp suitable for iteration.  *key_nextp is only set if
1596  * the iteration value is less than the current value of *key_nextp.
1597  *
1598  * The caller should use (*key_nextp) to calculate the actual range of
1599  * the returned element, which will be (key_beg to *key_nextp - 1).
1600  *
1601  * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
1602  * chains continue to be returned.  On EOF (*key_nextp) may overflow since
1603  * it will wind up being (key_end + 1).
1604  */
struct hammer2_chain_find_info {
	hammer2_chain_t		*best;		/* best candidate found so far */
	hammer2_key_t		key_beg;	/* search range start (inclusive) */
	hammer2_key_t		key_end;	/* search range end (inclusive) */
	hammer2_key_t		key_next;	/* iteration hint, may wrap to 0 */
};
1611
1612 static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
1613 static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);
1614
static
hammer2_chain_t *
hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
			  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	struct hammer2_chain_find_info info;
	hammer2_chain_layer_t *layer;

	info.best = NULL;
	info.key_beg = key_beg;
	info.key_end = key_end;
	info.key_next = key_end + 1;	/* can overflow to 0 */

/*
	kprintf("chain_find: %p %016jx %016jx\n", parent, key_beg, key_end);
*/
	/*
	 * Scan every layer of the parent's core; the cmp callback prunes
	 * subtrees outside [key_beg, key_end] and the find callback
	 * accumulates the best candidate into info.
	 */
	TAILQ_FOREACH(layer, &parent->core->layerq, entry) {
		RB_SCAN(hammer2_chain_tree, &layer->rbtree,
			hammer2_chain_find_cmp, hammer2_chain_find_callback,
			&info);
	}
	/*
	 * Ratchet *key_nextp downward.  A value of 0 represents a wrap
	 * past the 64-bit end of the key space and is treated as
	 * "largest" for comparison purposes.
	 */
	if (info.key_next && (*key_nextp > info.key_next || *key_nextp == 0))
		*key_nextp = info.key_next;
	return (info.best);
}
1640
1641 static
1642 int
1643 hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
1644 {
1645         struct hammer2_chain_find_info *info = data;
1646         hammer2_key_t child_beg;
1647         hammer2_key_t child_end;
1648
1649         child_beg = child->bref.key;
1650         child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;
1651
1652         if (child_end < info->key_beg)
1653                 return(-1);
1654         if (child_beg > info->key_end)
1655                 return(1);
1656         return(0);
1657 }
1658
1659 static
1660 int
1661 hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
1662 {
1663         struct hammer2_chain_find_info *info = data;
1664         hammer2_chain_t *best;
1665         hammer2_key_t child_end;
1666
1667         /*
1668          * NOTE: Can overflow to 0
1669          */
1670         child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
1671
1672         /*
1673          * Skip deleted chains which have been flushed (MOVED no longer set),
1674          * causes caller to check blockref array.
1675          */
1676         if ((child->flags & (HAMMER2_CHAIN_DELETED | HAMMER2_CHAIN_MOVED)) ==
1677             HAMMER2_CHAIN_DELETED) {
1678                 /* continue scan */
1679                 return(0);
1680         }
1681
1682         /*
1683          * General cases
1684          */
1685         if ((best = info->best) == NULL) {
1686                 /*
1687                  * No previous best.  Assign best
1688                  */
1689                 info->best = best = child;
1690                 info->key_next = child_end;
1691         } else if (best->bref.key > info->key_beg &&
1692                    child->bref.key < best->bref.key) {
1693                 /*
1694                  * Choose child because it has a nearer key.
1695                  * Adjust key_next downward.
1696                  */
1697                 info->best = child;
1698                 if (info->key_next > best->bref.key || info->key_next == 0)
1699                         info->key_next = best->bref.key;
1700                 if (child_end && (info->key_next > child_end ||
1701                                   info->key_next == 0)) {
1702                         info->key_next = child_end;
1703                 }
1704         } else if (child->bref.key > best->bref.key &&
1705                    child->bref.key > info->key_beg) {
1706                 /*
1707                  * Child has a further key, adjust key_next downward if
1708                  * it has the same or better delete_tid.  Keep current best.
1709                  */
1710                 if (best->delete_tid <= child->delete_tid) {
1711                         if (info->key_next > child->bref.key ||
1712                             info->key_next == 0)
1713                                 info->key_next = child->bref.key;
1714                 }
1715         } else if (best->delete_tid < child->delete_tid) {
1716                 /*
1717                  * Child has the same key_beg (capped at key_beg).
1718                  * Choose child if it has a better delete_tid and
1719                  * adjust key_next downward.  Otherwise keep current
1720                  * best.
1721                  */
1722                 if (best->delete_tid < child->delete_tid) {
1723                         info->best = child;
1724                         if (child_end && (info->key_next > child_end ||
1725                                           info->key_next == 0)) {
1726                                 info->key_next = child_end;
1727                         }
1728                 }
1729         }
1730         return(0);
1731 }
1732
1733 /*
1734  * Retrieve the specified chain from a media blockref, creating the
1735  * in-memory chain structure and setting HAMMER2_CHAIN_REPLACE to
1736  * indicate that modifications must replace an existing bref in the
1737  * block table.
1738  *
1739  * NULL is returned if the insertion races.
1740  *
1741  * Caller must hold the parent locked shared or exclusive since we may
1742  * need the parent's bref array to find our block.
1743  */
hammer2_chain_t *
hammer2_chain_get(hammer2_chain_t *parent, hammer2_blockref_t *bref)
{
	hammer2_mount_t *hmp = parent->hmp;
	hammer2_chain_core_t *above = parent->core;
	hammer2_chain_t *chain;

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.  Resulting chain has one ref and is not locked.
	 */
	chain = hammer2_chain_alloc(hmp, parent->pmp, NULL, bref);
	hammer2_chain_core_alloc(NULL, chain, NULL);
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_REPLACE);
	/* ref'd chain returned */

	/*
	 * Link the chain into its parent.  A spinlock is required to safely
	 * access the RBTREE, and it is possible to collide with another
	 * hammer2_chain_get() operation because the caller might only hold
	 * a shared lock on the parent.
	 */
	KKASSERT(parent->refs > 0);
	hammer2_chain_insert(above, chain, HAMMER2_CHAIN_INSERT_SPIN |
					   HAMMER2_CHAIN_INSERT_RACE);
	if ((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0) {
		/*
		 * Insertion raced another thread (INSERT_RACE lost); drop
		 * our ref and report NULL so the caller can retry.
		 */
		kprintf("chain not on RBTREE\n");
		hammer2_chain_drop(chain);
		chain = NULL;
	}

	/*
	 * Return our new chain referenced but not locked.
	 */
	return (chain);
}
1780
1781 /*
1782  * Lookup initialization/completion API
1783  */
1784 hammer2_chain_t *
1785 hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
1786 {
1787         if (flags & HAMMER2_LOOKUP_SHARED) {
1788                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
1789                                            HAMMER2_RESOLVE_SHARED);
1790         } else {
1791                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
1792         }
1793         return (parent);
1794 }
1795
/*
 * Terminate a lookup sequence, releasing the lock on the (possibly
 * replaced) parent chain.  NULL is accepted as a convenience no-op.
 */
void
hammer2_chain_lookup_done(hammer2_chain_t *parent)
{
	if (parent)
		hammer2_chain_unlock(parent);
}
1802
/*
 * Ascend from (*parentp) to a live owner of its core.
 *
 * The core's ownerq is scanned under the core spinlock for the first
 * chain which passes the refactor test; that chain is ref'd while still
 * holding the spinlock, then the old parent is unlocked before locking
 * the new one (RESOLVE_NOREF consumes the ref taken here).
 *
 * NOTE(review): if every ownerq entry failed the refactor test,
 * TAILQ_NEXT would eventually yield NULL and the subsequent ref would
 * dereference it — presumably at least one live owner always exists on
 * the queue; confirm against the core ownership invariants.
 */
static
hammer2_chain_t *
hammer2_chain_getparent(hammer2_chain_t **parentp, int how)
{
	hammer2_chain_t *oparent;
	hammer2_chain_t *nparent;
	hammer2_chain_core_t *above;

	oparent = *parentp;
	above = oparent->above;

	spin_lock(&above->cst.spin);
	nparent = TAILQ_FIRST(&above->ownerq);
	while (hammer2_chain_refactor_test(nparent, 1))
		nparent = TAILQ_NEXT(nparent, core_entry);
	hammer2_chain_ref(nparent);	/* protect nparent, use in lock */
	spin_unlock(&above->cst.spin);

	hammer2_chain_unlock(oparent);
	hammer2_chain_lock(nparent, how | HAMMER2_RESOLVE_NOREF);
	*parentp = nparent;

	return (nparent);
}
1827
1828 /*
 * Locate the first chain whose key range overlaps (key_beg, key_end) inclusive.
1830  * (*parentp) typically points to an inode but can also point to a related
1831  * indirect block and this function will recurse upwards and find the inode
1832  * again.
1833  *
1834  * (*parentp) must be exclusively locked and referenced and can be an inode
1835  * or an existing indirect block within the inode.
1836  *
1837  * On return (*parentp) will be modified to point at the deepest parent chain
1838  * element encountered during the search, as a helper for an insertion or
1839  * deletion.   The new (*parentp) will be locked and referenced and the old
1840  * will be unlocked and dereferenced (no change if they are both the same).
1841  *
1842  * The matching chain will be returned exclusively locked.  If NOLOCK is
1843  * requested the chain will be returned only referenced.
1844  *
1845  * NULL is returned if no match was found, but (*parentp) will still
1846  * potentially be adjusted.
1847  *
1848  * On return (*key_nextp) will point to an iterative value for key_beg.
1849  * (If NULL is returned (*key_nextp) is set to key_end).
1850  *
1851  * This function will also recurse up the chain if the key is not within the
1852  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
1853  * can simply allow (*parentp) to float inside the loop.
1854  *
1855  * NOTE!  chain->data is not always resolved.  By default it will not be
1856  *        resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF.  Use
1857  *        HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/
1858  *        BREF_TYPE_DATA as the device buffer can alias the logical file
1859  *        buffer).
1860  */
hammer2_chain_t *
hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int *cache_indexp, int flags)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_blockref_t *base;
	hammer2_blockref_t getref;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	hammer2_chain_core_t *above;
	int count = 0;
	int i;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;

	/*
	 * Translate the lookup flags into the three lock-resolution modes
	 * used below (always / maybe / default).
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 *
	 * This function handles races against the flusher doing a delete-
	 * duplicate above us and re-homes the parent to the duplicate in
	 * that case, otherwise we'd wind up recursing down a stale chain.
	 */
	parent = *parentp;
	hmp = parent->hmp;

	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if (key_beg >= scan_beg && key_end <= scan_end)
			break;
		parent = hammer2_chain_getparent(parentp, how_maybe);
	}

again:
	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_ref(parent);
			else
				hammer2_chain_lock(parent, how_always);
			*key_nextp = key_end + 1;
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Handle MATCHIND on the parent
		 */
		if (flags & HAMMER2_LOOKUP_MATCHIND) {
			scan_beg = parent->bref.key;
			scan_end = scan_beg +
			       ((hammer2_key_t)1 << parent->bref.keybits) - 1;
			if (key_beg == scan_beg && key_end == scan_end) {
				chain = parent;
				hammer2_chain_lock(chain, how_maybe);
				*key_nextp = scan_end + 1;
				goto done;
			}
		}
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &hmp->voldata.freemap_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_lookup: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the above->live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	above = parent->core;
	if ((above->flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(above, base, count);
	spin_lock(&above->cst.spin);

	/*
	 * (i) is set to (count) on failure.  Otherwise the element at
	 * base[i] points to an entry >= key_beg.
	 */
	*key_nextp = key_end + 1;
	i = hammer2_base_find(base, count, above, cache_indexp,
			      key_nextp, key_beg);
	chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);

	/*
	 * A block array candidate beyond key_end is out of range and is
	 * treated as a miss.
	 */
	if (i != count && base[i].key > key_end)
		i = count;

	if (i == count) {
		/*
		 * Nothing in block array, select the chain
		 */
		if (chain)
			hammer2_chain_ref(chain);
		spin_unlock(&above->cst.spin);
	} else if (chain == NULL || base[i].key < chain->bref.key) {
		/*
		 * RBTREE is empty or Nearest key is in block array.
		 *
		 * chain returns NULL on insertion race.
		 *
		 * The bref is copied out before dropping the spinlock; if
		 * it changed by the time the chain was instantiated the
		 * whole lookup is retried.
		 */
		getref = base[i];
		spin_unlock(&above->cst.spin);
		chain = hammer2_chain_get(parent, &getref);
		if (chain == NULL)
			goto again;	/* retry */
		if (bcmp(&base[i], &getref, sizeof(getref)) != 0) {
			hammer2_chain_drop(chain);
			goto again;	/* retry */
		}
	} else {
		/*
		 * Chain overrides base[i] or nearest key is chain
		 */
		if (chain)
			hammer2_chain_ref(chain);
		spin_unlock(&above->cst.spin);
	}

	/*
	 * Exhausted parent chain, iterate.
	 *
	 * (cannot use *key_nextp on exhaustion, see special case in
	 *  hammer2_chain_next())
	 */
	if (chain == NULL) {
		if (key_beg == key_end)	/* short cut single-key case */
			return (NULL);
		return (hammer2_chain_next(parentp, NULL, key_nextp,
					   key_beg, key_end,
					   cache_indexp, flags));
	}

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED) {
		hammer2_chain_drop(chain);
		key_beg = *key_nextp;
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		goto again;
	}

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.  We must maintain our top-down locks
	 * to prevent the flusher from interfering (i.e. doing a
	 * delete-duplicate and leaving us recursing down a deleted chain).
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE
	 * so we can access its data.  It might need a fixup if the caller
	 * passed incompatible flags.  Be careful not to cause a deadlock
	 * as a data-load requires an exclusive lock.
	 *
	 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
	 * range is within the requested key range we return the indirect
	 * block and do NOT loop.  This is usually only used to acquire
	 * freemap nodes.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		hammer2_chain_lock(chain, how_maybe | HAMMER2_RESOLVE_NOREF);
		hammer2_chain_unlock(parent);
		*parentp = parent = chain;
#if 0
		hammer2_chain_unlock(parent);
		*parentp = parent = chain;
		hammer2_chain_lock(chain, how_maybe | HAMMER2_RESOLVE_NOREF);
#endif
		goto again;
	}

	hammer2_chain_lock(chain, how | HAMMER2_RESOLVE_NOREF);
done:
	/*
	 * All done, return the chain
	 */
	return (chain);
}
2105
2106 /*
2107  * After having issued a lookup we can iterate all matching keys.
2108  *
2109  * If chain is non-NULL we continue the iteration from just after it's index.
2110  *
2111  * If chain is NULL we assume the parent was exhausted and continue the
2112  * iteration at the next parent.
2113  *
2114  * parent must be locked on entry and remains locked throughout.  chain's
2115  * lock status must match flags.  Chain is always at least referenced.
2116  *
2117  * WARNING!  The MATCHIND flag does not apply to this function.
2118  */
hammer2_chain_t *
hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		   hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int *cache_indexp, int flags)
{
	hammer2_chain_t *parent;
	int how_maybe;

	/*
	 * Calculate locking flags for upward recursion.
	 */
	how_maybe = HAMMER2_RESOLVE_MAYBE;
	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;

	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		/*
		 * Resume just past the end of chain's key range.  A
		 * key_beg of 0 indicates 64-bit wrap and terminates the
		 * iteration below.
		 */
		key_beg = chain->bref.key +
			  ((hammer2_key_t)1 << chain->bref.keybits);
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_drop(chain);
		else
			hammer2_chain_unlock(chain);

		/*
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_getparent(parentp, how_maybe);
	}

	/*
	 * And execute
	 */
	return (hammer2_chain_lookup(parentp, key_nextp,
				     key_beg, key_end,
				     cache_indexp, flags));
}
2182
2183 /*
2184  * Create and return a new hammer2 system memory structure of the specified
2185  * key, type and size and insert it under (*parentp).  This is a full
2186  * insertion, based on the supplied key/keybits, and may involve creating
2187  * indirect blocks and moving other chains around via delete/duplicate.
2188  *
2189  * (*parentp) must be exclusive locked and may be replaced on return
2190  * depending on how much work the function had to do.
2191  *
2192  * (*chainp) usually starts out NULL and returns the newly created chain,
2193  * but if the caller desires the caller may allocate a disconnected chain
2194  * and pass it in instead.  (It is also possible for the caller to use
2195  * chain_duplicate() to create a disconnected chain, manipulate it, then
2196  * pass it into this function to insert it).
2197  *
2198  * This function should NOT be used to insert INDIRECT blocks.  It is
2199  * typically used to create/insert inodes and data blocks.
2200  *
2201  * Caller must pass-in an exclusively locked parent the new chain is to
2202  * be inserted under, and optionally pass-in a disconnected, exclusively
2203  * locked chain to insert (else we create a new chain).  The function will
2204  * adjust (*parentp) as necessary, create or connect the chain, and
2205  * return an exclusively locked chain in *chainp.
2206  */
2207 int
2208 hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
2209                      hammer2_chain_t **chainp,
2210                      hammer2_key_t key, int keybits, int type, size_t bytes)
2211 {
2212         hammer2_mount_t *hmp;
2213         hammer2_chain_t *chain;
2214         hammer2_chain_t *parent = *parentp;
2215         hammer2_chain_core_t *above;
2216         hammer2_blockref_t *base;
2217         hammer2_blockref_t dummy;
2218         int allocated = 0;
2219         int error = 0;
2220         int count;
2221
2222         above = parent->core;
2223         KKASSERT(ccms_thread_lock_owned(&above->cst));
2224         hmp = parent->hmp;
2225         chain = *chainp;
2226
2227         if (chain == NULL) {
2228                 /*
2229                  * First allocate media space and construct the dummy bref,
2230                  * then allocate the in-memory chain structure.  Set the
2231                  * INITIAL flag for fresh chains.
2232                  */
2233                 bzero(&dummy, sizeof(dummy));
2234                 dummy.type = type;
2235                 dummy.key = key;
2236                 dummy.keybits = keybits;
2237                 dummy.data_off = hammer2_getradix(bytes);
2238                 dummy.methods = parent->bref.methods;
2239                 chain = hammer2_chain_alloc(hmp, parent->pmp, trans, &dummy);
2240                 hammer2_chain_core_alloc(trans, chain, NULL);
2241
2242                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
2243
2244                 /*
2245                  * Lock the chain manually, chain_lock will load the chain
2246                  * which we do NOT want to do.  (note: chain->refs is set
2247                  * to 1 by chain_alloc() for us, but lockcnt is not).
2248                  */
2249                 chain->lockcnt = 1;
2250                 ccms_thread_lock(&chain->core->cst, CCMS_STATE_EXCLUSIVE);
2251                 allocated = 1;
2252
2253                 /*
2254                  * We do NOT set INITIAL here (yet).  INITIAL is only
2255                  * used for indirect blocks.
2256                  *
2257                  * Recalculate bytes to reflect the actual media block
2258                  * allocation.
2259                  */
2260                 bytes = (hammer2_off_t)1 <<
2261                         (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
2262                 chain->bytes = bytes;
2263
2264                 switch(type) {
2265                 case HAMMER2_BREF_TYPE_VOLUME:
2266                 case HAMMER2_BREF_TYPE_FREEMAP:
2267                         panic("hammer2_chain_create: called with volume type");
2268                         break;
2269                 case HAMMER2_BREF_TYPE_INODE:
2270                         KKASSERT(bytes == HAMMER2_INODE_BYTES);
2271                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED);
2272                         chain->data = kmalloc(sizeof(chain->data->ipdata),
2273                                               hmp->mchain, M_WAITOK | M_ZERO);
2274                         break;
2275                 case HAMMER2_BREF_TYPE_INDIRECT:
2276                         panic("hammer2_chain_create: cannot be used to"
2277                               "create indirect block");
2278                         break;
2279                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2280                         panic("hammer2_chain_create: cannot be used to"
2281                               "create freemap root or node");
2282                         break;
2283                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2284                         KKASSERT(bytes == sizeof(chain->data->bmdata));
2285                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED);
2286                         chain->data = kmalloc(sizeof(chain->data->bmdata),
2287                                               hmp->mchain, M_WAITOK | M_ZERO);
2288                         break;
2289                 case HAMMER2_BREF_TYPE_DATA:
2290                 default:
2291                         /* leave chain->data NULL */
2292                         KKASSERT(chain->data == NULL);
2293                         break;
2294                 }
2295         } else {
2296                 /*
2297                  * Potentially update the existing chain's key/keybits.
2298                  *
2299                  * Do NOT mess with the current state of the INITIAL flag.
2300                  */
2301                 chain->bref.key = key;
2302                 chain->bref.keybits = keybits;
2303                 KKASSERT(chain->above == NULL);
2304         }
2305
2306         /*
2307          * Calculate how many entries we have in the blockref array and
2308          * determine if an indirect block is required.
2309          */
2310 again:
2311         above = parent->core;
2312
2313         switch(parent->bref.type) {
2314         case HAMMER2_BREF_TYPE_INODE:
2315                 KKASSERT((parent->data->ipdata.op_flags &
2316                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
2317                 KKASSERT(parent->data != NULL);
2318                 base = &parent->data->ipdata.u.blockset.blockref[0];
2319                 count = HAMMER2_SET_COUNT;
2320                 break;
2321         case HAMMER2_BREF_TYPE_INDIRECT:
2322         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2323                 if (parent->flags & HAMMER2_CHAIN_INITIAL)
2324                         base = NULL;
2325                 else
2326                         base = &parent->data->npdata[0];
2327                 count = parent->bytes / sizeof(hammer2_blockref_t);
2328                 break;
2329         case HAMMER2_BREF_TYPE_VOLUME:
2330                 KKASSERT(parent->data != NULL);
2331                 base = &hmp->voldata.sroot_blockset.blockref[0];
2332                 count = HAMMER2_SET_COUNT;
2333                 break;
2334         case HAMMER2_BREF_TYPE_FREEMAP:
2335                 KKASSERT(parent->data != NULL);
2336                 base = &hmp->voldata.freemap_blockset.blockref[0];
2337                 count = HAMMER2_SET_COUNT;
2338                 break;
2339         default:
2340                 panic("hammer2_chain_create: unrecognized blockref type: %d",
2341                       parent->bref.type);
2342                 base = NULL;
2343                 count = 0;
2344                 break;
2345         }
2346
2347         /*
2348          * Make sure we've counted the brefs
2349          */
2350         if ((above->flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
2351                 hammer2_chain_countbrefs(above, base, count);
2352
2353         KKASSERT(above->live_count >= 0 && above->live_count <= count);
2354
2355         /*
2356          * If no free blockref could be found we must create an indirect
2357          * block and move a number of blockrefs into it.  With the parent
2358          * locked we can safely lock each child in order to delete+duplicate
2359          * it without causing a deadlock.
2360          *
2361          * This may return the new indirect block or the old parent depending
2362          * on where the key falls.  NULL is returned on error.
2363          */
2364         if (above->live_count == count) {
2365                 hammer2_chain_t *nparent;
2366
2367                 nparent = hammer2_chain_create_indirect(trans, parent,
2368                                                         key, keybits,
2369                                                         type, &error);
2370                 if (nparent == NULL) {
2371                         if (allocated)
2372                                 hammer2_chain_drop(chain);
2373                         chain = NULL;
2374                         goto done;
2375                 }
2376                 if (parent != nparent) {
2377                         hammer2_chain_unlock(parent);
2378                         parent = *parentp = nparent;
2379                 }
2380                 goto again;
2381         }
2382
2383         /*
2384          * Link the chain into its parent.  Later on we will have to set
2385          * the MOVED bit in situations where we don't mark the new chain
2386          * as being modified.
2387          */
2388         if (chain->above != NULL)
2389                 panic("hammer2: hammer2_chain_create: chain already connected");
2390         KKASSERT(chain->above == NULL);
2391         KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
2392         hammer2_chain_insert(above, chain, HAMMER2_CHAIN_INSERT_SPIN |
2393                                            HAMMER2_CHAIN_INSERT_LIVE);
2394
2395         if (allocated) {
2396                 /*
2397                  * Mark the newly created chain modified.
2398                  *
2399                  * Device buffers are not instantiated for DATA elements
2400                  * as these are handled by logical buffers.
2401                  *
2402                  * Indirect and freemap node indirect blocks are handled
2403                  * by hammer2_chain_create_indirect() and not by this
2404                  * function.
2405                  *
2406                  * Data for all other bref types is expected to be
2407                  * instantiated (INODE, LEAF).
2408                  */
2409                 switch(chain->bref.type) {
2410                 case HAMMER2_BREF_TYPE_DATA:
2411                         hammer2_chain_modify(trans, &chain,
2412                                              HAMMER2_MODIFY_OPTDATA |
2413                                              HAMMER2_MODIFY_ASSERTNOCOPY);
2414                         break;
2415                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2416                 case HAMMER2_BREF_TYPE_INODE:
2417                         hammer2_chain_modify(trans, &chain,
2418                                              HAMMER2_MODIFY_ASSERTNOCOPY);
2419                         break;
2420                 default:
2421                         /*
2422                          * Remaining types are not supported by this function.
2423                          * In particular, INDIRECT and LEAF_NODE types are
2424                          * handled by create_indirect().
2425                          */
2426                         panic("hammer2_chain_create: bad type: %d",
2427                               chain->bref.type);
2428                         /* NOT REACHED */
2429                         break;
2430                 }
2431         } else {
2432                 /*
2433                  * When reconnecting a chain we must set MOVED and setsubmod
2434                  * so the flush recognizes that it must update the bref in
2435                  * the parent.
2436                  */
2437                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2438                         hammer2_chain_ref(chain);
2439                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2440                 }
2441                 hammer2_chain_setsubmod(trans, chain);
2442         }
2443
2444 done:
2445         *chainp = chain;
2446
2447         return (error);
2448 }
2449
2450 /*
2451  * Replace (*chainp) with a duplicate.  The original *chainp is unlocked
2452  * and the replacement will be returned locked.  Both the original and the
2453  * new chain will share the same RBTREE (have the same chain->core), with
2454  * the new chain becoming the 'current' chain (meaning it is the first in
2455  * the linked list at core->chain_first).
2456  *
2457  * If (parent) is non-NULL then the new duplicated chain is inserted under
2458  * the parent.
2459  *
2460  * If (parent) is NULL then the new duplicated chain is not inserted anywhere,
2461  * similar to if it had just been chain_alloc()'d (suitable for passing into
2462  * hammer2_chain_create() after this function returns).
2463  *
2464  * NOTE! Duplication is used in order to retain the original topology to
2465  *       support flush synchronization points.  Both the original and the
2466  *       new chain will have the same transaction id and thus the operation
2467  *       appears atomic w/regards to media flushes.
2468  */
2469 static void hammer2_chain_dup_fixup(hammer2_chain_t *ochain,
2470                                     hammer2_chain_t *nchain);
2471
2472 void
2473 hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t *parent,
2474                         hammer2_chain_t **chainp, hammer2_blockref_t *bref)
2475 {
2476         hammer2_mount_t *hmp;
2477         hammer2_blockref_t *base;
2478         hammer2_chain_t *ochain;
2479         hammer2_chain_t *nchain;
2480         hammer2_chain_core_t *above;
2481         size_t bytes;
2482         int count;
2483         int oflags;
2484         void *odata;
2485
2486         /*
2487          * First create a duplicate of the chain structure, associating
2488          * it with the same core, making it the same size, pointing it
2489          * to the same bref (the same media block).
2490          */
2491         ochain = *chainp;
2492         hmp = ochain->hmp;
2493         if (bref == NULL)
2494                 bref = &ochain->bref;
2495         nchain = hammer2_chain_alloc(hmp, ochain->pmp, trans, bref);
2496         hammer2_chain_core_alloc(trans, nchain, ochain);
2497         bytes = (hammer2_off_t)1 <<
2498                 (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
2499         nchain->bytes = bytes;
2500         nchain->modify_tid = ochain->modify_tid;
2501
2502         hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER |
2503                                    HAMMER2_RESOLVE_NOREF);
2504         hammer2_chain_dup_fixup(ochain, nchain);
2505         /* nchain has 1 ref */
2506
2507         /*
2508          * If parent is not NULL, insert the duplicated chain into the
2509          * parent.  The newly duplicated chain must be marked MOVED and
2510          * SUBMODIFIED set in its parent(s).
2511          *
2512          * Having both chains locked is extremely important for atomicy.
2513          */
2514         if (parent) {
2515                 /*
2516                  * Locate a free blockref in the parent's array
2517                  */
2518                 above = parent->core;
2519                 KKASSERT(ccms_thread_lock_owned(&above->cst));
2520
2521                 switch(parent->bref.type) {
2522                 case HAMMER2_BREF_TYPE_INODE:
2523                         KKASSERT((parent->data->ipdata.op_flags &
2524                                   HAMMER2_OPFLAG_DIRECTDATA) == 0);
2525                         KKASSERT(parent->data != NULL);
2526                         base = &parent->data->ipdata.u.blockset.blockref[0];
2527                         count = HAMMER2_SET_COUNT;
2528                         break;
2529                 case HAMMER2_BREF_TYPE_INDIRECT:
2530                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2531                         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2532                                 base = NULL;
2533                         } else {
2534                                 KKASSERT(parent->data != NULL);
2535                                 base = &parent->data->npdata[0];
2536                         }
2537                         count = parent->bytes / sizeof(hammer2_blockref_t);
2538                         break;
2539                 case HAMMER2_BREF_TYPE_VOLUME:
2540                         KKASSERT(parent->data != NULL);
2541                         base = &hmp->voldata.sroot_blockset.blockref[0];
2542                         count = HAMMER2_SET_COUNT;
2543                         break;
2544                 case HAMMER2_BREF_TYPE_FREEMAP:
2545                         KKASSERT(parent->data != NULL);
2546                         base = &hmp->voldata.freemap_blockset.blockref[0];
2547                         count = HAMMER2_SET_COUNT;
2548                         break;
2549                 default:
2550                         panic("hammer2_chain_create: unrecognized "
2551                               "blockref type: %d",
2552                               parent->bref.type);
2553                         count = 0;
2554                         break;
2555                 }
2556
2557                 KKASSERT((nchain->flags & HAMMER2_CHAIN_DELETED) == 0);
2558                 KKASSERT(parent->refs > 0);
2559                 hammer2_chain_insert(above, nchain, HAMMER2_CHAIN_INSERT_SPIN |
2560                                                     HAMMER2_CHAIN_INSERT_LIVE);
2561
2562                 if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2563                         hammer2_chain_ref(nchain);
2564                         atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED);
2565                 }
2566                 hammer2_chain_setsubmod(trans, nchain);
2567         }
2568
2569         /*
2570          * We have to unlock ochain to flush any dirty data, asserting the
2571          * case (data == NULL) to catch any extra locks that might have been
2572          * present, then transfer state to nchain.
2573          */
2574         oflags = ochain->flags;
2575         odata = ochain->data;
2576         hammer2_chain_unlock(ochain);
2577         KKASSERT((ochain->flags & HAMMER2_CHAIN_EMBEDDED) ||
2578                  ochain->data == NULL);
2579
2580         if (oflags & HAMMER2_CHAIN_INITIAL)
2581                 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL);
2582
2583         /*
2584          * WARNING!  We should never resolve DATA to device buffers
2585          *           (XXX allow it if the caller did?), and since
2586          *           we currently do not have the logical buffer cache
2587          *           buffer in-hand to fix its cached physical offset
2588          *           we also force the modify code to not COW it. XXX
2589          *
2590          * WARNING!  nchain should have only one manual ref plus additional
2591          *           refs related to flags or the hammer2_chain_modify()
2592          *           replacement could leave a ref hanging.
2593          */
2594         if (oflags & HAMMER2_CHAIN_MODIFIED) {
2595                 if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
2596                         hammer2_chain_modify(trans, &nchain,
2597                                              HAMMER2_MODIFY_OPTDATA |
2598                                              HAMMER2_MODIFY_NOREALLOC |
2599                                              HAMMER2_MODIFY_ASSERTNOCOPY);
2600                 } else if (oflags & HAMMER2_CHAIN_INITIAL) {
2601                         hammer2_chain_modify(trans, &nchain,
2602                                              HAMMER2_MODIFY_OPTDATA |
2603                                              HAMMER2_MODIFY_ASSERTNOCOPY);
2604                 } else {
2605                         hammer2_chain_modify(trans, &nchain,
2606                                              HAMMER2_MODIFY_ASSERTNOCOPY);
2607                 }
2608         } else {
2609                 if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
2610                         ;
2611                 } else if (oflags & HAMMER2_CHAIN_INITIAL) {
2612                         ;
2613                 } else {
2614                         hammer2_chain_lock(nchain, HAMMER2_RESOLVE_ALWAYS);
2615                         hammer2_chain_unlock(nchain);
2616                 }
2617         }
2618         atomic_set_int(&nchain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2619         *chainp = nchain;
2620 }
2621
2622 #if 0
2623
2624                 /*
2625                  * When the chain is in the INITIAL state we must still
2626                  * ensure that a block has been assigned so MOVED processing
2627                  * works as expected.
2628                  */
2629                 KKASSERT (nchain->bref.type != HAMMER2_BREF_TYPE_DATA);
2630                 hammer2_chain_modify(trans, &nchain,
2631                                      HAMMER2_MODIFY_OPTDATA |
2632                                      HAMMER2_MODIFY_ASSERTNOCOPY);
2633
2634
2635         hammer2_chain_lock(nchain, HAMMER2_RESOLVE_MAYBE |
2636                                    HAMMER2_RESOLVE_NOREF); /* eat excess ref */
2637         hammer2_chain_unlock(nchain);
2638 #endif
2639
2640 /*
2641  * Special in-place delete-duplicate sequence which does not require a
2642  * locked parent.  (*chainp) is marked DELETED and atomically replaced
2643  * with a duplicate.  Atomicy is at the very-fine spin-lock level in
2644  * order to ensure that lookups do not race us.
2645  *
2646  * If the input chain is already marked deleted the duplicated chain will
2647  * also be marked deleted.  This case can occur when an inode is removed
2648  * from the filesystem but programs still have an open descriptor to it.
2649  */
2650 void
2651 hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
2652                                int flags)
2653 {
2654         hammer2_mount_t *hmp;
2655         hammer2_chain_t *ochain;
2656         hammer2_chain_t *nchain;
2657         hammer2_chain_core_t *above;
2658         size_t bytes;
2659         int oflags;
2660         void *odata;
2661
2662         ochain = *chainp;
2663         oflags = ochain->flags;
2664         hmp = ochain->hmp;
2665
2666         /*
2667          * Shortcut DELETED case if possible (only if delete_tid already
2668          * matches the transaction id).
2669          */
2670         if ((oflags & HAMMER2_CHAIN_DELETED) &&
2671             ochain->delete_tid == trans->sync_tid) {
2672                 return;
2673         }
2674
2675         /*
2676          * First create a duplicate of the chain structure.
2677          * (nchain is allocated with one ref).
2678          */
2679         nchain = hammer2_chain_alloc(hmp, ochain->pmp, trans, &ochain->bref);
2680         if (flags & HAMMER2_DELDUP_RECORE)
2681                 hammer2_chain_core_alloc(trans, nchain, NULL);
2682         else
2683                 hammer2_chain_core_alloc(trans, nchain, ochain);
2684         above = ochain->above;
2685
2686         bytes = (hammer2_off_t)1 <<
2687                 (int)(ochain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
2688         nchain->bytes = bytes;
2689         nchain->modify_tid = ochain->modify_tid;
2690         nchain->data_count += ochain->data_count;
2691         nchain->inode_count += ochain->inode_count;
2692
2693         /*
2694          * Lock nchain so both chains are now locked (extremely important
2695          * for atomicy).  Mark ochain deleted and reinsert into the topology
2696          * and insert nchain all in one go.
2697          *
2698          * If the ochain is already deleted it is left alone and nchain
2699          * is inserted into the topology as a deleted chain.  This is
2700          * important because it allows ongoing operations to be executed
2701          * on a deleted inode which still has open descriptors.
2702          */
2703         hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER);
2704         hammer2_chain_dup_fixup(ochain, nchain);
2705         /* extra ref still present from original allocation */
2706
2707         KKASSERT(ochain->flags & HAMMER2_CHAIN_ONRBTREE);
2708         spin_lock(&above->cst.spin);
2709         KKASSERT(ochain->flags & HAMMER2_CHAIN_ONRBTREE);
2710
2711         if (oflags & HAMMER2_CHAIN_DELETED) {
2712                 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_DELETED);
2713                 nchain->delete_tid = trans->sync_tid;
2714                 /*nchain->delete_gen = ++trans->delete_gen;*/
2715         } else {
2716                 ochain->delete_tid = trans->sync_tid;
2717                 /*ochain->delete_gen = ++trans->delete_gen;*/
2718                 atomic_set_int(&ochain->flags, HAMMER2_CHAIN_DELETED);
2719                 atomic_add_int(&above->live_count, -1);
2720         }
2721
2722         hammer2_chain_insert(above, nchain,
2723                              ((oflags & HAMMER2_CHAIN_DELETED) ?
2724                                 0: HAMMER2_CHAIN_INSERT_LIVE));
2725
2726         if ((ochain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2727                 hammer2_chain_ref(ochain);
2728                 atomic_set_int(&ochain->flags, HAMMER2_CHAIN_MOVED);
2729         }
2730         spin_unlock(&above->cst.spin);
2731
2732         /*
2733          * We have to unlock ochain to flush any dirty data, asserting the
2734          * case (data == NULL) to catch any extra locks that might have been
2735          * present, then transfer state to nchain.
2736          */
2737         odata = ochain->data;
2738         hammer2_chain_unlock(ochain);   /* replacing ochain */
2739         KKASSERT(ochain->bref.type == HAMMER2_BREF_TYPE_INODE ||
2740                  ochain->data == NULL);
2741
2742         if (oflags & HAMMER2_CHAIN_INITIAL)
2743                 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL);
2744
2745         /*
2746          * WARNING!  We should never resolve DATA to device buffers
2747          *           (XXX allow it if the caller did?), and since
2748          *           we currently do not have the logical buffer cache
2749          *           buffer in-hand to fix its cached physical offset
2750          *           we also force the modify code to not COW it. XXX
2751          */
2752         if (oflags & HAMMER2_CHAIN_MODIFIED) {
2753                 if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
2754                         hammer2_chain_modify(trans, &nchain,
2755                                              HAMMER2_MODIFY_OPTDATA |
2756                                              HAMMER2_MODIFY_NOREALLOC |
2757                                              HAMMER2_MODIFY_ASSERTNOCOPY);
2758                 } else if (oflags & HAMMER2_CHAIN_INITIAL) {
2759                         hammer2_chain_modify(trans, &nchain,
2760                                              HAMMER2_MODIFY_OPTDATA |
2761                                              HAMMER2_MODIFY_ASSERTNOCOPY);
2762                 } else {
2763                         hammer2_chain_modify(trans, &nchain,
2764                                              HAMMER2_MODIFY_ASSERTNOCOPY);
2765                 }
2766                 hammer2_chain_drop(nchain);
2767         } else {
2768                 if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
2769                         hammer2_chain_drop(nchain);
2770                 } else if (oflags & HAMMER2_CHAIN_INITIAL) {
2771                         hammer2_chain_drop(nchain);
2772                 } else {
2773                         hammer2_chain_lock(nchain, HAMMER2_RESOLVE_ALWAYS |
2774                                                    HAMMER2_RESOLVE_NOREF);
2775                         hammer2_chain_unlock(nchain);
2776                 }
2777         }
2778
2779         /*
2780          * Unconditionally set the MOVED and SUBMODIFIED bit to force
2781          * update of parent bref and indirect blockrefs during flush.
2782          */
2783         if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2784                 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED);
2785                 hammer2_chain_ref(nchain);
2786         }
2787         atomic_set_int(&nchain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2788         hammer2_chain_setsubmod(trans, nchain);
2789         *chainp = nchain;
2790 }
2791
2792 /*
2793  * Helper function to fixup inodes.  The caller procedure stack may hold
2794  * multiple locks on ochain if it represents an inode, preventing our
2795  * unlock from retiring its state to the buffer cache.
2796  *
2797  * In this situation any attempt to access the buffer cache could result
2798  * either in stale data or a deadlock.  Work around the problem by copying
2799  * the embedded data directly.
2800  */
2801 static
2802 void
2803 hammer2_chain_dup_fixup(hammer2_chain_t *ochain, hammer2_chain_t *nchain)
2804 {
2805         if (ochain->data == NULL)
2806                 return;
2807         switch(ochain->bref.type) {
2808         case HAMMER2_BREF_TYPE_INODE:
2809                 KKASSERT(nchain->data == NULL);
2810                 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_EMBEDDED);
2811                 nchain->data = kmalloc(sizeof(nchain->data->ipdata),
2812                                        ochain->hmp->mchain, M_WAITOK | M_ZERO);
2813                 nchain->data->ipdata = ochain->data->ipdata;
2814                 break;
2815         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2816                 KKASSERT(nchain->data == NULL);
2817                 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_EMBEDDED);
2818                 nchain->data = kmalloc(sizeof(nchain->data->bmdata),
2819                                        ochain->hmp->mchain, M_WAITOK | M_ZERO);
2820                 bcopy(ochain->data->bmdata,
2821                       nchain->data->bmdata,
2822                       sizeof(nchain->data->bmdata));
2823                 break;
2824         default:
2825                 break;
2826         }
2827 }
2828
2829 /*
2830  * Create a snapshot of the specified {parent, chain} with the specified
2831  * label.
2832  *
2833  * (a) We create a duplicate connected to the super-root as the specified
2834  *     label.
2835  *
2836  * (b) We issue a restricted flush using the current transaction on the
2837  *     duplicate.
2838  *
2839  * (c) We disconnect and reallocate the duplicate's core.
2840  */
int
hammer2_chain_snapshot(hammer2_trans_t *trans, hammer2_inode_t *ip,
		       hammer2_ioc_pfs_t *pfs)
{
	hammer2_cluster_t *cluster;
	hammer2_mount_t *hmp;
	hammer2_chain_t *chain;
	hammer2_chain_t *nchain;
	hammer2_chain_t *parent;
	hammer2_inode_data_t *ipdata;
	size_t name_len;
	hammer2_key_t key_dummy;
	hammer2_key_t lhc;
	int error;
	int cache_index = -1;

	/* Hash the snapshot label to pick its key in the super-root */
	name_len = strlen(pfs->name);
	lhc = hammer2_dirhash(pfs->name, name_len);
	cluster = ip->pmp->mount_cluster;
	hmp = ip->chain->hmp;
	KKASSERT(hmp == cluster->hmp);	/* XXX */

	/*
	 * Create disconnected duplicate
	 *
	 * (duplicate with parent == NULL leaves nchain unattached; it is
	 * connected under the super-root by chain_create() below)
	 */
	KKASSERT((trans->flags & HAMMER2_TRANS_RESTRICTED) == 0);
	nchain = ip->chain;
	hammer2_chain_lock(nchain, HAMMER2_RESOLVE_MAYBE);
	hammer2_chain_duplicate(trans, NULL, &nchain, NULL);
	atomic_set_int(&nchain->flags, HAMMER2_CHAIN_RECYCLE |
				       HAMMER2_CHAIN_SNAPSHOT);

	/*
	 * Create named entry in the super-root.
	 *
	 * Scan upward from the dirhash until a free key is found.  If the
	 * low-mask iteration space is exhausted error is set to ENOSPC.
	 *
	 * NOTE(review): when ENOSPC is hit the code still falls through to
	 * hammer2_chain_create() below and only reports the error via the
	 * function's return value — confirm this is intended.
	 */
	parent = hammer2_chain_lookup_init(hmp->schain, 0);
	error = 0;
	while (error == 0) {
		chain = hammer2_chain_lookup(&parent, &key_dummy,
					     lhc, lhc, &cache_index, 0);
		if (chain == NULL)
			break;
		if ((lhc & HAMMER2_DIRHASH_LOMASK) == HAMMER2_DIRHASH_LOMASK)
			error = ENOSPC;
		hammer2_chain_unlock(chain);
		chain = NULL;
		++lhc;
	}
	hammer2_chain_create(trans, &parent, &nchain, lhc, 0,
			     HAMMER2_BREF_TYPE_INODE,
			     HAMMER2_INODE_BYTES);
	hammer2_chain_modify(trans, &nchain, HAMMER2_MODIFY_ASSERTNOCOPY);
	hammer2_chain_lookup_done(parent);
	parent = NULL;	/* safety */

	/*
	 * Name fixup
	 *
	 * Stamp the snapshot inode with the label and the key it was
	 * actually inserted at.
	 */
	ipdata = &nchain->data->ipdata;
	ipdata->name_key = lhc;
	ipdata->name_len = name_len;
	ksnprintf(ipdata->filename, sizeof(ipdata->filename), "%s", pfs->name);

	/*
	 * Set PFS type, generate a unique filesystem id, and generate
	 * a cluster id.  Use the same clid when snapshotting a PFS root,
	 * which theoretically allows the snapshot to be used as part of
	 * the same cluster (perhaps as a cache).
	 */
	ipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
	kern_uuidgen(&ipdata->pfs_fsid, 1);
	if (ip->chain == cluster->rchain)
		ipdata->pfs_clid = ip->chain->data->ipdata.pfs_clid;
	else
		kern_uuidgen(&ipdata->pfs_clid, 1);

	/*
	 * Issue a restricted flush of the snapshot.  This is a synchronous
	 * operation.
	 *
	 * NOTE(review): the kprintf()s and the 4-second tsleep() below look
	 * like debugging instrumentation left in place — confirm before
	 * shipping.
	 */
	trans->flags |= HAMMER2_TRANS_RESTRICTED;
	kprintf("SNAPSHOTA\n");
	tsleep(trans, 0, "snapslp", hz*4);
	kprintf("SNAPSHOTB\n");
	hammer2_chain_flush(trans, nchain);
	trans->flags &= ~HAMMER2_TRANS_RESTRICTED;

#if 0
	/*
	 * Remove the link b/c nchain is a snapshot and snapshots don't
	 * follow CHAIN_DELETED semantics ?
	 */
	chain = ip->chain;


	KKASSERT(chain->duplink == nchain);
	KKASSERT(chain->core == nchain->core);
	KKASSERT(nchain->refs >= 2);
	chain->duplink = nchain->duplink;
	atomic_clear_int(&nchain->flags, HAMMER2_CHAIN_DUPTARGET);
	hammer2_chain_drop(nchain);
#endif

	kprintf("snapshot %s nchain->refs %d nchain->flags %08x\n",
		pfs->name, nchain->refs, nchain->flags);
	hammer2_chain_unlock(nchain);

	return (error);
}
2950
2951 /*
2952  * Create an indirect block that covers one or more of the elements in the
2953  * current parent.  Either returns the existing parent with no locking or
2954  * ref changes or returns the new indirect block locked and referenced
2955  * and leaving the original parent lock/ref intact as well.
2956  *
2957  * If an error occurs, NULL is returned and *errorp is set to the error.
2958  *
2959  * The returned chain depends on where the specified key falls.
2960  *
2961  * The key/keybits for the indirect mode only needs to follow four rules:
2962  *
2963  * (1) That all elements underneath it fit within its key space and
2964  *
2965  * (2) That all elements outside it are outside its key space.
2966  *
2967  * (3) When creating the new indirect block any elements in the current
2968  *     parent that fit within the new indirect block's keyspace must be
2969  *     moved into the new indirect block.
2970  *
2971  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
2972  *     keyspace than the current parent, but lookup/iteration rules will
2973  *     ensure (and must ensure) that rule (2) for all parents leading up
2974  *     to the nearest inode or the root volume header is adhered to.  This
2975  *     is accomplished by always recursing through matching keyspaces in
2976  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
2977  *
2978  * The current implementation calculates the current worst-case keyspace by
2979  * iterating the current parent and then divides it into two halves, choosing
2980  * whichever half has the most elements (not necessarily the half containing
2981  * the requested key).
2982  *
2983  * We can also opt to use the half with the least number of elements.  This
2984  * causes lower-numbered keys (aka logical file offsets) to recurse through
2985  * fewer indirect blocks and higher-numbered keys to recurse through more.
2986  * This also has the risk of not moving enough elements to the new indirect
2987  * block and being forced to create several indirect blocks before the element
2988  * can be inserted.
2989  *
2990  * Must be called with an exclusively locked parent.
2991  */
2992 static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
2993                                 hammer2_key_t *keyp, int keybits,
2994                                 hammer2_blockref_t *base, int count);
2995 static int hammer2_chain_indkey_normal(hammer2_chain_t *parent,
2996                                 hammer2_key_t *keyp, int keybits,
2997                                 hammer2_blockref_t *base, int count);
static
hammer2_chain_t *
hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
			      hammer2_key_t create_key, int create_bits,
			      int for_type, int *errorp)
{
	hammer2_mount_t *hmp;
	hammer2_chain_core_t *above;	/* parent's core (shared RBTREE) */
	hammer2_chain_core_t *icore;	/* new indirect block's core */
	hammer2_blockref_t *base;	/* parent's media blockref array */
	hammer2_blockref_t *bref;	/* current element being considered */
	hammer2_chain_t *chain;
	hammer2_chain_t *ichain;	/* the new indirect block chain */
	hammer2_chain_t dummy;		/* template bref for chain_alloc */
	hammer2_key_t key = create_key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int keybits = create_bits;
	int count;
	int nbytes;
	int i;
	int cache_index;

	/*
	 * Calculate the base blockref pointer or NULL if the chain
	 * is known to be empty.  We need to calculate the array count
	 * for RB lookups either way.
	 *
	 * An INITIAL parent has no media data yet, so base stays NULL
	 * and only the in-memory RBTREE is scanned below.
	 */
	hmp = parent->hmp;
	*errorp = 0;
	KKASSERT(ccms_thread_lock_owned(&parent->core->cst));
	above = parent->core;

	/*hammer2_chain_modify(trans, &parent, HAMMER2_MODIFY_OPTDATA);*/
	if (parent->flags & HAMMER2_CHAIN_INITIAL) {
		base = NULL;

		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			count = HAMMER2_SET_COUNT;
			break;
		default:
			panic("hammer2_chain_create_indirect: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
			count = 0;
			break;
		}
	} else {
		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			base = &parent->data->ipdata.u.blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			base = &parent->data->npdata[0];
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &hmp->voldata.sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			base = &hmp->voldata.freemap_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			panic("hammer2_chain_create_indirect: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
			count = 0;
			break;
		}
	}

	/*
	 * dummy used in later chain allocation (no longer used for lookups).
	 */
	bzero(&dummy, sizeof(dummy));
	dummy.delete_tid = HAMMER2_MAX_TID;

	/*
	 * When creating an indirect block for a freemap node or leaf
	 * the key/keybits must be fitted to static radix levels because
	 * particular radix levels use particular reserved blocks in the
	 * related zone.
	 *
	 * This routine calculates the key/radix of the indirect block
	 * we need to create, and whether it is on the high-side or the
	 * low-side.
	 */
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
						       base, count);
	} else {
		keybits = hammer2_chain_indkey_normal(parent, &key, keybits,
						      base, count);
	}

	/*
	 * Normalize the key for the radix being represented, keeping the
	 * high bits and throwing away the low bits.
	 */
	key &= ~(((hammer2_key_t)1 << keybits) - 1);

	/*
	 * How big should our new indirect block be?  It has to be at least
	 * as large as its parent.
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
		nbytes = HAMMER2_IND_BYTES_MIN;
	else
		nbytes = HAMMER2_IND_BYTES_MAX;
	if (nbytes < count * sizeof(hammer2_blockref_t))
		nbytes = count * sizeof(hammer2_blockref_t);

	/*
	 * Ok, create our new indirect block
	 */
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
	} else {
		dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
	}
	dummy.bref.key = key;
	dummy.bref.keybits = keybits;
	dummy.bref.data_off = hammer2_getradix(nbytes);
	dummy.bref.methods = parent->bref.methods;

	ichain = hammer2_chain_alloc(hmp, parent->pmp, trans, &dummy.bref);
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
	hammer2_chain_core_alloc(trans, ichain, NULL);
	icore = ichain->core;
	hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
	hammer2_chain_drop(ichain);	/* excess ref from alloc */

	/*
	 * We have to mark it modified to allocate its block, but use
	 * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
	 * it won't be acted upon by the flush code.
	 *
	 * XXX leave the node unmodified, depend on the SUBMODIFIED
	 * flush to assign and modify parent blocks.
	 */
	hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA);

	/*
	 * Iterate the original parent and move the matching brefs into
	 * the new indirect block.
	 *
	 * At each iteration the nearest element >= key_beg is selected
	 * from either the media block array (base) or the in-memory
	 * RBTREE (chain), with the RBTREE overriding the array when both
	 * contain the same key.
	 *
	 * XXX handle flushes.
	 */
	key_beg = 0;
	key_end = HAMMER2_MAX_TID;
	cache_index = 0;
	spin_lock(&above->cst.spin);

	for (;;) {
		/*
		 * (i) is set to (count) on failure.  Otherwise the element at
		 * base[i] points to an entry >= key_beg.
		 */
		key_next = key_end + 1;
		i = hammer2_base_find(base, count, above, &cache_index,
				      &key_next, key_beg);
		chain = hammer2_chain_find(parent, &key_next,
					   key_beg, key_end);
		if (i == count) {
			/*
			 * Nothing in block array, select the chain
			 */
			if (chain == NULL)
				break;
			if (chain->flags & HAMMER2_CHAIN_DELETED) {
				/* deleted chains are skipped, not moved */
				if (key_next == 0)
					break;
				key_beg = key_next;
				continue;
			}
			bref = &chain->bref;
		} else if (chain == NULL || base[i].key < chain->bref.key) {
			/*
			 * RBTREE is empty or Nearest key is in block array.
			 * Set chain to NULL so the code down below knows
			 * where the bref is coming from.
			 */
			bref = &base[i];
			chain = NULL;
		} else {
			/*
			 * Chain overrides base[i] or nearest key is chain
			 */
			if (chain->flags & HAMMER2_CHAIN_DELETED) {
				if (key_next == 0)
					break;
				key_beg = key_next;
				continue;
			}
			bref = &chain->bref;
		}

		/*
		 * Skip keys that are not within the key/radix of the new
		 * indirect block.  They stay in the parent.
		 */
		if ((~(((hammer2_key_t)1 << keybits) - 1) &
		    (key ^ bref->key)) != 0) {
			if (key_next == 0)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Load the new indirect block by acquiring or allocating
		 * the related chain, then move it to the new parent (ichain)
		 * via DELETE-DUPLICATE.
		 *
		 * WARNING! above->cst.spin must be held when parent is
		 *          modified, even though we own the full blown lock,
		 *          to deal with setsubmod and rename races.
		 *          (XXX remove this req).
		 */
		if (chain) {
			/*
			 * Use chain already present in the RBTREE
			 */
			hammer2_chain_ref(chain);
			spin_unlock(&above->cst.spin);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER |
						  HAMMER2_RESOLVE_NOREF);
		} else {
			/*
			 * Get chain for blockref element.  _get returns NULL
			 * on insertion race.
			 */
			spin_unlock(&above->cst.spin);
			chain = hammer2_chain_get(parent, bref);
			if (chain == NULL)
				continue;
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER |
						  HAMMER2_RESOLVE_NOREF);
		}
		/*
		 * WILLDUP prevents the delete from freeing the media block
		 * since the duplicate immediately reuses it under ichain.
		 */
		hammer2_chain_delete(trans, chain, HAMMER2_DELETE_WILLDUP);
		hammer2_chain_duplicate(trans, ichain, &chain, NULL);
		hammer2_chain_unlock(chain);
		KKASSERT(parent->refs > 0);
		chain = NULL;
		spin_lock(&above->cst.spin);
		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	spin_unlock(&above->cst.spin);

	/*
	 * Insert the new indirect block into the parent now that we've
	 * cleared out some entries in the parent.  We calculated a good
	 * insertion index in the loop above (ichain->index).
	 *
	 * We don't have to set MOVED here because we mark ichain modified
	 * down below (so the normal modified -> flush -> set-moved sequence
	 * applies).
	 *
	 * The insertion shouldn't race as this is a completely new block
	 * and the parent is locked.
	 */
	KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
	hammer2_chain_insert(above, ichain, HAMMER2_CHAIN_INSERT_SPIN |
					    HAMMER2_CHAIN_INSERT_LIVE);

	/*
	 * Mark the new indirect block modified after insertion, which
	 * will propagate up through parent all the way to the root and
	 * also allocate the physical block in ichain for our caller,
	 * and assign ichain->data to a pre-zero'd space (because there
	 * is not prior data to copy into it).
	 *
	 * We have to set SUBMODIFIED in ichain's flags manually so the
	 * flusher knows it has to recurse through it to get to all of
	 * our moved blocks, then call setsubmod() to set the bit
	 * recursively.
	 */
	/*hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA);*/
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
	hammer2_chain_setsubmod(trans, ichain);

	/*
	 * Figure out what to return.
	 */
	if (~(((hammer2_key_t)1 << keybits) - 1) &
		   (create_key ^ key)) {
		/*
		 * Key being created is outside the key range,
		 * return the original parent.
		 */
		hammer2_chain_unlock(ichain);
	} else {
		/*
		 * Otherwise it's in the range, return the new parent.
		 * (leave both the new and old parent locked).
		 */
		parent = ichain;
	}

	return(parent);
}
3319
3320 /*
3321  * Calculate the keybits and highside/lowside of the freemap node the
3322  * caller is creating.
3323  *
3324  * This routine will specify the next higher-level freemap key/radix
3325  * representing the lowest-ordered set.  By doing so, eventually all
3326  * low-ordered sets will be moved one level down.
3327  *
3328  * We have to be careful here because the freemap reserves a limited
3329  * number of blocks for a limited number of levels.  So we can't just
3330  * push indiscriminately.
3331  */
3332 int
3333 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
3334                              int keybits, hammer2_blockref_t *base, int count)
3335 {
3336         hammer2_chain_core_t *above;
3337         hammer2_chain_t *chain;
3338         hammer2_blockref_t *bref;
3339         hammer2_key_t key;
3340         hammer2_key_t key_beg;
3341         hammer2_key_t key_end;
3342         hammer2_key_t key_next;
3343         int cache_index;
3344         int locount;
3345         int hicount;
3346         int i;
3347
3348         key = *keyp;
3349         above = parent->core;
3350         locount = 0;
3351         hicount = 0;
3352         keybits = 64;
3353
3354         /*
3355          * Calculate the range of keys in the array being careful to skip
3356          * slots which are overridden with a deletion.
3357          */
3358         key_beg = 0;
3359         key_end = HAMMER2_MAX_TID;
3360         cache_index = 0;
3361         spin_lock(&above->cst.spin);
3362
3363         for (;;) {
3364                 /*
3365                  * (i) is set to (count) on failure.  Otherwise the element at
3366                  * base[i] points to an entry >= key_beg.
3367                  */
3368                 key_next = key_end + 1;
3369                 i = hammer2_base_find(base, count, above, &cache_index,
3370                                       &key_next, key_beg);
3371                 chain = hammer2_chain_find(parent, &key_next,
3372                                            key_beg, key_end);
3373                 if (i == count) {
3374                         /*
3375                          * Nothing in block array, select the chain
3376                          */
3377                         if (chain == NULL)
3378                                 break;
3379                         if (chain->flags & HAMMER2_CHAIN_DELETED) {
3380                                 if (key_next == 0)
3381                                         break;
3382                                 key_beg = key_next;
3383                                 continue;
3384                         }
3385                         bref = &chain->bref;
3386                 } else if (chain == NULL || base[i].key < chain->bref.key) {
3387                         /*
3388                          * RBTREE is empty or Nearest key is in block array.
3389                          *
3390                          * chain returns NULL on insertion race.
3391                          */
3392                         bref = &base[i];
3393                 } else {
3394                         /*
3395                          * Chain overrides base[i] or nearest key is chain
3396                          */
3397                         if (chain->flags & HAMMER2_CHAIN_DELETED) {
3398                                 if (key_next == 0)
3399                                         break;
3400                                 key_beg = key_next;
3401                                 continue;
3402                         }
3403                         bref = &chain->bref;
3404                 }
3405
3406                 if (keybits > bref->keybits) {
3407                         key = bref->key;
3408                         keybits = bref->keybits;
3409                 } else if (keybits == bref->keybits && bref->key < key) {
3410                         key = bref->key;
3411                 }
3412                 if (key_next == 0)
3413                         break;
3414                 key_beg = key_next;
3415         }
3416         spin_unlock(&above->cst.spin);
3417
3418         /*
3419          * Return the keybits for a higher-level FREEMAP_NODE covering
3420          * this node.
3421          */
3422         switch(keybits) {
3423         case HAMMER2_FREEMAP_LEVEL0_RADIX:
3424                 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
3425                 break;
3426         case HAMMER2_FREEMAP_LEVEL1_RADIX:
3427                 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
3428                 break;
3429         case HAMMER2_FREEMAP_LEVEL2_RADIX:
3430                 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
3431                 break;
3432         case HAMMER2_FREEMAP_LEVEL3_RADIX:
3433                 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
3434                 break;
3435         case HAMMER2_FREEMAP_LEVEL4_RADIX:
3436                 panic("hammer2_chain_indkey_freemap: level too high");
3437                 break;
3438         default:
3439                 panic("hammer2_chain_indkey_freemap: bad radix");
3440                 break;
3441         }
3442         *keyp = key;
3443
3444         return (keybits);
3445 }
3446
3447 /*
3448  * Calculate the keybits and highside/lowside of the indirect block the
3449  * caller is creating.
3450  */
3451 static int
3452 hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp,
3453                             int keybits, hammer2_blockref_t *base, int count)
3454 {
3455         hammer2_chain_core_t *above;
3456         hammer2_blockref_t *bref;
3457         hammer2_chain_t *chain;
3458         hammer2_key_t key_beg;
3459         hammer2_key_t key_end;
3460         hammer2_key_t key_next;
3461         hammer2_key_t key;
3462         int nkeybits;
3463         int locount;
3464         int hicount;
3465         int cache_index;
3466         int i;
3467
3468         key = *keyp;
3469         above = parent->core;
3470         locount = 0;
3471         hicount = 0;
3472
3473         /*
3474          * Calculate the range of keys in the array being careful to skip
3475          * slots which are overridden with a deletion.  Once the scan
3476          * completes we will cut the key range in half and shift half the
3477          * range into the new indirect block.
3478          */
3479         key_beg = 0;
3480         key_end = HAMMER2_MAX_TID;
3481         spin_lock(&above->cst.spin);
3482
3483         for (;;) {
3484                 /*
3485                  * (i) is set to (count) on failure.  Otherwise the element at
3486                  * base[i] points to an entry >= key_beg.
3487                  */
3488                 key_next = key_end + 1;
3489                 i = hammer2_base_find(base, count, above, &cache_index,
3490                                       &key_next, key_beg);
3491                 chain = hammer2_chain_find(parent, &key_next,
3492                                            key_beg, key_end);
3493                 if (i == count) {
3494                         /*
3495                          * Nothing in block array, select the chain
3496                          */
3497                         if (chain == NULL)
3498                                 break;
3499                         if (chain->flags & HAMMER2_CHAIN_DELETED) {
3500                                 if (key_next == 0)
3501                                         break;
3502                                 key_beg = key_next;
3503                                 continue;
3504                         }
3505                         bref = &chain->bref;
3506                 } else if (chain == NULL || base[i].key < chain->bref.key) {
3507                         /*
3508                          * RBTREE is empty or Nearest key is in block array.
3509                          *
3510                          * chain returns NULL on insertion race.
3511                          */
3512                         bref = &base[i];
3513                 } else {
3514                         /*
3515                          * Chain overrides base[i] or nearest key is chain
3516                          */
3517                         if (chain->flags & HAMMER2_CHAIN_DELETED) {
3518                                 if (key_next == 0)
3519                                         break;
3520                                 key_beg = key_next;
3521                                 continue;
3522                         }
3523                         bref = &chain->bref;
3524                 }
3525
3526                 /*
3527                  * Expand our calculated key range (key, keybits) to fit
3528                  * the scanned key.  nkeybits represents the full range
3529                  * that we will later cut in half (two halves @ nkeybits - 1).
3530                  */
3531                 nkeybits = keybits;
3532                 if (nkeybits < bref->keybits) {
3533                         if (bref->keybits > 64) {
3534                                 kprintf("bad bref index %d chain %p bref %p\n",
3535                                         i, chain, bref);
3536                                 Debugger("fubar");
3537                         }
3538                         nkeybits = bref->keybits;
3539                 }
3540                 while (nkeybits < 64 &&
3541                        (~(((hammer2_key_t)1 << nkeybits) - 1) &
3542                         (key ^ bref->key)) != 0) {
3543                         ++nkeybits;
3544                 }
3545
3546                 /*
3547                  * If the new key range is larger we have to determine
3548                  * which side of the new key range the existing keys fall
3549                  * under by checking the high bit, then collapsing the
3550                  * locount into the hicount or vise-versa.
3551                  */
3552                 if (keybits != nkeybits) {
3553                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
3554                                 hicount += locount;
3555                                 locount = 0;
3556                         } else {
3557                                 locount += hicount;
3558                                 hicount = 0;
3559                         }
3560                         keybits = nkeybits;
3561                 }
3562
3563                 /*
3564                  * The newly scanned key will be in the lower half or the
3565                  * higher half of the (new) key range.
3566                  */
3567                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
3568                         ++hicount;
3569                 else
3570                         ++locount;
3571
3572                 if (key_next == 0)
3573                         break;
3574                 key_beg = key_next;
3575         }
3576         spin_unlock(&above->cst.spin);
3577         bref = NULL;    /* now invalid (safety) */
3578
3579         /*
3580          * Adjust keybits to represent half of the full range calculated
3581          * above (radix 63 max)
3582          */
3583         --keybits;
3584
3585         /*
3586          * Select whichever half contains the most elements.  Theoretically
3587          * we can select either side as long as it contains at least one
3588          * element (in order to ensure that a free slot is present to hold
3589          * the indirect block).
3590          */
3591         if (hammer2_indirect_optimize) {
3592                 /*
3593                  * Insert node for least number of keys, this will arrange
3594                  * the first few blocks of a large file or the first few
3595                  * inodes in a directory with fewer indirect blocks when
3596                  * created linearly.
3597                  */
3598                 if (hicount < locount && hicount != 0)
3599                         key |= (hammer2_key_t)1 << keybits;
3600                 else
3601                         key &= ~(hammer2_key_t)1 << keybits;
3602         } else {
3603                 /*
3604                  * Insert node for most number of keys, best for heavily
3605                  * fragmented files.
3606                  */
3607                 if (hicount > locount)
3608                         key |= (hammer2_key_t)1 << keybits;
3609                 else
3610                         key &= ~(hammer2_key_t)1 << keybits;
3611         }
3612         *keyp = key;
3613
3614         return (keybits);
3615 }
3616
3617 /*
3618  * Sets CHAIN_DELETED and CHAIN_MOVED in the chain being deleted and
3619  * set chain->delete_tid.
3620  *
3621  * This function does NOT generate a modification to the parent.  It
3622  * would be nearly impossible to figure out which parent to modify anyway.
3623  * Such modifications are handled by the flush code and are properly merged
3624  * using the flush synchronization point.
3625  *
3626  * The find/get code will properly overload the RBTREE check on top of
3627  * the bref check to detect deleted entries.
3628  *
3629  * This function is NOT recursive.  Any entity already pushed into the
3630  * chain (such as an inode) may still need visibility into its contents,
3631  * as well as the ability to read and modify the contents.  For example,
3632  * for an unlinked file which is still open.
3633  *
3634  * NOTE: This function does NOT set chain->modify_tid, allowing future
3635  *       code to distinguish between live and deleted chains by testing
3636  *       sync_tid.
3637  *
3638  * NOTE: Deletions normally do not occur in the middle of a duplication
3639  *       chain but we use a trick for hardlink migration that refactors
3640  *       the originating inode without deleting it, so we make no assumptions
3641  *       here.
3642  */
void
hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags)
{
	/* caller must hold the chain's full-blown lock */
	KKASSERT(ccms_thread_lock_owned(&chain->core->cst));

	/*
	 * Nothing to do if already marked.
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED)
		return;

	/*
	 * We must set MOVED along with DELETED for the flush code to
	 * recognize the operation and properly disconnect the chain
	 * in-memory.
	 *
	 * The setting of DELETED causes finds, lookups, and _next iterations
	 * to no longer recognize the chain.  RB_SCAN()s will still have
	 * visibility (needed for flush serialization points).
	 *
	 * We need the spinlock on the core whose RBTREE contains chain
	 * to protect against races.
	 */
	KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
	spin_lock(&chain->above->cst.spin);

	/* stamp the deletion with the transaction's sync point */
	chain->delete_tid = trans->sync_tid;
	/*chain->delete_gen = ++trans->delete_gen;*/
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
	atomic_add_int(&chain->above->live_count, -1);

	/* MOVED carries a ref so the flusher can safely reach the chain */
	if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
	spin_unlock(&chain->above->cst.spin);

	/*
	 * Mark the underlying block as possibly being free unless WILLDUP
	 * is set.  Duplication can occur in many situations, particularly
	 * when chains are moved to indirect blocks.
	 */
	if ((flags & HAMMER2_DELETE_WILLDUP) == 0)
		hammer2_freemap_free(trans, chain->hmp, &chain->bref, 0);
	hammer2_chain_setsubmod(trans, chain);
}
3689
3690 /*
3691  * Called with the core spinlock held to check for freeable layers.
3692  * Used by the flush code.  Layers can wind up not being freed due
3693  * to the temporary layer->refs count.  This function frees up any
3694  * layers that were missed.
3695  */
void
hammer2_chain_layer_check_locked(hammer2_mount_t *hmp,
                                 hammer2_chain_core_t *core)
{
        hammer2_chain_layer_t *layer;
        hammer2_chain_layer_t *tmp;

        /*
         * Walk the layer queue, freeing any layer with no refs and an
         * empty rbtree.  The next pointer is sampled before any removal
         * so the iteration survives deleting the current element.
         */
        tmp = TAILQ_FIRST(&core->layerq);
        while ((layer = tmp) != NULL) {
                tmp = TAILQ_NEXT(tmp, entry);
                if (layer->refs == 0 && RB_EMPTY(&layer->rbtree)) {
                        TAILQ_REMOVE(&core->layerq, layer, entry);
                        /*
                         * Pin the successor with a temporary ref while
                         * the core spinlock is dropped around kfree()
                         * (which may block), so the successor cannot be
                         * freed out from under the iteration.
                         */
                        if (tmp)
                                ++tmp->refs;
                        spin_unlock(&core->cst.spin);
                        kfree(layer, hmp->mchain);
                        spin_lock(&core->cst.spin);
                        if (tmp)
                                --tmp->refs;
                }
        }
}
3718
3719 /*
3720  * Returns the index of the nearest element in the blockref array >= elm.
3721  * Returns (count) if no element could be found.
3722  *
3723  * Sets *key_nextp to the next key for loop purposes but does not modify
3724  * it if the next key would be higher than the current value of *key_nextp.
 * Note that *key_nextp can overflow to 0, which should be tested by the
 * caller.
3727  *
 * (*cache_indexp) is a heuristic and can be any value without affecting
 * the result.
3730  *
3731  * The spin lock on the related chain must be held.
3732  */
int
hammer2_base_find(hammer2_blockref_t *base, int count,
                  hammer2_chain_core_t *above, int *cache_indexp,
                  hammer2_key_t *key_nextp, hammer2_key_t key)
{
        hammer2_blockref_t *scan;
        hammer2_key_t scan_end;
        int i;

        /*
         * Degenerate case
         */
        KKASSERT(above->flags & HAMMER2_CORE_COUNTEDBREFS);
        if (count == 0 || base == NULL)
                return(count);

        /*
         * Sequential optimization: start from the cached index, clamped
         * into the live portion of the array ([0, live_zero)).
         */
        i = *cache_indexp;
        cpu_ccfence();
        if (i >= above->live_zero)
                i = above->live_zero - 1;
        if (i < 0)
                i = 0;

        /*
         * Search backwards until we land on a live element whose key is
         * <= the requested key (or hit index 0).
         */
        scan = &base[i];
        while (i > 0 && (scan->type == 0 || scan->key > key)) {
                --scan;
                --i;
        }
        *cache_indexp = i;

        /*
         * Search forwards, stop when we find a scan element which
         * enclosed the key or until we know that there are no further
         * elements.
         */
        while (i < count) {
                if (scan->type != 0) {
                        /* first element entirely beyond key */
                        if (scan->key > key)
                                break;
                        /* element whose [key, key+2^keybits) range covers key */
                        scan_end = scan->key +
                                   ((hammer2_key_t)1 << scan->keybits) - 1;
                        if (scan_end >= key)
                                break;
                }
                /* nothing live exists at or beyond live_zero */
                if (i >= above->live_zero)
                        return (count);
                ++scan;
                ++i;
        }
        if (i != count) {
                /*
                 * Clip *key_nextp down to the end of the found element's
                 * range.  scan_end wraps to 0 at the top of the key
                 * space, in which case *key_nextp is left alone.
                 */
                scan_end = scan->key + ((hammer2_key_t)1 << scan->keybits);
                if (scan_end && (*key_nextp > scan_end || *key_nextp == 0))
                        *key_nextp = scan_end;
                *cache_indexp = i;
        }
        return (i);
}
3796
3797 /*
3798  * Locate the specified block array element and delete it.  The element
3799  * must exist.
3800  *
3801  * The spin lock on the related chain must be held.
3802  *
3803  * NOTE: live_count was adjusted when the chain was deleted, so it does not
3804  *       need to be adjusted when we commit the media change.
3805  */
3806 void
3807 hammer2_base_delete(hammer2_blockref_t *base, int count,
3808                     hammer2_chain_core_t *above,
3809                     int *cache_indexp, hammer2_blockref_t *elm)
3810 {
3811         hammer2_key_t key_next;
3812         int i;
3813
3814         /*
3815          * Delete element.  Expect the element to exist.
3816          *
3817          * XXX see caller, flush code not yet sophisticated enough to prevent
3818          *     re-flushed in some cases.
3819          */
3820         key_next = 0; /* max range */
3821         i = hammer2_base_find(base, count, above, cache_indexp,
3822                               &key_next, elm->key);
3823         if (i == count || base[i].type == 0 ||
3824             base[i].key != elm->key || base[i].keybits != elm->keybits) {
3825                 kprintf("hammer2_base_delete: duplicate key %016jx/%d\n",
3826                         elm->key, elm->keybits);
3827                 return;
3828         }
3829 #if 0
3830         KKASSERT(i != count);
3831         KKASSERT(base[i].type &&
3832                  base[i].key == elm->key && base[i].keybits == elm->keybits);
3833 #endif
3834         bzero(&base[i], sizeof(*base));
3835         if (above->live_zero == i + 1) {
3836                 while (--i >= 0 && base[i].type == 0)
3837                         ;
3838                 above->live_zero = i + 1;
3839         }
3840 }
3841
3842 /*
3843  * Insert the specified element.  The block array must have space and
3844  * will be rearranged as necessary.
3845  *
3846  * The spin lock on the related chain must be held.
3847  *
 * Test (flags) for HAMMER2_CHAIN_REPLACE.  If set an existing bref
3849  * is replaced, otherwise a new bref is created.  The flag is then set
3850  * prior to return indicating that a bref is now present in the block table.
3851  *
3852  * NOTE: live_count was adjusted when the chain was deleted, so it does not
3853  *       need to be adjusted when we commit the media change.
3854  */
void
hammer2_base_insert(hammer2_blockref_t *base, int count,
                    hammer2_chain_core_t *above,
                    int *cache_indexp, hammer2_blockref_t *elm,
                    int flags)
{
        hammer2_key_t key_next;
        int i;
        int j;
        int k;

        /*
         * Insert new element.  Expect the element to not already exist
         * unless we are replacing it.
         *
         * XXX see caller, flush code not yet sophisticated enough to prevent
         *     re-flushes in some cases.
         */
        key_next = 0; /* max range */
        i = hammer2_base_find(base, count, above, cache_indexp,
                              &key_next, elm->key);

        /*
         * Complain and ignore a duplicate key unless the caller asked
         * for a replacement (HAMMER2_CHAIN_REPLACE).
         */
        if (i != count && (flags & HAMMER2_CHAIN_REPLACE) == 0 &&
            base[i].type &&
            base[i].key == elm->key && base[i].keybits == elm->keybits) {
                kprintf("hammer2_base_insert: duplicate key %016jx/%d\n",
                        elm->key, elm->keybits);
                return;
        }
#if 0
        KKASSERT(i == count || (flags & HAMMER2_CHAIN_REPLACE) ||
                 base[i].type == 0 || base[i].key != elm->key ||
                 base[i].keybits != elm->keybits);
#endif

        /*
         * Shortcut fill optimization, typical ordered insertion(s) may not
         * require a search.
         */
        if (i == count && above->live_zero < count) {
                i = above->live_zero++;
                base[i] = *elm;
                return;
        }

        /*
         * Slot already keyed identically: overwrite it in place
         * (covers the REPLACE case).
         */
        if (i < count &&
            base[i].key == elm->key &&
            base[i].keybits == elm->keybits) {
                base[i] = *elm;
                return;
        }

        /*
         * Try to find an empty slot before or after i, shifting the
         * intervening elements toward the hole so elm lands in sorted
         * position.  j scans downward, k scans upward.
         */
        j = i;
        k = i;
        while (j > 0 || k < count) {
                --j;
                if (j >= 0 && base[j].type == 0) {
                        if (j == i - 1) {
                                /* hole is directly below, no shift needed */
                                base[j] = *elm;
                        } else {
                                /* shift [j+1, i-1] down one, insert at i-1 */
                                bcopy(&base[j+1], &base[j],
                                      (i - j - 1) * sizeof(hammer2_blockref_t));
                                base[i - 1] = *elm;
                        }
                        return;
                }
                ++k;
                if (k < count && base[k].type == 0) {
                        /* shift [i, k-1] up one, insert at i */
                        bcopy(&base[i], &base[i+1],
                              (k - i) * sizeof(hammer2_blockref_t));
                        base[i] = *elm;
                        /* the hole consumed may extend the live region */
                        if (above->live_zero <= k)
                                above->live_zero = k + 1;
                        return;
                }
        }
        panic("hammer2_base_insert: no room!");
}
3936
3937 #if 0
3938
3939 /*
3940  * Sort the blockref array for the chain.  Used by the flush code to
3941  * sort the blockref[] array.
3942  *
3943  * The chain must be exclusively locked AND spin-locked.
3944  */
3945 typedef hammer2_blockref_t *hammer2_blockref_p;
3946
3947 static
3948 int
3949 hammer2_base_sort_callback(const void *v1, const void *v2)
3950 {
3951         hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1;
3952         hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2;
3953
3954         /*
3955          * Make sure empty elements are placed at the end of the array
3956          */
3957         if (bref1->type == 0) {
3958                 if (bref2->type == 0)
3959                         return(0);
3960                 return(1);
3961         } else if (bref2->type == 0) {
3962                 return(-1);
3963         }
3964
3965         /*
3966          * Sort by key
3967          */
3968         if (bref1->key < bref2->key)
3969                 return(-1);
3970         if (bref1->key > bref2->key)
3971                 return(1);
3972         return(0);
3973 }
3974
3975 void
3976 hammer2_base_sort(hammer2_chain_t *chain)
3977 {
3978         hammer2_blockref_t *base;
3979         int count;
3980
3981         switch(chain->bref.type) {
3982         case HAMMER2_BREF_TYPE_INODE:
3983                 /*
3984                  * Special shortcut for embedded data returns the inode
3985                  * itself.  Callers must detect this condition and access
3986                  * the embedded data (the strategy code does this for us).
3987                  *
3988                  * This is only applicable to regular files and softlinks.
3989                  */
3990                 if (chain->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA)
3991                         return;
3992                 base = &chain->data->ipdata.u.blockset.blockref[0];
3993                 count = HAMMER2_SET_COUNT;
3994                 break;
3995         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3996         case HAMMER2_BREF_TYPE_INDIRECT:
3997                 /*
3998                  * Optimize indirect blocks in the INITIAL state to avoid
3999                  * I/O.
4000                  */
4001                 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0);
4002                 base = &chain->data->npdata[0];
4003                 count = chain->bytes / sizeof(hammer2_blockref_t);
4004                 break;
4005         case HAMMER2_BREF_TYPE_VOLUME:
4006                 base = &chain->hmp->voldata.sroot_blockset.blockref[0];
4007                 count = HAMMER2_SET_COUNT;
4008                 break;
4009         case HAMMER2_BREF_TYPE_FREEMAP:
4010                 base = &chain->hmp->voldata.freemap_blockset.blockref[0];
4011                 count = HAMMER2_SET_COUNT;
4012                 break;
4013         default:
4014                 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
4015                       chain->bref.type);
4016                 base = NULL;    /* safety */
4017                 count = 0;      /* safety */
4018         }
4019         kqsort(base, count, sizeof(*base), hammer2_base_sort_callback);
4020 }
4021
4022 #endif
4023
4024 /*
4025  * Chain memory management
4026  */
void
hammer2_chain_wait(hammer2_chain_t *chain)
{
        /*
         * Sleep for at most one tick.  Sleeping on the chain address
         * allows an explicit wakeup(chain) to end the wait early;
         * "chnflw" is the wait-message shown by ps(1).
         */
        tsleep(chain, 0, "chnflw", 1);
}
4032
4033 /*
4034  * Manage excessive memory resource use for chain and related
4035  * structures.
4036  */
void
hammer2_chain_memory_wait(hammer2_pfsmount_t *pmp)
{
        /*
         * Currently a no-op: both throttling strategies below are
         * disabled with #if 0.  Strategy 1 blocks the caller until
         * the in-memory chain count drops; strategy 2 merely nudges
         * the syncer.  Kept for reference.
         */
#if 0
        while (pmp->inmem_chains > desiredvnodes / 10 &&
               pmp->inmem_chains > pmp->mp->mnt_nvnodelistsize * 2) {
                kprintf("w");
                speedup_syncer();
                pmp->inmem_waiting = 1;
                tsleep(&pmp->inmem_waiting, 0, "chnmem", hz);
        }
#endif
#if 0
        if (pmp->inmem_chains > desiredvnodes / 10 &&
            pmp->inmem_chains > pmp->mp->mnt_nvnodelistsize * 7 / 4) {
                speedup_syncer();
        }
#endif
}
4056
4057 void
4058 hammer2_chain_memory_wakeup(hammer2_pfsmount_t *pmp)
4059 {
4060         if (pmp->inmem_waiting &&
4061             (pmp->inmem_chains <= desiredvnodes / 10 ||
4062              pmp->inmem_chains <= pmp->mp->mnt_nvnodelistsize * 2)) {
4063                 kprintf("s");
4064                 pmp->inmem_waiting = 0;
4065                 wakeup(&pmp->inmem_waiting);
4066         }
4067 }
4068
4069 static
4070 void
4071 adjreadcounter(hammer2_blockref_t *bref, size_t bytes)
4072 {
4073         long *counterp;
4074
4075         switch(bref->type) {
4076         case HAMMER2_BREF_TYPE_DATA:
4077                 counterp = &hammer2_iod_file_read;
4078                 break;
4079         case HAMMER2_BREF_TYPE_INODE:
4080                 counterp = &hammer2_iod_meta_read;
4081                 break;
4082         case HAMMER2_BREF_TYPE_INDIRECT:
4083                 counterp = &hammer2_iod_indr_read;
4084                 break;
4085         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
4086         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
4087                 counterp = &hammer2_iod_fmap_read;
4088                 break;
4089         default:
4090                 counterp = &hammer2_iod_volu_read;
4091                 break;
4092         }
4093         *counterp += bytes;
4094 }