hammer2 - Get data-usage aggregation working, add INODE_GET
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem handles direct and indirect block searches, recursions,
37  * creation, and deletion.  Chains of blockrefs are tracked and modifications
38  * are flag for propagation... eventually all the way back to the volume
39  * header.
40  */
41
42 #include <sys/cdefs.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/types.h>
46 #include <sys/lock.h>
47 #include <sys/uuid.h>
48
49 #include "hammer2.h"
50
51 static int hammer2_indirect_optimize;   /* XXX SYSCTL */
52
53 static hammer2_chain_t *hammer2_chain_create_indirect(
54                         hammer2_mount_t *hmp, hammer2_chain_t *parent,
55                         hammer2_key_t key, int keybits);
56
57 /*
58  * Splay tree
59  */
60 SPLAY_GENERATE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
61
62 int
63 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
64 {
65         return(chain2->index - chain1->index);
66 }
67
68 /*
69  * Recursively mark the parent chain elements so flushes can find
70  * modified elements.  Stop when we hit a chain already flagged
71  * SUBMODIFIED, but ignore the SUBMODIFIED bit that might be set
72  * in chain itself.
73  *
74  * SUBMODIFIED is not set on the chain passed in.
75  *
76  * XXX rename of parent can create a SMP race
77  */
78 static void
79 hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
80 {
81         hammer2_chain_t *parent;
82
83         parent = chain->parent;
84         while (parent && (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
85                 atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);
86                 parent = parent->parent;
87         }
88 }
89
90 /*
91  * Allocate a new disconnected chain element representing the specified
92  * bref.  The chain element is locked exclusively and refs is set to 1.
93  *
94  * This essentially allocates a system memory structure representing one
95  * of the media structure types, including inodes.
96  */
97 hammer2_chain_t *
98 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
99 {
100         hammer2_chain_t *chain;
101         hammer2_inode_t *ip;
102         hammer2_indblock_t *np;
103         hammer2_data_t *dp;
104         u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
105
106         /*
107          * Construct the appropriate system structure.
108          */
109         switch(bref->type) {
110         case HAMMER2_BREF_TYPE_INODE:
111                 ip = kmalloc(sizeof(*ip), hmp->minode, M_WAITOK | M_ZERO);
112                 chain = &ip->chain;
113                 chain->u.ip = ip;
114                 lockinit(&chain->lk, "inode", 0, LK_CANRECURSE);
115                 ip->hmp = hmp;
116                 break;
117         case HAMMER2_BREF_TYPE_INDIRECT:
118                 np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
119                 chain = &np->chain;
120                 chain->u.np = np;
121                 lockinit(&chain->lk, "iblk", 0, LK_CANRECURSE);
122                 break;
123         case HAMMER2_BREF_TYPE_DATA:
124                 dp = kmalloc(sizeof(*dp), hmp->mchain, M_WAITOK | M_ZERO);
125                 chain = &dp->chain;
126                 chain->u.dp = dp;
127                 lockinit(&chain->lk, "dblk", 0, LK_CANRECURSE);
128                 break;
129         case HAMMER2_BREF_TYPE_VOLUME:
130                 chain = NULL;
131                 panic("hammer2_chain_alloc volume type illegal for op");
132         default:
133                 chain = NULL;
134                 panic("hammer2_chain_alloc: unrecognized blockref type: %d",
135                       bref->type);
136         }
137
138         /*
139          * Only set bref_flush if the bref has a real media offset, otherwise
140          * the caller has to wait for the chain to be modified/block-allocated
141          * before a blockref can be synchronized with its (future) parent.
142          */
143         chain->bref = *bref;
144         if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
145                 chain->bref_flush = *bref;
146         chain->index = -1;              /* not yet assigned */
147         chain->refs = 1;
148         chain->bytes = bytes;
149         lockmgr(&chain->lk, LK_EXCLUSIVE);
150
151         return (chain);
152 }
153
154 /*
155  * Free a disconnected chain element
156  */
157 void
158 hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
159 {
160         void *mem;
161
162         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
163             chain->bref.type == HAMMER2_BREF_TYPE_VOLUME) {
164                 chain->data = NULL;
165         }
166
167         KKASSERT(chain->bp == NULL);
168         KKASSERT(chain->data == NULL);
169         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
170                  chain->u.ip->vp == NULL);
171
172         if ((mem = chain->u.mem) != NULL) {
173                 chain->u.mem = NULL;
174                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
175                         kfree(mem, hmp->minode);
176                 else
177                         kfree(mem, hmp->mchain);
178         }
179 }
180
181 /*
182  * Add a reference to a chain element (for shared access).  The chain
183  * element must already have at least 1 ref controlled by the caller.
184  */
185 void
186 hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
187 {
188         KKASSERT(chain->refs > 0);
189         atomic_add_int(&chain->refs, 1);
190 }
191
192 /*
193  * Drop the callers reference to the chain element.  If the ref count
194  * reaches zero the chain element and its related structure (typically an
195  * inode or indirect block) will be freed and the parent will be
196  * recursively dropped.
197  *
198  * MOVED and MODIFIED elements hold additional references so it should not
199  * be possible for the count on a modified element to drop to 0.
200  *
201  * The chain element must NOT be locked by the caller.
202  *
203  * The parent might or might not be locked by the caller but if so it
204  * will also be referenced so we shouldn't recurse upward.
205  */
206 void
207 hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
208 {
209         hammer2_chain_t *parent;
210         u_int refs;
211
212         while (chain) {
213                 refs = chain->refs;
214                 cpu_ccfence();
215                 KKASSERT(refs > 0);
216                 if (refs == 1) {
217                         KKASSERT(chain != &hmp->vchain);
218                         parent = chain->parent;
219                         if (parent)
220                                 lockmgr(&parent->lk, LK_EXCLUSIVE);
221                         if (atomic_cmpset_int(&chain->refs, 1, 0)) {
222                                 /*
223                                  * Succeeded, recurse and drop parent.
224                                  * These chain elements should be synchronized
225                                  * so no delta data or inode count updates
226                                  * should be needed.
227                                  */
228                                 KKASSERT((chain->flags &
229                                           (HAMMER2_CHAIN_MOVED |
230                                            HAMMER2_CHAIN_MODIFIED)) == 0);
231                                 if (!(chain->flags & HAMMER2_CHAIN_DELETED)) {
232                                         SPLAY_REMOVE(hammer2_chain_splay,
233                                                      &parent->shead, chain);
234                                         atomic_set_int(&chain->flags,
235                                                        HAMMER2_CHAIN_DELETED);
236                                         /* parent refs dropped via recursion */
237                                 }
238                                 chain->parent = NULL;
239                                 if (parent)
240                                         lockmgr(&parent->lk, LK_RELEASE);
241                                 hammer2_chain_free(hmp, chain);
242                                 chain = parent;
243                                 /* recurse on parent */
244                         } else {
245                                 if (parent)
246                                         lockmgr(&parent->lk, LK_RELEASE);
247                                 /* retry the same chain */
248                         }
249                 } else {
250                         if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
251                                 /*
252                                  * Succeeded, count did not reach zero so
253                                  * cut out of the loop.
254                                  */
255                                 break;
256                         }
257                         /* retry the same chain */
258                 }
259         }
260 }
261
262 /*
263  * Ref and lock a chain element, acquiring its data with I/O if necessary,
264  * and specify how you would like the data to be resolved.
265  *
266  * Returns 0 on success or an error code if the data could not be acquired.
267  * The chain element is locked either way.
268  *
269  * The lock is allowed to recurse, multiple locking ops will aggregate
270  * the requested resolve types.  Once data is assigned it will not be
271  * removed until the last unlock.
272  *
273  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
274  *                         (typically used to avoid device/logical buffer
275  *                          aliasing for data)
276  *
277  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
278  *                         the INITIAL-create state (indirect blocks only).
279  *
280  *                         Do not resolve data elements for DATA chains.
281  *                         (typically used to avoid device/logical buffer
282  *                          aliasing for data)
283  *
284  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
285  *
286  *
287  * NOTE: Embedded elements (volume header, inodes) are always resolved
288  *       regardless.
289  *
290  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
291  *       element will instantiate and zero its buffer, and flush it on
292  *       release.
293  *
294  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
295  *       so as not to instantiate a device buffer, which could alias against
296  *       a logical file buffer.  However, if ALWAYS is specified the
297  *       device buffer will be instantiated anyway.
298  */
299 int
300 hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
301 {
302         hammer2_blockref_t *bref;
303         hammer2_off_t pbase;
304         hammer2_off_t peof;
305         size_t boff;
306         size_t bbytes;
307         int error;
308         char *bdata;
309
310         /*
311          * Lock the element.  Under certain conditions this might end up
312          * being a recursive lock.
313          */
314         KKASSERT(chain->refs > 0);
315         atomic_add_int(&chain->refs, 1);
316         lockmgr(&chain->lk, LK_EXCLUSIVE);
317
318         /*
319          * If we already have a valid data pointer no further action is
320          * necessary.
321          */
322         if (chain->data)
323                 return (0);
324
325         /*
326          * Do we have to resolve the data?
327          */
328         switch(how) {
329         case HAMMER2_RESOLVE_NEVER:
330                 return(0);
331         case HAMMER2_RESOLVE_MAYBE:
332                 if (chain->flags & HAMMER2_CHAIN_INITIAL)
333                         return(0);
334                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
335                         return(0);
336                 /* fall through */
337         case HAMMER2_RESOLVE_ALWAYS:
338                 break;
339         }
340
341         /*
342          * We must resolve to a device buffer, either by issuing I/O or
343          * by creating a zero-fill element.  We do not mark the buffer
344          * dirty when creating a zero-fill element (the hammer2_chain_modify()
345          * API must still be used to do that).
346          *
347          * The device buffer is variable-sized in powers of 2 down
348          * to HAMMER2_MINALLOCSIZE (typically 1K).  A 64K physical storage
349          * chunk always contains buffers of the same size. (XXX)
350          *
351          * The minimum physical IO size may be larger than the variable
352          * block size.
353          */
354         bref = &chain->bref;
355
356         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
357                 bbytes = HAMMER2_MINIOSIZE;
358         pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
359         peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
360         boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
361         KKASSERT(pbase != 0);
362
363         /*
364          * The getblk() optimization can only be used on newly created
365          * elements if the physical block size matches the request.
366          */
367         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
368             chain->bytes == bbytes) {
369                 chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
370                 error = 0;
371         } else if (hammer2_cluster_enable) {
372                 error = cluster_read(hmp->devvp, peof, pbase, bbytes,
373                                      HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
374                                      &chain->bp);
375         } else {
376                 error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
377         }
378
379         if (error) {
380                 kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
381                         (intmax_t)pbase, error);
382                 bqrelse(chain->bp);
383                 chain->bp = NULL;
384                 return (error);
385         }
386
387         /*
388          * Zero the data area if the chain is in the INITIAL-create state.
389          */
390         bdata = (char *)chain->bp->b_data + boff;
391         if (chain->flags & HAMMER2_CHAIN_INITIAL) {
392                 bzero(bdata, chain->bytes);
393                 chain->bp->b_flags |= B_CACHE;
394                 bdirty(chain->bp);
395         }
396
397         /*
398          * Setup the data pointer, either pointing it to an embedded data
399          * structure and copying the data from the buffer, or pointing it
400          * into the buffer.
401          *
402          * The buffer is not retained when copying to an embedded data
403          * structure in order to avoid potential deadlocks or recursions
404          * on the same physical buffer.
405          */
406         switch (bref->type) {
407         case HAMMER2_BREF_TYPE_VOLUME:
408                 /*
409                  * Copy data from bp to embedded buffer
410                  */
411                 panic("hammer2_chain_lock: called on unresolved volume header");
412 #if 0
413                 /* NOT YET */
414                 KKASSERT(pbase == 0);
415                 KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
416                 bcopy(bdata, &hmp->voldata, chain->bytes);
417                 chain->data = (void *)&hmp->voldata;
418                 bqrelse(chain->bp);
419                 chain->bp = NULL;
420 #endif
421                 break;
422         case HAMMER2_BREF_TYPE_INODE:
423                 /*
424                  * Copy data from bp to embedded buffer, do not retain the
425                  * device buffer.
426                  */
427                 bcopy(bdata, &chain->u.ip->ip_data, chain->bytes);
428                 chain->data = (void *)&chain->u.ip->ip_data;
429                 bqrelse(chain->bp);
430                 chain->bp = NULL;
431                 break;
432         case HAMMER2_BREF_TYPE_INDIRECT:
433         case HAMMER2_BREF_TYPE_DATA:
434         default:
435                 /*
436                  * Point data at the device buffer and leave bp intact.
437                  */
438                 chain->data = (void *)bdata;
439                 break;
440         }
441         return (0);
442 }
443
444 /*
445  * Unlock and deref a chain element.
446  *
447  * On the last lock release any non-embedded data (chain->bp) will be
448  * retired.
449  */
450 void
451 hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
452 {
453         long *counterp;
454
455         /*
456          * Undo a recursive lock
457          */
458         if (lockcountnb(&chain->lk) > 1) {
459                 KKASSERT(chain->refs > 1);
460                 atomic_add_int(&chain->refs, -1);
461                 lockmgr(&chain->lk, LK_RELEASE);
462                 return;
463         }
464
465         /*
466          * Shortcut the case if the data is embedded or not resolved.
467          * Do NOT null-out pointers to embedded data (e.g. inode).
468          */
469         if (chain->bp == NULL) {
470                 lockmgr(&chain->lk, LK_RELEASE);
471                 hammer2_chain_drop(hmp, chain);
472                 return;
473         }
474
475         /*
476          * Statistics
477          */
478         if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
479                 ;
480         } else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
481                 switch(chain->bref.type) {
482                 case HAMMER2_BREF_TYPE_DATA:
483                         counterp = &hammer2_ioa_file_write;
484                         break;
485                 case HAMMER2_BREF_TYPE_INODE:
486                         counterp = &hammer2_ioa_meta_write;
487                         break;
488                 case HAMMER2_BREF_TYPE_INDIRECT:
489                         counterp = &hammer2_ioa_indr_write;
490                         break;
491                 default:
492                         counterp = &hammer2_ioa_volu_write;
493                         break;
494                 }
495                 ++*counterp;
496         } else {
497                 switch(chain->bref.type) {
498                 case HAMMER2_BREF_TYPE_DATA:
499                         counterp = &hammer2_iod_file_write;
500                         break;
501                 case HAMMER2_BREF_TYPE_INODE:
502                         counterp = &hammer2_iod_meta_write;
503                         break;
504                 case HAMMER2_BREF_TYPE_INDIRECT:
505                         counterp = &hammer2_iod_indr_write;
506                         break;
507                 default:
508                         counterp = &hammer2_iod_volu_write;
509                         break;
510                 }
511                 ++*counterp;
512         }
513
514         /*
515          * Clean out the bp.
516          *
517          * If a device buffer was used for data be sure to destroy the
518          * buffer when we are done to avoid aliases (XXX what about the
519          * underlying VM pages?).
520          */
521         if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
522                 chain->bp->b_flags |= B_RELBUF;
523
524         chain->data = NULL;
525         if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
526                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
527                 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
528                         atomic_clear_int(&chain->flags,
529                                          HAMMER2_CHAIN_IOFLUSH);
530                         chain->bp->b_flags |= B_RELBUF;
531                         cluster_awrite(chain->bp);
532                 } else {
533                         chain->bp->b_flags |= B_CLUSTEROK;
534                         bdwrite(chain->bp);
535                 }
536         } else {
537                 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
538                         atomic_clear_int(&chain->flags,
539                                          HAMMER2_CHAIN_IOFLUSH);
540                         chain->bp->b_flags |= B_RELBUF;
541                         brelse(chain->bp);
542                 } else {
543                         /* bp might still be dirty */
544                         bqrelse(chain->bp);
545                 }
546         }
547         chain->bp = NULL;
548         lockmgr(&chain->lk, LK_RELEASE);
549         hammer2_chain_drop(hmp, chain);
550 }
551
552 /*
553  * Resize the chain's physical storage allocation.  Chains can be resized
554  * smaller without reallocating the storage.  Resizing larger will reallocate
555  * the storage.
556  *
557  * Must be passed a locked chain.
558  *
559  * If you want the resize code to copy the data to the new block then the
560  * caller should lock the chain RESOLVE_MAYBE or RESOLVE_ALWAYS.
561  *
562  * If the caller already holds a logical buffer containing the data and
563  * intends to bdwrite() that buffer resolve with RESOLVE_NEVER.  The resize
564  * operation will then not copy the data.
565  *
566  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
567  * to avoid instantiating a device buffer that conflicts with the vnode
568  * data buffer.
569  *
570  * XXX flags currently ignored, uses chain->bp to detect data/no-data.
571  */
572 void
573 hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
574                      int nradix, int flags)
575 {
576         hammer2_mount_t *hmp = ip->hmp;
577         struct buf *nbp;
578         hammer2_off_t pbase;
579         size_t obytes;
580         size_t nbytes;
581         size_t bbytes;
582         int boff;
583         char *bdata;
584         int error;
585
586         /*
587          * Only data and indirect blocks can be resized for now
588          */
589         KKASSERT(chain != &hmp->vchain);
590         KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
591                  chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
592
593         /*
594          * Nothing to do if the element is already the proper size
595          */
596         obytes = chain->bytes;
597         nbytes = 1U << nradix;
598         if (obytes == nbytes)
599                 return;
600
601         /*
602          * Set MODIFIED and add a chain ref to prevent destruction.  Both
603          * modified flags share the same ref.
604          *
605          * If the chain is already marked MODIFIED then we can safely
606          * return the previous allocation to the pool without having to
607          * worry about snapshots.
608          */
609         if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
610                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
611                                               HAMMER2_CHAIN_MODIFY_TID);
612                 hammer2_chain_ref(hmp, chain);
613         } else {
614                 hammer2_freemap_free(hmp, chain->bref.data_off,
615                                      chain->bref.type);
616         }
617
618         /*
619          * Relocate the block, even if making it smaller (because different
620          * block sizes may be in different regions).
621          */
622         chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bref.type,
623                                                      nbytes);
624         chain->bytes = nbytes;
625         ip->delta_dcount += (ssize_t)(nbytes - obytes); /* XXX atomic */
626
627         /*
628          * The device buffer may be larger than the allocation size.
629          */
630         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
631                 bbytes = HAMMER2_MINIOSIZE;
632         pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
633         boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
634
635         /*
636          * Only copy the data if resolved, otherwise the caller is
637          * responsible.
638          */
639         if (chain->bp) {
640                 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
641                          chain->bref.type == HAMMER2_BREF_TYPE_DATA);
642                 KKASSERT(chain != &hmp->vchain);        /* safety */
643
644                 /*
645                  * The getblk() optimization can only be used if the
646                  * physical block size matches the request.
647                  */
648                 if (nbytes == bbytes) {
649                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
650                         error = 0;
651                 } else {
652                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
653                         KKASSERT(error == 0);
654                 }
655                 bdata = (char *)nbp->b_data + boff;
656
657                 if (nbytes < obytes) {
658                         bcopy(chain->data, bdata, nbytes);
659                 } else {
660                         bcopy(chain->data, bdata, obytes);
661                         bzero(bdata + obytes, nbytes - obytes);
662                 }
663
664                 /*
665                  * NOTE: The INITIAL state of the chain is left intact.
666                  *
667                  * NOTE: Because of the reallocation we have to set DIRTYBP
668                  *       if INITIAL is not set.
669                  *
670                  * NOTE: We set B_NOCACHE to throw away the previous bp and
671                  *       any VM backing store, even if it was dirty.
672                  *       Otherwise we run the risk of a logical/device
673                  *       conflict on reallocation.
674                  */
675                 chain->bp->b_flags |= B_RELBUF | B_NOCACHE;
676                 brelse(chain->bp);
677                 chain->bp = nbp;
678                 chain->data = (void *)bdata;
679                 if ((chain->flags & HAMMER2_CHAIN_INITIAL) == 0)
680                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
681         }
682         hammer2_chain_parent_setsubmod(hmp, chain);
683 }
684
685 /*
686  * Convert a locked chain that was retrieved read-only to read-write.
687  *
688  * If not already marked modified a new physical block will be allocated
689  * and assigned to the bref.
690  *
691  * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
692  *                   level or the COW operation will not work.
693  *
694  * Data blocks     - The chain is usually locked RESOLVE_NEVER so as not to
695  *                   run the data through the device buffers.
696  */
697 void
698 hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
699 {
700         struct buf *nbp;
701         int error;
702         hammer2_off_t pbase;
703         size_t bbytes;
704         size_t boff;
705         void *bdata;
706
707         /*
708          * Tells flush that modify_tid must be updated, otherwise only
709          * mirror_tid is updated.  This is the default.
710          */
711         if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
712                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFY_TID);
713
714         /*
715          * If the chain is already marked MODIFIED we can just return.
716          */
717         if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
718                 if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
719                     chain->bp == NULL) {
720                         goto skip1;
721                 }
722                 return;
723         }
724
725         /*
726          * Set MODIFIED and add a chain ref to prevent destruction.  Both
727          * modified flags share the same ref.
728          */
729         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
730         hammer2_chain_ref(hmp, chain);
731
732         /*
733          * We must allocate the copy-on-write block.
734          *
735          * If the data is embedded no other action is required.
736          *
737          * If the data is not embedded we acquire and clear the
738          * new block.  If chain->data is not NULL we then do the
739          * copy-on-write.  chain->data will then be repointed to the new
740          * buffer and the old buffer will be released.
741          *
742          * For newly created elements with no prior allocation we go
743          * through the copy-on-write steps except without the copying part.
744          */
745         if (chain != &hmp->vchain) {
746                 if ((hammer2_debug & 0x0001) &&
747                     (chain->bref.data_off & HAMMER2_OFF_MASK)) {
748                         kprintf("Replace %d\n", chain->bytes);
749                 }
750                 chain->bref.data_off =
751                         hammer2_freemap_alloc(hmp, chain->bref.type,
752                                               chain->bytes);
753                 /* XXX failed allocation */
754         }
755
756         /*
757          * If data instantiation is optional and the chain has no current
758          * data association (typical for DATA and newly-created INDIRECT
759          * elements), don't instantiate the buffer now.
760          */
761         if ((flags & HAMMER2_MODIFY_OPTDATA) && chain->bp == NULL)
762                 goto skip2;
763
764 skip1:
765         /*
766          * Setting the DIRTYBP flag will cause the buffer to be dirtied or
767          * written-out on unlock.  This bit is independent of the MODIFIED
768          * bit because the chain may still need meta-data adjustments done
769          * by virtue of MODIFIED for its parent, and the buffer can be
770          * flushed out (possibly multiple times) by the OS before that.
771          *
772          * Clearing the INITIAL flag (for indirect blocks) indicates that
773          * a zero-fill buffer has been instantiated.
774          */
775         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
776         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
777
778         /*
779          * We currently should never instantiate a device buffer for a
780          * data chain.
781          */
782         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
783
784         /*
785          * Execute COW operation
786          */
787         switch(chain->bref.type) {
788         case HAMMER2_BREF_TYPE_VOLUME:
789         case HAMMER2_BREF_TYPE_INODE:
790                 /*
791                  * The data is embedded, no copy-on-write operation is
792                  * needed.
793                  */
794                 KKASSERT(chain->bp == NULL);
795                 break;
796         case HAMMER2_BREF_TYPE_DATA:
797         case HAMMER2_BREF_TYPE_INDIRECT:
798                 /*
799                  * Perform the copy-on-write operation
800                  */
801                 KKASSERT(chain != &hmp->vchain);        /* safety */
802                 /*
803                  * The device buffer may be larger than the allocation size.
804                  */
805                 if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
806                         bbytes = HAMMER2_MINIOSIZE;
807                 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
808                 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
809
810                 /*
811                  * The getblk() optimization can only be used if the
812                  * physical block size matches the request.
813                  */
814                 if (chain->bytes == bbytes) {
815                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
816                         error = 0;
817                 } else {
818                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
819                         KKASSERT(error == 0);
820                 }
821                 bdata = (char *)nbp->b_data + boff;
822
823                 /*
824                  * Copy or zero-fill on write depending on whether
825                  * chain->data exists or not.
826                  */
827                 if (chain->data) {
828                         bcopy(chain->data, bdata, chain->bytes);
829                         KKASSERT(chain->bp != NULL);
830                 } else {
831                         bzero(bdata, chain->bytes);
832                 }
833                 if (chain->bp) {
834                         chain->bp->b_flags |= B_RELBUF;
835                         brelse(chain->bp);
836                 }
837                 chain->bp = nbp;
838                 chain->data = bdata;
839                 break;
840         default:
841                 panic("hammer2_chain_modify: illegal non-embedded type %d",
842                       chain->bref.type);
843                 break;
844
845         }
846 skip2:
847         if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
848                 hammer2_chain_parent_setsubmod(hmp, chain);
849 }
850
851 /*
852  * Mark the volume as having been modified.  This short-cut version
853  * does not have to lock the volume's chain, which allows the ioctl
854  * code to make adjustments to connections without deadlocking.
855  */
856 void
857 hammer2_modify_volume(hammer2_mount_t *hmp)
858 {
859         hammer2_voldata_lock(hmp);
860         atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
861         hammer2_voldata_unlock(hmp);
862 }
863
864 /*
865  * Locate an in-memory chain.  The parent must be locked.  The in-memory
866  * chain is returned or NULL if no in-memory chain is present.
867  *
868  * NOTE: A chain on-media might exist for this index when NULL is returned.
869  */
870 hammer2_chain_t *
871 hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
872 {
873         hammer2_chain_t dummy;
874         hammer2_chain_t *chain;
875
876         dummy.index = index;
877         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
878         return (chain);
879 }
880
881 /*
882  * Return a locked chain structure with all associated data acquired.
883  *
884  * Caller must lock the parent on call, the returned child will be locked.
885  */
886 hammer2_chain_t *
887 hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
888                   int index, int flags)
889 {
890         hammer2_blockref_t *bref;
891         hammer2_chain_t *chain;
892         hammer2_chain_t dummy;
893         int how;
894
895         /*
896          * Figure out how to lock.  MAYBE can be used to optimized
897          * the initial-create state for indirect blocks.
898          */
899         if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
900                 how = HAMMER2_RESOLVE_NEVER;
901         else
902                 how = HAMMER2_RESOLVE_MAYBE;
903
904         /*
905          * First see if we have a (possibly modified) chain element cached
906          * for this (parent, index).  Acquire the data if necessary.
907          *
908          * If chain->data is non-NULL the chain should already be marked
909          * modified.
910          */
911         dummy.index = index;
912         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
913         if (chain) {
914                 if (flags & HAMMER2_LOOKUP_NOLOCK)
915                         hammer2_chain_ref(hmp, chain);
916                 else
917                         hammer2_chain_lock(hmp, chain, how);
918                 return(chain);
919         }
920
921         /*
922          * the get function must always succeed, panic if there's no
923          * data to index.
924          */
925         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
926                 panic("hammer2_chain_get: Missing bref(1)");
927                 /* NOT REACHED */
928         }
929
930         /*
931          * Otherwise lookup the bref and issue I/O (switch on the parent)
932          */
933         switch(parent->bref.type) {
934         case HAMMER2_BREF_TYPE_INODE:
935                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
936                 bref = &parent->data->ipdata.u.blockset.blockref[index];
937                 break;
938         case HAMMER2_BREF_TYPE_INDIRECT:
939                 KKASSERT(parent->data != NULL);
940                 KKASSERT(index >= 0 &&
941                          index < parent->bytes / sizeof(hammer2_blockref_t));
942                 bref = &parent->data->npdata.blockref[index];
943                 break;
944         case HAMMER2_BREF_TYPE_VOLUME:
945                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
946                 bref = &hmp->voldata.sroot_blockset.blockref[index];
947                 break;
948         default:
949                 bref = NULL;
950                 panic("hammer2_chain_get: unrecognized blockref type: %d",
951                       parent->bref.type);
952         }
953         if (bref->type == 0) {
954                 panic("hammer2_chain_get: Missing bref(2)");
955                 /* NOT REACHED */
956         }
957
958         /*
959          * Allocate a chain structure representing the existing media
960          * entry.
961          *
962          * The locking operation we do later will issue I/O to read it.
963          */
964         chain = hammer2_chain_alloc(hmp, bref);
965
966         /*
967          * Link the chain into its parent.  Caller is expected to hold an
968          * exclusive lock on the parent.
969          */
970         chain->parent = parent;
971         chain->index = index;
972         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
973                 panic("hammer2_chain_link: collision");
974         KKASSERT(parent->refs > 0);
975         atomic_add_int(&parent->refs, 1);       /* for splay entry */
976
977         /*
978          * Additional linkage for inodes.  Reuse the parent pointer to
979          * find the parent directory.
980          */
981         if (bref->type == HAMMER2_BREF_TYPE_INODE) {
982                 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
983                         parent = parent->parent;
984                 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
985                         chain->u.ip->pip = parent->u.ip;
986                         chain->u.ip->pmp = parent->u.ip->pmp;
987                         chain->u.ip->depth = parent->u.ip->depth + 1;
988                 }
989         }
990
991         /*
992          * Our new chain structure has already been referenced and locked
993          * but the lock code handles the I/O so call it to resolve the data.
994          * Then release one of our two exclusive locks.
995          *
996          * If NOLOCK is set the release will release the one-and-only lock.
997          */
998         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
999                 hammer2_chain_lock(hmp, chain, how);    /* recusive lock */
1000                 hammer2_chain_drop(hmp, chain);         /* excess ref */
1001         }
1002         lockmgr(&chain->lk, LK_RELEASE);                /* from alloc */
1003
1004         return (chain);
1005 }
1006
1007 /*
1008  * Locate any key between key_beg and key_end inclusive.  (*parentp)
1009  * typically points to an inode but can also point to a related indirect
1010  * block and this function will recurse upwards and find the inode again.
1011  *
1012  * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
1013  *           WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
1014  *           WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
1015  *
1016  * (*parentp) must be exclusively locked and referenced and can be an inode
1017  * or an existing indirect block within the inode.
1018  *
1019  * On return (*parentp) will be modified to point at the deepest parent chain
1020  * element encountered during the search, as a helper for an insertion or
1021  * deletion.   The new (*parentp) will be locked and referenced and the old
1022  * will be unlocked and dereferenced (no change if they are both the same).
1023  *
1024  * The matching chain will be returned exclusively locked and referenced.
1025  *
1026  * NULL is returned if no match was found, but (*parentp) will still
1027  * potentially be adjusted.
1028  *
1029  * This function will also recurse up the chain if the key is not within the
1030  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
1031  * can simply allow (*parentp) to float inside the loop.
1032  */
1033 hammer2_chain_t *
1034 hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
1035                      hammer2_key_t key_beg, hammer2_key_t key_end,
1036                      int flags)
1037 {
1038         hammer2_chain_t *parent;
1039         hammer2_chain_t *chain;
1040         hammer2_chain_t *tmp;
1041         hammer2_blockref_t *base;
1042         hammer2_blockref_t *bref;
1043         hammer2_key_t scan_beg;
1044         hammer2_key_t scan_end;
1045         int count = 0;
1046         int i;
1047
1048         /*
1049          * Recurse (*parentp) upward if necessary until the parent completely
1050          * encloses the key range or we hit the inode.
1051          */
1052         parent = *parentp;
1053         while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1054                 scan_beg = parent->bref.key;
1055                 scan_end = scan_beg +
1056                            ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1057                 if (key_beg >= scan_beg && key_end <= scan_end)
1058                         break;
1059                 hammer2_chain_ref(hmp, parent);         /* ref old parent */
1060                 hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
1061                 parent = parent->parent;
1062                                                         /* lock new parent */
1063                 hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
1064                 hammer2_chain_drop(hmp, *parentp);      /* drop old parent */
1065                 *parentp = parent;                      /* new parent */
1066         }
1067
1068 again:
1069         /*
1070          * Locate the blockref array.  Currently we do a fully associative
1071          * search through the array.
1072          */
1073         switch(parent->bref.type) {
1074         case HAMMER2_BREF_TYPE_INODE:
1075                 /*
1076                  * Special shortcut for embedded data returns the inode
1077                  * itself.  Callers must detect this condition and access
1078                  * the embedded data (the strategy code does this for us).
1079                  *
1080                  * This is only applicable to regular files and softlinks.
1081                  */
1082                 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1083                         if (flags & HAMMER2_LOOKUP_NOLOCK)
1084                                 hammer2_chain_ref(hmp, parent);
1085                         else
1086                                 hammer2_chain_lock(hmp, parent,
1087                                                    HAMMER2_RESOLVE_ALWAYS);
1088                         return (parent);
1089                 }
1090                 base = &parent->data->ipdata.u.blockset.blockref[0];
1091                 count = HAMMER2_SET_COUNT;
1092                 break;
1093         case HAMMER2_BREF_TYPE_INDIRECT:
1094                 /*
1095                  * Optimize indirect blocks in the INITIAL state to avoid
1096                  * I/O.
1097                  */
1098                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1099                         base = NULL;
1100                 } else {
1101                         if (parent->data == NULL)
1102                                 panic("parent->data is NULL");
1103                         base = &parent->data->npdata.blockref[0];
1104                 }
1105                 count = parent->bytes / sizeof(hammer2_blockref_t);
1106                 break;
1107         case HAMMER2_BREF_TYPE_VOLUME:
1108                 base = &hmp->voldata.sroot_blockset.blockref[0];
1109                 count = HAMMER2_SET_COUNT;
1110                 break;
1111         default:
1112                 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
1113                       parent->bref.type);
1114                 base = NULL;    /* safety */
1115                 count = 0;      /* safety */
1116         }
1117
1118         /*
1119          * If the element and key overlap we use the element.
1120          */
1121         bref = NULL;
1122         for (i = 0; i < count; ++i) {
1123                 tmp = hammer2_chain_find(hmp, parent, i);
1124                 if (tmp) {
1125                         bref = &tmp->bref;
1126                         KKASSERT(bref->type != 0);
1127                 } else if (base == NULL || base[i].type == 0) {
1128                         continue;
1129                 } else {
1130                         bref = &base[i];
1131                 }
1132                 scan_beg = bref->key;
1133                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1134                 if (key_beg <= scan_end && key_end >= scan_beg)
1135                         break;
1136         }
1137         if (i == count) {
1138                 if (key_beg == key_end)
1139                         return (NULL);
1140                 return (hammer2_chain_next(hmp, parentp, NULL,
1141                                            key_beg, key_end, flags));
1142         }
1143
1144         /*
1145          * Acquire the new chain element.  If the chain element is an
1146          * indirect block we must search recursively.
1147          */
1148         chain = hammer2_chain_get(hmp, parent, i, flags);
1149         if (chain == NULL)
1150                 return (NULL);
1151
1152         /*
1153          * If the chain element is an indirect block it becomes the new
1154          * parent and we loop on it.
1155          *
1156          * The parent always has to be locked with at least RESOLVE_MAYBE,
1157          * so it might need a fixup if the caller passed incompatible flags.
1158          */
1159         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1160                 hammer2_chain_unlock(hmp, parent);
1161                 *parentp = parent = chain;
1162                 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1163                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
1164                         hammer2_chain_drop(hmp, chain); /* excess ref */
1165                 } else if (flags & HAMMER2_LOOKUP_NODATA) {
1166                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
1167                         hammer2_chain_unlock(hmp, chain);
1168                 }
1169                 goto again;
1170         }
1171
1172         /*
1173          * All done, return chain
1174          */
1175         return (chain);
1176 }
1177
1178 /*
1179  * After having issued a lookup we can iterate all matching keys.
1180  *
1181  * If chain is non-NULL we continue the iteration from just after it's index.
1182  *
1183  * If chain is NULL we assume the parent was exhausted and continue the
1184  * iteration at the next parent.
1185  *
1186  * parent must be locked on entry and remains locked throughout.  chain's
1187  * lock status must match flags.
1188  */
1189 hammer2_chain_t *
1190 hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
1191                    hammer2_chain_t *chain,
1192                    hammer2_key_t key_beg, hammer2_key_t key_end,
1193                    int flags)
1194 {
1195         hammer2_chain_t *parent;
1196         hammer2_chain_t *tmp;
1197         hammer2_blockref_t *base;
1198         hammer2_blockref_t *bref;
1199         hammer2_key_t scan_beg;
1200         hammer2_key_t scan_end;
1201         int i;
1202         int count;
1203
1204         parent = *parentp;
1205
1206 again:
1207         /*
1208          * Calculate the next index and recalculate the parent if necessary.
1209          */
1210         if (chain) {
1211                 /*
1212                  * Continue iteration within current parent.  If not NULL
1213                  * the passed-in chain may or may not be locked, based on
1214                  * the LOOKUP_NOLOCK flag (passed in as returned from lookup
1215                  * or a prior next).
1216                  */
1217                 i = chain->index + 1;
1218                 if (flags & HAMMER2_LOOKUP_NOLOCK)
1219                         hammer2_chain_drop(hmp, chain);
1220                 else
1221                         hammer2_chain_unlock(hmp, chain);
1222
1223                 /*
1224                  * Any scan where the lookup returned degenerate data embedded
1225                  * in the inode has an invalid index and must terminate.
1226                  */
1227                 if (chain == parent)
1228                         return(NULL);
1229                 chain = NULL;
1230         } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT) {
1231                 /*
1232                  * We reached the end of the iteration.
1233                  */
1234                 return (NULL);
1235         } else {
1236                 /*
1237                  * Continue iteration with next parent unless the current
1238                  * parent covers the range.
1239                  */
1240                 hammer2_chain_t *nparent;
1241
1242                 scan_beg = parent->bref.key;
1243                 scan_end = scan_beg +
1244                             ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1245                 if (key_beg >= scan_beg && key_end <= scan_end)
1246                         return (NULL);
1247
1248                 i = parent->index + 1;
1249                 nparent = parent->parent;
1250                 hammer2_chain_ref(hmp, nparent);        /* ref new parent */
1251                 hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
1252                                                         /* lock new parent */
1253                 hammer2_chain_lock(hmp, nparent, HAMMER2_RESOLVE_MAYBE);
1254                 hammer2_chain_drop(hmp, nparent);       /* drop excess ref */
1255                 *parentp = parent = nparent;
1256         }
1257
1258 again2:
1259         /*
1260          * Locate the blockref array.  Currently we do a fully associative
1261          * search through the array.
1262          */
1263         switch(parent->bref.type) {
1264         case HAMMER2_BREF_TYPE_INODE:
1265                 base = &parent->data->ipdata.u.blockset.blockref[0];
1266                 count = HAMMER2_SET_COUNT;
1267                 break;
1268         case HAMMER2_BREF_TYPE_INDIRECT:
1269                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1270                         base = NULL;
1271                 } else {
1272                         KKASSERT(parent->data != NULL);
1273                         base = &parent->data->npdata.blockref[0];
1274                 }
1275                 count = parent->bytes / sizeof(hammer2_blockref_t);
1276                 break;
1277         case HAMMER2_BREF_TYPE_VOLUME:
1278                 base = &hmp->voldata.sroot_blockset.blockref[0];
1279                 count = HAMMER2_SET_COUNT;
1280                 break;
1281         default:
1282                 panic("hammer2_chain_next: unrecognized blockref type: %d",
1283                       parent->bref.type);
1284                 base = NULL;    /* safety */
1285                 count = 0;      /* safety */
1286                 break;
1287         }
1288         KKASSERT(i <= count);
1289
1290         /*
1291          * Look for the key.  If we are unable to find a match and an exact
1292          * match was requested we return NULL.  If a range was requested we
1293          * run hammer2_chain_next() to iterate.
1294          */
1295         bref = NULL;
1296         while (i < count) {
1297                 tmp = hammer2_chain_find(hmp, parent, i);
1298                 if (tmp) {
1299                         bref = &tmp->bref;
1300                 } else if (base == NULL || base[i].type == 0) {
1301                         ++i;
1302                         continue;
1303                 } else {
1304                         bref = &base[i];
1305                 }
1306                 scan_beg = bref->key;
1307                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1308                 if (key_beg <= scan_end && key_end >= scan_beg)
1309                         break;
1310                 ++i;
1311         }
1312
1313         /*
1314          * If we couldn't find a match recurse up a parent to continue the
1315          * search.
1316          */
1317         if (i == count)
1318                 goto again;
1319
1320         /*
1321          * Acquire the new chain element.  If the chain element is an
1322          * indirect block we must search recursively.
1323          */
1324         chain = hammer2_chain_get(hmp, parent, i, flags);
1325         if (chain == NULL)
1326                 return (NULL);
1327
1328         /*
1329          * If the chain element is an indirect block it becomes the new
1330          * parent and we loop on it.
1331          *
1332          * The parent always has to be locked with at least RESOLVE_MAYBE,
1333          * so it might need a fixup if the caller passed incompatible flags.
1334          */
1335         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1336                 hammer2_chain_unlock(hmp, parent);
1337                 *parentp = parent = chain;
1338                 chain = NULL;
1339                 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1340                         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
1341                         hammer2_chain_drop(hmp, parent);        /* excess ref */
1342                 } else if (flags & HAMMER2_LOOKUP_NODATA) {
1343                         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
1344                         hammer2_chain_unlock(hmp, parent);
1345                 }
1346                 i = 0;
1347                 goto again2;
1348         }
1349
1350         /*
1351          * All done, return chain
1352          */
1353         return (chain);
1354 }
1355
1356 /*
1357  * Create and return a new hammer2 system memory structure of the specified
1358  * key, type and size and insert it RELATIVE TO (PARENT).
1359  *
1360  * (parent) is typically either an inode or an indirect block, acquired
1361  * acquired as a side effect of issuing a prior failed lookup.  parent
1362  * must be locked and held.  Do not pass the inode chain to this function
1363  * unless that is the chain returned by the failed lookup.
1364  *
1365  * Non-indirect types will automatically allocate indirect blocks as required
1366  * if the new item does not fit in the current (parent).
1367  *
1368  * Indirect types will move a portion of the existing blockref array in
1369  * (parent) into the new indirect type and then use one of the free slots
1370  * to emplace the new indirect type.
1371  *
1372  * A new locked, referenced chain element is returned of the specified type.
1373  * The element may or may not have a data area associated with it:
1374  *
1375  *      VOLUME          not allowed here
1376  *      INODE           embedded data are will be set-up
1377  *      INDIRECT        not allowed here
1378  *      DATA            no data area will be set-up (caller is expected
1379  *                      to have logical buffers, we don't want to alias
1380  *                      the data onto device buffers!).
1381  */
1382 hammer2_chain_t *
1383 hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1384                      hammer2_chain_t *chain,
1385                      hammer2_key_t key, int keybits, int type, size_t bytes)
1386 {
1387         hammer2_blockref_t dummy;
1388         hammer2_blockref_t *base;
1389         hammer2_chain_t dummy_chain;
1390         int unlock_parent = 0;
1391         int allocated = 0;
1392         int count;
1393         int i;
1394
1395         if (chain == NULL) {
1396                 /*
1397                  * First allocate media space and construct the dummy bref,
1398                  * then allocate the in-memory chain structure.
1399                  */
1400                 bzero(&dummy, sizeof(dummy));
1401                 dummy.type = type;
1402                 dummy.key = key;
1403                 dummy.keybits = keybits;
1404                 dummy.data_off = hammer2_bytes_to_radix(bytes);
1405                 chain = hammer2_chain_alloc(hmp, &dummy);
1406                 allocated = 1;
1407
1408                 /*
1409                  * We do NOT set INITIAL here (yet).  INITIAL is only
1410                  * used for indirect blocks.
1411                  *
1412                  * Recalculate bytes to reflect the actual media block
1413                  * allocation.
1414                  */
1415                 bytes = (hammer2_off_t)1 <<
1416                         (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
1417                 chain->bytes = bytes;
1418
1419                 switch(type) {
1420                 case HAMMER2_BREF_TYPE_VOLUME:
1421                         panic("hammer2_chain_create: called with volume type");
1422                         break;
1423                 case HAMMER2_BREF_TYPE_INODE:
1424                         KKASSERT(bytes == HAMMER2_INODE_BYTES);
1425                         chain->data = (void *)&chain->u.ip->ip_data;
1426                         break;
1427                 case HAMMER2_BREF_TYPE_INDIRECT:
1428                         panic("hammer2_chain_create: cannot be used to"
1429                               "create indirect block");
1430                         break;
1431                 case HAMMER2_BREF_TYPE_DATA:
1432                 default:
1433                         /* leave chain->data NULL */
1434                         KKASSERT(chain->data == NULL);
1435                         break;
1436                 }
1437         } else {
1438                 /*
1439                  * Potentially update the chain's key/keybits.
1440                  */
1441                 chain->bref.key = key;
1442                 chain->bref.keybits = keybits;
1443         }
1444
1445 again:
1446         /*
1447          * Locate a free blockref in the parent's array
1448          */
1449         switch(parent->bref.type) {
1450         case HAMMER2_BREF_TYPE_INODE:
1451                 KKASSERT((parent->u.ip->ip_data.op_flags &
1452                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
1453                 KKASSERT(parent->data != NULL);
1454                 base = &parent->data->ipdata.u.blockset.blockref[0];
1455                 count = HAMMER2_SET_COUNT;
1456                 break;
1457         case HAMMER2_BREF_TYPE_INDIRECT:
1458                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1459                         base = NULL;
1460                 } else {
1461                         KKASSERT(parent->data != NULL);
1462                         base = &parent->data->npdata.blockref[0];
1463                 }
1464                 count = parent->bytes / sizeof(hammer2_blockref_t);
1465                 break;
1466         case HAMMER2_BREF_TYPE_VOLUME:
1467                 KKASSERT(parent->data != NULL);
1468                 base = &hmp->voldata.sroot_blockset.blockref[0];
1469                 count = HAMMER2_SET_COUNT;
1470                 break;
1471         default:
1472                 panic("hammer2_chain_create: unrecognized blockref type: %d",
1473                       parent->bref.type);
1474                 count = 0;
1475                 break;
1476         }
1477
1478         /*
1479          * Scan for an unallocated bref, also skipping any slots occupied
1480          * by in-memory chain elements that may not yet have been updated
1481          * in the parent's bref array.
1482          */
1483         bzero(&dummy_chain, sizeof(dummy_chain));
1484         for (i = 0; i < count; ++i) {
1485                 if (base == NULL) {
1486                         dummy_chain.index = i;
1487                         if (SPLAY_FIND(hammer2_chain_splay,
1488                                        &parent->shead, &dummy_chain) == NULL) {
1489                                 break;
1490                         }
1491                 } else if (base[i].type == 0) {
1492                         dummy_chain.index = i;
1493                         if (SPLAY_FIND(hammer2_chain_splay,
1494                                        &parent->shead, &dummy_chain) == NULL) {
1495                                 break;
1496                         }
1497                 }
1498         }
1499
1500         /*
1501          * If no free blockref could be found we must create an indirect
1502          * block and move a number of blockrefs into it.  With the parent
1503          * locked we can safely lock each child in order to move it without
1504          * causing a deadlock.
1505          *
1506          * This may return the new indirect block or the old parent depending
1507          * on where the key falls.
1508          */
1509         if (i == count) {
1510                 hammer2_chain_t *nparent;
1511
1512                 nparent = hammer2_chain_create_indirect(hmp, parent,
1513                                                         key, keybits);
1514                 if (nparent == NULL) {
1515                         if (allocated)
1516                                 hammer2_chain_free(hmp, chain);
1517                         chain = NULL;
1518                         goto done;
1519                 }
1520                 if (parent != nparent) {
1521                         if (unlock_parent)
1522                                 hammer2_chain_unlock(hmp, parent);
1523                         parent = nparent;
1524                         unlock_parent = 1;
1525                 }
1526                 goto again;
1527         }
1528
1529         /*
1530          * Link the chain into its parent.  Later on we will have to set
1531          * the MOVED bit in situations where we don't mark the new chain
1532          * as being modified.
1533          */
1534         if (chain->parent != NULL)
1535                 panic("hammer2: hammer2_chain_create: chain already connected");
1536         KKASSERT(chain->parent == NULL);
1537         chain->parent = parent;
1538         chain->index = i;
1539         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
1540                 panic("hammer2_chain_link: collision");
1541         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
1542         KKASSERT(parent->refs > 0);
1543         atomic_add_int(&parent->refs, 1);
1544
1545         /*
1546          * Additional linkage for inodes.  Reuse the parent pointer to
1547          * find the parent directory.
1548          *
1549          * Cumulative adjustments are inherited on [re]attach and will
1550          * propagate up the tree on the next flush.
1551          */
1552         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1553                 hammer2_chain_t *scan = parent;
1554                 hammer2_inode_t *ip = chain->u.ip;
1555
1556                 while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1557                         scan = scan->parent;
1558                 if (scan->bref.type == HAMMER2_BREF_TYPE_INODE) {
1559                         ip->pip = scan->u.ip;
1560                         ip->pmp = scan->u.ip->pmp;
1561                         ip->depth = scan->u.ip->depth + 1;
1562                         ip->pip->delta_icount += ip->ip_data.inode_count;
1563                         ip->pip->delta_dcount += ip->ip_data.data_count;
1564                         ++ip->pip->delta_icount;
1565                 }
1566         }
1567
1568         /*
1569          * (allocated) indicates that this is a newly-created chain element
1570          * rather than a renamed chain element.  In this situation we want
1571          * to place the chain element in the MODIFIED state.
1572          *
1573          * The data area will be set up as follows:
1574          *
1575          *      VOLUME          not allowed here.
1576          *
1577          *      INODE           embedded data are will be set-up.
1578          *
1579          *      INDIRECT        not allowed here.
1580          *
1581          *      DATA            no data area will be set-up (caller is expected
1582          *                      to have logical buffers, we don't want to alias
1583          *                      the data onto device buffers!).
1584          */
1585         if (allocated) {
1586                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1587                         hammer2_chain_modify(hmp, chain,
1588                                              HAMMER2_MODIFY_OPTDATA);
1589                 } else if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1590                         /* not supported in this function */
1591                         panic("hammer2_chain_create: bad type");
1592                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1593                         hammer2_chain_modify(hmp, chain,
1594                                              HAMMER2_MODIFY_OPTDATA);
1595                 } else {
1596                         hammer2_chain_modify(hmp, chain, 0);
1597                 }
1598         } else {
1599                 /*
1600                  * When reconnecting inodes we have to call setsubmod()
1601                  * to ensure that its state propagates up the newly
1602                  * connected parent.
1603                  *
1604                  * Make sure MOVED is set but do not update bref_flush.  If
1605                  * the chain is undergoing modification bref_flush will be
1606                  * updated when it gets flushed.  If it is not then the
1607                  * bref may not have been flushed yet and we do not want to
1608                  * set MODIFIED here as this could result in unnecessary
1609                  * reallocations.
1610                  */
1611                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1612                         hammer2_chain_ref(hmp, chain);
1613                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1614                 }
1615                 hammer2_chain_parent_setsubmod(hmp, chain);
1616         }
1617
1618 done:
1619         if (unlock_parent)
1620                 hammer2_chain_unlock(hmp, parent);
1621         return (chain);
1622 }
1623
1624 /*
1625  * Create an indirect block that covers one or more of the elements in the
1626  * current parent.  Either returns the existing parent with no locking or
1627  * ref changes or returns the new indirect block locked and referenced
1628  * and leaving the original parent lock/ref intact as well.
1629  *
1630  * The returned chain depends on where the specified key falls.
1631  *
1632  * The key/keybits for the indirect mode only needs to follow three rules:
1633  *
1634  * (1) That all elements underneath it fit within its key space and
1635  *
1636  * (2) That all elements outside it are outside its key space.
1637  *
1638  * (3) When creating the new indirect block any elements in the current
1639  *     parent that fit within the new indirect block's keyspace must be
1640  *     moved into the new indirect block.
1641  *
1642  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
1643  *     keyspace the the current parent, but lookup/iteration rules will
1644  *     ensure (and must ensure) that rule (2) for all parents leading up
1645  *     to the nearest inode or the root volume header is adhered to.  This
1646  *     is accomplished by always recursing through matching keyspaces in
1647  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
1648  *
1649  * The current implementation calculates the current worst-case keyspace by
1650  * iterating the current parent and then divides it into two halves, choosing
1651  * whichever half has the most elements (not necessarily the half containing
1652  * the requested key).
1653  *
1654  * We can also opt to use the half with the least number of elements.  This
1655  * causes lower-numbered keys (aka logical file offsets) to recurse through
1656  * fewer indirect blocks and higher-numbered keys to recurse through more.
1657  * This also has the risk of not moving enough elements to the new indirect
1658  * block and being forced to create several indirect blocks before the element
1659  * can be inserted.
1660  */
1661 static
1662 hammer2_chain_t *
1663 hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1664                               hammer2_key_t create_key, int create_bits)
1665 {
1666         hammer2_blockref_t *base;
1667         hammer2_blockref_t *bref;
1668         hammer2_chain_t *chain;
1669         hammer2_chain_t *ichain;
1670         hammer2_chain_t dummy;
1671         hammer2_key_t key = create_key;
1672         int keybits = create_bits;
1673         int locount = 0;
1674         int hicount = 0;
1675         int count;
1676         int nbytes;
1677         int i;
1678
1679         /*
1680          * Calculate the base blockref pointer or NULL if the chain
1681          * is known to be empty.  We need to calculate the array count
1682          * for SPLAY lookups either way.
1683          */
1684         hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
1685         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1686                 base = NULL;
1687
1688                 switch(parent->bref.type) {
1689                 case HAMMER2_BREF_TYPE_INODE:
1690                         count = HAMMER2_SET_COUNT;
1691                         break;
1692                 case HAMMER2_BREF_TYPE_INDIRECT:
1693                         count = parent->bytes / sizeof(hammer2_blockref_t);
1694                         break;
1695                 case HAMMER2_BREF_TYPE_VOLUME:
1696                         count = HAMMER2_SET_COUNT;
1697                         break;
1698                 default:
1699                         panic("hammer2_chain_create_indirect: "
1700                               "unrecognized blockref type: %d",
1701                               parent->bref.type);
1702                         count = 0;
1703                         break;
1704                 }
1705         } else {
1706                 switch(parent->bref.type) {
1707                 case HAMMER2_BREF_TYPE_INODE:
1708                         base = &parent->data->ipdata.u.blockset.blockref[0];
1709                         count = HAMMER2_SET_COUNT;
1710                         break;
1711                 case HAMMER2_BREF_TYPE_INDIRECT:
1712                         base = &parent->data->npdata.blockref[0];
1713                         count = parent->bytes / sizeof(hammer2_blockref_t);
1714                         break;
1715                 case HAMMER2_BREF_TYPE_VOLUME:
1716                         base = &hmp->voldata.sroot_blockset.blockref[0];
1717                         count = HAMMER2_SET_COUNT;
1718                         break;
1719                 default:
1720                         panic("hammer2_chain_create_indirect: "
1721                               "unrecognized blockref type: %d",
1722                               parent->bref.type);
1723                         count = 0;
1724                         break;
1725                 }
1726         }
1727
1728         /*
1729          * Scan for an unallocated bref, also skipping any slots occupied
1730          * by in-memory chain elements which may not yet have been updated
1731          * in the parent's bref array.
1732          */
1733         bzero(&dummy, sizeof(dummy));
1734         for (i = 0; i < count; ++i) {
1735                 int nkeybits;
1736
1737                 dummy.index = i;
1738                 chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
1739                 if (chain) {
1740                         bref = &chain->bref;
1741                 } else if (base && base[i].type) {
1742                         bref = &base[i];
1743                 } else {
1744                         continue;
1745                 }
1746
1747                 /*
1748                  * Expand our calculated key range (key, keybits) to fit
1749                  * the scanned key.  nkeybits represents the full range
1750                  * that we will later cut in half (two halves @ nkeybits - 1).
1751                  */
1752                 nkeybits = keybits;
1753                 if (nkeybits < bref->keybits)
1754                         nkeybits = bref->keybits;
1755                 while (nkeybits < 64 &&
1756                        (~(((hammer2_key_t)1 << nkeybits) - 1) &
1757                         (key ^ bref->key)) != 0) {
1758                         ++nkeybits;
1759                 }
1760
1761                 /*
1762                  * If the new key range is larger we have to determine
1763                  * which side of the new key range the existing keys fall
1764                  * under by checking the high bit, then collapsing the
1765                  * locount into the hicount or vise-versa.
1766                  */
1767                 if (keybits != nkeybits) {
1768                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
1769                                 hicount += locount;
1770                                 locount = 0;
1771                         } else {
1772                                 locount += hicount;
1773                                 hicount = 0;
1774                         }
1775                         keybits = nkeybits;
1776                 }
1777
1778                 /*
1779                  * The newly scanned key will be in the lower half or the
1780                  * higher half of the (new) key range.
1781                  */
1782                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
1783                         ++hicount;
1784                 else
1785                         ++locount;
1786         }
1787
1788         /*
1789          * Adjust keybits to represent half of the full range calculated
1790          * above (radix 63 max)
1791          */
1792         --keybits;
1793
1794         /*
1795          * Select whichever half contains the most elements.  Theoretically
1796          * we can select either side as long as it contains at least one
1797          * element (in order to ensure that a free slot is present to hold
1798          * the indirect block).
1799          */
1800         key &= ~(((hammer2_key_t)1 << keybits) - 1);
1801         if (hammer2_indirect_optimize) {
1802                 /*
1803                  * Insert node for least number of keys, this will arrange
1804                  * the first few blocks of a large file or the first few
1805                  * inodes in a directory with fewer indirect blocks when
1806                  * created linearly.
1807                  */
1808                 if (hicount < locount && hicount != 0)
1809                         key |= (hammer2_key_t)1 << keybits;
1810                 else
1811                         key &= ~(hammer2_key_t)1 << keybits;
1812         } else {
1813                 /*
1814                  * Insert node for most number of keys, best for heavily
1815                  * fragmented files.
1816                  */
1817                 if (hicount > locount)
1818                         key |= (hammer2_key_t)1 << keybits;
1819                 else
1820                         key &= ~(hammer2_key_t)1 << keybits;
1821         }
1822
1823         /*
1824          * How big should our new indirect block be?  It has to be at least
1825          * as large as its parent.
1826          */
1827         if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
1828                 nbytes = HAMMER2_IND_BYTES_MIN;
1829         else
1830                 nbytes = HAMMER2_IND_BYTES_MAX;
1831         if (nbytes < count * sizeof(hammer2_blockref_t))
1832                 nbytes = count * sizeof(hammer2_blockref_t);
1833
1834         /*
1835          * Ok, create our new indirect block
1836          */
1837         dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
1838         dummy.bref.key = key;
1839         dummy.bref.keybits = keybits;
1840         dummy.bref.data_off = hammer2_bytes_to_radix(nbytes);
1841         ichain = hammer2_chain_alloc(hmp, &dummy.bref);
1842         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
1843
1844         /*
1845          * Iterate the original parent and move the matching brefs into
1846          * the new indirect block.
1847          */
1848         for (i = 0; i < count; ++i) {
1849                 /*
1850                  * For keying purposes access the bref from the media or
1851                  * from our in-memory cache.  In cases where the in-memory
1852                  * cache overrides the media the keyrefs will be the same
1853                  * anyway so we can avoid checking the cache when the media
1854                  * has a key.
1855                  */
1856                 dummy.index = i;
1857                 chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
1858                 if (chain) {
1859                         bref = &chain->bref;
1860                 } else if (base && base[i].type) {
1861                         bref = &base[i];
1862                 } else {
1863                         if (ichain->index < 0)
1864                                 ichain->index = i;
1865                         continue;
1866                 }
1867
1868                 /*
1869                  * Skip keys not in the chosen half (low or high), only bit
1870                  * (keybits - 1) needs to be compared but for safety we
1871                  * will compare all msb bits plus that bit again.
1872                  */
1873                 if ((~(((hammer2_key_t)1 << keybits) - 1) &
1874                     (key ^ bref->key)) != 0) {
1875                         continue;
1876                 }
1877
1878                 /*
1879                  * This element is being moved from the parent, its slot
1880                  * is available for our new indirect block.
1881                  */
1882                 if (ichain->index < 0)
1883                         ichain->index = i;
1884
1885                 /*
1886                  * Load the new indirect block by acquiring or allocating
1887                  * the related chain entries, then simply move them to the
1888                  * new parent (ichain).
1889                  *
1890                  * When adjusting the parent/child relationship we must
1891                  * set the MOVED bit but we do NOT update bref_flush
1892                  * because otherwise we might synchronize a bref that has
1893                  * not yet been flushed.  We depend on chain's bref_flush
1894                  * either being correct or the chain being in a MODIFIED
1895                  * state.
1896                  *
1897                  * We do not want to set MODIFIED here as this would result
1898                  * in unnecessary reallocations.
1899                  *
1900                  * We must still set SUBMODIFIED in the parent but we do
1901                  * that after the loop.
1902                  *
1903                  * XXX we really need a lock here but we don't need the
1904                  *     data.  NODATA feature needed.
1905                  */
1906                 chain = hammer2_chain_get(hmp, parent, i,
1907                                           HAMMER2_LOOKUP_NODATA);
1908                 SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
1909                 if (SPLAY_INSERT(hammer2_chain_splay, &ichain->shead, chain))
1910                         panic("hammer2_chain_create_indirect: collision");
1911                 chain->parent = ichain;
1912                 if (base)
1913                         bzero(&base[i], sizeof(base[i]));
1914                 atomic_add_int(&parent->refs, -1);
1915                 atomic_add_int(&ichain->refs, 1);
1916                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1917                         hammer2_chain_ref(hmp, chain);
1918                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1919                 }
1920                 hammer2_chain_unlock(hmp, chain);
1921                 KKASSERT(parent->refs > 0);
1922                 chain = NULL;
1923         }
1924
1925         /*
1926          * Insert the new indirect block into the parent now that we've
1927          * cleared out some entries in the parent.  We calculated a good
1928          * insertion index in the loop above (ichain->index).
1929          *
1930          * We don't have to set MOVED here because we mark ichain modified
1931          * down below (so the normal modified -> flush -> set-moved sequence
1932          * applies).
1933          */
1934         KKASSERT(ichain->index >= 0);
1935         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, ichain))
1936                 panic("hammer2_chain_create_indirect: ichain insertion");
1937         ichain->parent = parent;
1938         atomic_add_int(&parent->refs, 1);
1939
1940         /*
1941          * Mark the new indirect block modified after insertion, which
1942          * will propagate up through parent all the way to the root and
1943          * also allocate the physical block in ichain for our caller,
1944          * and assign ichain->data to a pre-zero'd space (because there
1945          * is not prior data to copy into it).
1946          *
1947          * We have to set SUBMODIFIED in ichain's flags manually so the
1948          * flusher knows it has to recurse through it to get to all of
1949          * our moved blocks, then call setsubmod() to set the bit
1950          * recursively.
1951          */
1952         hammer2_chain_modify(hmp, ichain, HAMMER2_MODIFY_OPTDATA);
1953         hammer2_chain_parent_setsubmod(hmp, ichain);
1954         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
1955
1956         /*
1957          * Figure out what to return.
1958          */
1959         if (create_bits > keybits) {
1960                 /*
1961                  * Key being created is way outside the key range,
1962                  * return the original parent.
1963                  */
1964                 hammer2_chain_unlock(hmp, ichain);
1965         } else if (~(((hammer2_key_t)1 << keybits) - 1) &
1966                    (create_key ^ key)) {
1967                 /*
1968                  * Key being created is outside the key range,
1969                  * return the original parent.
1970                  */
1971                 hammer2_chain_unlock(hmp, ichain);
1972         } else {
1973                 /*
1974                  * Otherwise its in the range, return the new parent.
1975                  * (leave both the new and old parent locked).
1976                  */
1977                 parent = ichain;
1978         }
1979
1980         return(parent);
1981 }
1982
1983 /*
1984  * Physically delete the specified chain element.  Note that inodes with
1985  * open descriptors should not be deleted (as with other filesystems) until
1986  * the last open descriptor is closed.
1987  *
1988  * This routine will remove the chain element from its parent and potentially
1989  * also recurse upward and delete indirect blocks which become empty as a
1990  * side effect.
1991  *
1992  * The caller must pass a pointer to the chain's parent, also locked and
1993  * referenced.  (*parentp) will be modified in a manner similar to a lookup
1994  * or iteration when indirect blocks are also deleted as a side effect.
1995  *
1996  * XXX This currently does not adhere to the MOVED flag protocol in that
1997  *     the removal is immediately indicated in the parent's blockref[]
1998  *     array.
1999  */
2000 void
2001 hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
2002                      hammer2_chain_t *chain, int retain)
2003 {
2004         hammer2_blockref_t *base;
2005         hammer2_inode_t *ip;
2006         int count;
2007
2008         if (chain->parent != parent)
2009                 panic("hammer2_chain_delete: parent mismatch");
2010
2011         /*
2012          * Mark the parent modified so our base[] pointer remains valid
2013          * while we move entries.  For the optimized indirect block
2014          * case mark the parent moved instead.
2015          *
2016          * Calculate the blockref reference in the parent
2017          */
2018         switch(parent->bref.type) {
2019         case HAMMER2_BREF_TYPE_INODE:
2020                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2021                 base = &parent->data->ipdata.u.blockset.blockref[0];
2022                 count = HAMMER2_SET_COUNT;
2023                 break;
2024         case HAMMER2_BREF_TYPE_INDIRECT:
2025                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA |
2026                                                   HAMMER2_MODIFY_NO_MODIFY_TID);
2027                 if (parent->flags & HAMMER2_CHAIN_INITIAL)
2028                         base = NULL;
2029                 else
2030                         base = &parent->data->npdata.blockref[0];
2031                 count = parent->bytes / sizeof(hammer2_blockref_t);
2032                 break;
2033         case HAMMER2_BREF_TYPE_VOLUME:
2034                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2035                 base = &hmp->voldata.sroot_blockset.blockref[0];
2036                 count = HAMMER2_SET_COUNT;
2037                 break;
2038         default:
2039                 panic("hammer2_chain_delete: unrecognized blockref type: %d",
2040                       parent->bref.type);
2041                 count = 0;
2042                 break;
2043         }
2044
2045         /*
2046          * Disconnect the bref in the parent, remove the chain, and
2047          * disconnect in-memory fields from the parent.
2048          */
2049         KKASSERT(chain->index >= 0 && chain->index < count);
2050         if (base)
2051                 bzero(&base[chain->index], sizeof(*base));
2052
2053         SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
2054         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
2055         atomic_add_int(&parent->refs, -1);      /* for splay entry */
2056         chain->index = -1;
2057         chain->parent = NULL;
2058
2059         /*
2060          * Cumulative adjustments must be propagated to the parent inode
2061          * when deleting and synchronized to ip.
2062          *
2063          * NOTE:  We do not propagate ip->delta_*count to the parent because
2064          *        these represent adjustments that have not yet been
2065          *        propagated upward, so we don't need to remove them from
2066          *        the parent.
2067          *
2068          * Clear the pointer to the parent inode.
2069          */
2070         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2071                 ip = chain->u.ip;
2072                 if (ip->pip) {
2073                         ip->pip->delta_icount -= ip->ip_data.inode_count;
2074                         ip->pip->delta_dcount -= ip->ip_data.data_count;
2075                         ip->ip_data.inode_count += ip->delta_icount;
2076                         ip->ip_data.data_count += ip->delta_dcount;
2077                         ip->delta_icount = 0;
2078                         ip->delta_dcount = 0;
2079                         --ip->pip->delta_icount;
2080                         ip->pip = NULL;
2081                 }
2082                 chain->u.ip->depth = 0;
2083         }
2084
2085         /*
2086          * If retain is 0 the deletion is permanent.  Because the chain is
2087          * no longer connected to the topology a flush will have no
2088          * visibility into it.  We must dispose of the references related
2089          * to the MODIFIED and MOVED flags, otherwise the ref count will
2090          * never transition to 0.
2091          *
2092          * If retain is non-zero the deleted element is likely an inode
2093          * which the vnops frontend will mark DESTROYED and flush.  In that
2094          * situation we must retain the flags for any open file descriptors
2095          * on the (removed) inode.  The final close will destroy the
2096          * disconnected chain.
2097          */
2098         if (retain == 0) {
2099                 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
2100                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
2101                         hammer2_chain_drop(hmp, chain);
2102                 }
2103                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2104                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2105                         hammer2_chain_drop(hmp, chain);
2106                 }
2107         }
2108
2109         /*
2110          * The chain is still likely referenced, possibly even by a vnode
2111          * (if an inode), so defer further action until the chain gets
2112          * dropped.
2113          */
2114 }
2115
2116 /*
2117  * Recursively flush the specified chain.  The chain is locked and
2118  * referenced by the caller and will remain so on return.  The chain
2119  * will remain referenced throughout but can temporarily lose its
2120  * lock during the recursion to avoid unnecessarily stalling user
2121  * processes.
2122  *
2123  *
2124  */
2125 TAILQ_HEAD(flush_deferral_list, hammer2_chain);
2126
2127 struct hammer2_flush_info {
2128         struct flush_deferral_list flush_list;
2129         int             depth;
2130         hammer2_tid_t   modify_tid;
2131 };
2132
2133 typedef struct hammer2_flush_info hammer2_flush_info_t;
2134
2135 static void
2136 hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
2137                           hammer2_flush_info_t *info)
2138 {
2139         hammer2_blockref_t *bref;
2140         hammer2_off_t pbase;
2141         size_t bbytes;
2142         size_t boff;
2143         char *bdata;
2144         struct buf *bp;
2145         int error;
2146         int wasmodified;
2147
2148         /*
2149          * If we hit the stack recursion depth limit defer the operation.
2150          * The controller of the info structure will execute the deferral
2151          * list and then retry.
2152          *
2153          * This is only applicable if SUBMODIFIED is set.  After a reflush
2154          * SUBMODIFIED will probably be cleared and we want to drop through
2155          * to finish processing the current element so our direct parent
2156          * can process the results.
2157          */
2158         if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT &&
2159             (chain->flags & HAMMER2_CHAIN_SUBMODIFIED)) {
2160                 if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
2161                         hammer2_chain_ref(hmp, chain);
2162                         TAILQ_INSERT_TAIL(&info->flush_list,
2163                                           chain, flush_node);
2164                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
2165                 }
2166                 return;
2167         }
2168
2169         if (hammer2_debug & 0x0008)
2170                 kprintf("%*.*sCHAIN type=%d@%08jx %p/%d %04x {\n",
2171                         info->depth, info->depth, "",
2172                         chain->bref.type, chain->bref.data_off,
2173                         chain, chain->refs, chain->flags);
2174
2175         /*
2176          * If SUBMODIFIED is set we recurse the flush and adjust the
2177          * blockrefs accordingly.
2178          *
2179          * NOTE: Looping on SUBMODIFIED can prevent a flush from ever
2180          *       finishing in the face of filesystem activity.
2181          */
2182         if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
2183                 hammer2_chain_t *child;
2184                 hammer2_chain_t *next;
2185                 hammer2_blockref_t *base;
2186                 int count;
2187
2188                 /*
2189                  * Clear SUBMODIFIED to catch races.  Note that if any
2190                  * child has to be flushed SUBMODIFIED will wind up being
2191                  * set again (for next time), but this does not stop us from
2192                  * synchronizing block updates which occurred.
2193                  *
2194                  * We don't want to set our chain to MODIFIED gratuitously.
2195                  */
2196                 /* XXX SUBMODIFIED not interlocked, can race */
2197                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2198
2199                 /*
2200                  * Flush the children and update the blockrefs in the chain.
2201                  * Be careful of ripouts during the loop.
2202                  */
2203                 next = SPLAY_MIN(hammer2_chain_splay, &chain->shead);
2204                 if (next)
2205                         hammer2_chain_ref(hmp, next);
2206                 while ((child = next) != NULL) {
2207                         next = SPLAY_NEXT(hammer2_chain_splay,
2208                                           &chain->shead, child);
2209                         if (next)
2210                                 hammer2_chain_ref(hmp, next);
2211                         /*
2212                          * We only recurse if SUBMODIFIED (internal node)
2213                          * or MODIFIED (internal node or leaf) is set.
2214                          * However, we must still track whether any MOVED
2215                          * entries are present to determine if the chain's
2216                          * blockref's need updating or not.
2217                          */
2218                         if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2219                                              HAMMER2_CHAIN_MODIFIED |
2220                                             HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2221                                 hammer2_chain_drop(hmp, child);
2222                                 continue;
2223                         }
2224                         hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_MAYBE);
2225                         hammer2_chain_drop(hmp, child);
2226                         if (child->parent != chain ||
2227                             (child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2228                                              HAMMER2_CHAIN_MODIFIED |
2229                                             HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2230                                 hammer2_chain_unlock(hmp, child);
2231                                 continue;
2232                         }
2233
2234                         /*
2235                          * Propagate the DESTROYED flag if found set, then
2236                          * recurse the flush.
2237                          */
2238                         if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
2239                             (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
2240                                 atomic_set_int(&child->flags,
2241                                                HAMMER2_CHAIN_DESTROYED |
2242                                                HAMMER2_CHAIN_SUBMODIFIED);
2243                         }
2244                         ++info->depth;
2245                         hammer2_chain_flush_pass1(hmp, child, info);
2246                         --info->depth;
2247                         hammer2_chain_unlock(hmp, child);
2248                 }
2249
2250                 /*
2251                  * Now synchronize any block updates.
2252                  */
2253                 next = SPLAY_MIN(hammer2_chain_splay, &chain->shead);
2254                 if (next)
2255                         hammer2_chain_ref(hmp, next);
2256                 while ((child = next) != NULL) {
2257                         next = SPLAY_NEXT(hammer2_chain_splay,
2258                                           &chain->shead, child);
2259                         if (next)
2260                                 hammer2_chain_ref(hmp, next);
2261                         if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
2262                                 hammer2_chain_drop(hmp, child);
2263                                 continue;
2264                         }
2265                         hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_NEVER);
2266                         hammer2_chain_drop(hmp, child);
2267                         if (child->parent != chain ||
2268                             (child->flags & HAMMER2_CHAIN_MOVED) == 0) {
2269                                 hammer2_chain_unlock(hmp, child);
2270                                 continue;
2271                         }
2272
2273                         hammer2_chain_modify(hmp, chain,
2274                                              HAMMER2_MODIFY_NO_MODIFY_TID);
2275
2276                         switch(chain->bref.type) {
2277                         case HAMMER2_BREF_TYPE_INODE:
2278                                 KKASSERT((chain->data->ipdata.op_flags &
2279                                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
2280                                 base = &chain->data->ipdata.u.blockset.
2281                                         blockref[0];
2282                                 count = HAMMER2_SET_COUNT;
2283                                 break;
2284                         case HAMMER2_BREF_TYPE_INDIRECT:
2285                                 base = &chain->data->npdata.blockref[0];
2286                                 count = chain->bytes /
2287                                         sizeof(hammer2_blockref_t);
2288                                 break;
2289                         case HAMMER2_BREF_TYPE_VOLUME:
2290                                 base = &hmp->voldata.sroot_blockset.blockref[0];
2291                                 count = HAMMER2_SET_COUNT;
2292                                 break;
2293                         default:
2294                                 base = NULL;
2295                                 panic("hammer2_chain_get: "
2296                                       "unrecognized blockref type: %d",
2297                                       chain->bref.type);
2298                         }
2299
2300                         KKASSERT(child->index >= 0);
2301                         base[child->index] = child->bref_flush;
2302
2303                         if (chain->bref.mirror_tid <
2304                             child->bref_flush.mirror_tid) {
2305                                 chain->bref.mirror_tid =
2306                                         child->bref_flush.mirror_tid;
2307                         }
2308
2309                         if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME &&
2310                             hmp->voldata.mirror_tid <
2311                             child->bref_flush.mirror_tid) {
2312                                 hmp->voldata.mirror_tid =
2313                                         child->bref_flush.mirror_tid;
2314                         }
2315                         atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
2316                         hammer2_chain_drop(hmp, child); /* MOVED flag */
2317                         hammer2_chain_unlock(hmp, child);
2318                 }
2319         }
2320
2321         /*
2322          * If destroying the object we unconditonally clear the MODIFIED
2323          * and MOVED bits, and we destroy the buffer without writing it
2324          * out.
2325          *
2326          * We don't bother updating the hash/crc or the chain bref.
2327          *
2328          * NOTE: The destroy'd object's bref has already been updated.
2329          *       so we can clear MOVED without propagating mirror_tid
2330          *       or modify_tid upward.
2331          *
2332          * XXX allocations for unflushed data can be returned to the
2333          *     free pool.
2334          */
2335         if (chain->flags & HAMMER2_CHAIN_DESTROYED) {
2336                 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
2337                         if (chain->bp) {
2338                                 chain->bp->b_flags |= B_INVAL|B_RELBUF;
2339                         }
2340                         atomic_clear_int(&chain->flags,
2341                                          HAMMER2_CHAIN_MODIFIED |
2342                                          HAMMER2_CHAIN_MODIFY_TID);
2343                         hammer2_chain_drop(hmp, chain);
2344                 }
2345                 if (chain->flags & HAMMER2_CHAIN_MODIFIED_AUX) {
2346                         atomic_clear_int(&chain->flags,
2347                                          HAMMER2_CHAIN_MODIFIED_AUX);
2348                 }
2349                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2350                         atomic_clear_int(&chain->flags,
2351                                          HAMMER2_CHAIN_MOVED);
2352                         hammer2_chain_drop(hmp, chain);
2353                 }
2354                 return;
2355         }
2356
2357         /*
2358          * Flush this chain entry only if it is marked modified.
2359          */
2360         if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
2361                              HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2362                 goto done;
2363         }
2364
2365         /*
2366          * Synchronize cumulative data and inode count adjustments to
2367          * the inode and propagate the deltas upward to the parent.
2368          */
2369         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2370                 hammer2_inode_t *ip;
2371
2372                 ip = chain->u.ip;
2373                 ip->ip_data.inode_count += ip->delta_icount;
2374                 ip->ip_data.data_count += ip->delta_dcount;
2375                 if (ip->pip) {
2376                         ip->pip->delta_icount += ip->delta_icount;
2377                         ip->pip->delta_dcount += ip->delta_dcount;
2378                 }
2379                 ip->delta_icount = 0;
2380                 ip->delta_dcount = 0;
2381         }
2382
2383         /*
2384          * Flush if MODIFIED or MODIFIED_AUX is set.  MODIFIED_AUX is only
2385          * used by the volume header (&hmp->vchain).
2386          */
2387         if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
2388                              HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2389                 goto done;
2390         }
2391         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED_AUX);
2392
2393         /*
2394          * Clear MODIFIED and set HAMMER2_CHAIN_MOVED.  The caller
2395          * will re-test the MOVED bit.  We must also update the mirror_tid
2396          * and modify_tid fields as appropriate.
2397          *
2398          * bits own a single chain ref and the MOVED bit owns its own
2399          * chain ref.
2400          */
2401         chain->bref.mirror_tid = info->modify_tid;
2402         if (chain->flags & HAMMER2_CHAIN_MODIFY_TID)
2403                 chain->bref.modify_tid = info->modify_tid;
2404         wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0;
2405         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
2406                                         HAMMER2_CHAIN_MODIFY_TID);
2407
2408         if (chain->flags & HAMMER2_CHAIN_MOVED) {
2409                 /*
2410                  * Drop the ref from the MODIFIED bit we cleared.
2411                  */
2412                 if (wasmodified)
2413                         hammer2_chain_drop(hmp, chain);
2414         } else {
2415                 /*
2416                  * If we were MODIFIED we inherit the ref from clearing
2417                  * that bit, otherwise we need another ref.
2418                  */
2419                 if (wasmodified == 0)
2420                         hammer2_chain_ref(hmp, chain);
2421                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2422         }
2423         chain->bref_flush = chain->bref;
2424
2425         /*
2426          * If this is part of a recursive flush we can go ahead and write
2427          * out the buffer cache buffer and pass a new bref back up the chain.
2428          *
2429          * This will never be a volume header.
2430          */
2431         switch(chain->bref.type) {
2432         case HAMMER2_BREF_TYPE_VOLUME:
2433                 /*
2434                  * The volume header is flushed manually by the syncer, not
2435                  * here.
2436                  */
2437                 break;
2438         case HAMMER2_BREF_TYPE_DATA:
2439                 /*
2440                  * Data elements have already been flushed via the logical
2441                  * file buffer cache.  Their hash was set in the bref by
2442                  * the vop_write code.
2443                  *
2444                  * Make sure the buffer(s) have been flushed out here.
2445                  */
2446                 bbytes = chain->bytes;
2447                 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
2448                 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
2449
2450                 bp = getblk(hmp->devvp, pbase, bbytes, GETBLK_NOWAIT, 0);
2451                 if (bp) {
2452                         if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
2453                             (B_CACHE | B_DIRTY)) {
2454                                 kprintf("x");
2455                                 cluster_awrite(bp);
2456                         } else {
2457                                 bp->b_flags |= B_RELBUF;
2458                                 brelse(bp);
2459                         }
2460                 }
2461                 break;
2462         case HAMMER2_BREF_TYPE_INDIRECT:
2463                 /*
2464                  * Indirect blocks may be in an INITIAL state.  Use the
2465                  * chain_lock() call to ensure that the buffer has been
2466                  * instantiated (even though it is already locked the buffer
2467                  * might not have been instantiated).
2468                  *
2469                  * Only write the buffer out if it is dirty, it is possible
2470                  * the operating system had already written out the buffer.
2471                  */
2472                 hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_ALWAYS);
2473                 KKASSERT(chain->bp != NULL);
2474
2475                 bp = chain->bp;
2476                 if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) ||
2477                     (bp->b_flags & B_DIRTY)) {
2478                         bawrite(chain->bp);
2479                 } else {
2480                         brelse(chain->bp);
2481                 }
2482                 chain->bp = NULL;
2483                 chain->data = NULL;
2484                 hammer2_chain_unlock(hmp, chain);
2485                 break;
2486         default:
2487                 /*
2488                  * Embedded elements have to be flushed out.
2489                  */
2490                 KKASSERT(chain->data != NULL);
2491                 KKASSERT(chain->bp == NULL);
2492                 bref = &chain->bref;
2493
2494                 KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
2495
2496                 if (chain->bp == NULL) {
2497                         /*
2498                          * The data is embedded, we have to acquire the
2499                          * buffer cache buffer and copy the data into it.
2500                          */
2501                         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
2502                                 bbytes = HAMMER2_MINIOSIZE;
2503                         pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
2504                         boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
2505
2506                         /*
2507                          * The getblk() optimization can only be used if the
2508                          * physical block size matches the request.
2509                          */
2510                         if (chain->bytes == bbytes) {
2511                                 bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
2512                                 error = 0;
2513                         } else {
2514                                 error = bread(hmp->devvp, pbase, bbytes, &bp);
2515                                 KKASSERT(error == 0);
2516                         }
2517                         bdata = (char *)bp->b_data + boff;
2518
2519                         /*
2520                          * Copy the data to the buffer, mark the buffer
2521                          * dirty, and convert the chain to unmodified.
2522                          *
2523                          * We expect we might have to make adjustments to
2524                          * non-data delayed-write buffers when doing an
2525                          * actual flush so use bawrite() instead of
2526                          * cluster_awrite() here.
2527                          */
2528                         bcopy(chain->data, bdata, chain->bytes);
2529                         bp->b_flags |= B_CLUSTEROK;
2530                         bawrite(bp);
2531                         bp = NULL;
2532                         chain->bref.check.iscsi32.value =
2533                                 hammer2_icrc32(chain->data, chain->bytes);
2534                         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
2535                                 ++hammer2_iod_meta_write;
2536                         else
2537                                 ++hammer2_iod_indr_write;
2538                 } else {
2539                         chain->bref.check.iscsi32.value =
2540                                 hammer2_icrc32(chain->data, chain->bytes);
2541                 }
2542         }
2543
2544         /*
2545          * Adjustments to the bref.  The caller will use this to adjust
2546          * our chain's pointer to this chain element.
2547          */
2548         bref = &chain->bref;
2549
2550         switch(bref->type) {
2551         case HAMMER2_BREF_TYPE_VOLUME:
2552                 KKASSERT(chain->data != NULL);
2553                 KKASSERT(chain->bp == NULL);
2554
2555                 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
2556                         hammer2_icrc32(
2557                                 (char *)&hmp->voldata +
2558                                  HAMMER2_VOLUME_ICRC1_OFF,
2559                                 HAMMER2_VOLUME_ICRC1_SIZE);
2560                 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
2561                         hammer2_icrc32(
2562                                 (char *)&hmp->voldata +
2563                                  HAMMER2_VOLUME_ICRC0_OFF,
2564                                 HAMMER2_VOLUME_ICRC0_SIZE);
2565                 hmp->voldata.icrc_volheader =
2566                         hammer2_icrc32(
2567                                 (char *)&hmp->voldata +
2568                                  HAMMER2_VOLUME_ICRCVH_OFF,
2569                                 HAMMER2_VOLUME_ICRCVH_SIZE);
2570                 break;
2571         default:
2572                 break;
2573
2574         }
2575 done:
2576         if (hammer2_debug & 0x0008) {
2577                 kprintf("%*.*s} %p/%d %04x ",
2578                         info->depth, info->depth, "",
2579                         chain, chain->refs, chain->flags);
2580         }
2581 }
2582
2583 #if 0
2584 /*
2585  * PASS2 - not yet implemented (should be called only with the root chain?)
2586  */
2587 static void
2588 hammer2_chain_flush_pass2(hammer2_mount_t *hmp, hammer2_chain_t *chain)
2589 {
2590 }
2591 #endif
2592
2593 /*
2594  * Stand-alone flush.  If the chain is unable to completely flush we have
2595  * to be sure that SUBMODIFIED propagates up the parent chain.  We must not
2596  * clear the MOVED bit after flushing in this situation or our desynchronized
2597  * bref will not properly update in the parent.
2598  *
2599  * This routine can be called from several places but the most important
2600  * is from the hammer2_vop_reclaim() function.  We want to try to completely
2601  * clean out the inode structure to prevent disconnected inodes from
2602  * building up and blowing out the kmalloc pool.
2603  *
2604  * If modify_tid is 0 (usual case), a new modify_tid is allocated and
2605  * applied to the flush.  The depth-limit handling code is the only
2606  * code which passes a non-zero modify_tid to hammer2_chain_flush().
2607  */
2608 void
2609 hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
2610                     hammer2_tid_t modify_tid)
2611 {
2612         hammer2_chain_t *parent;
2613         hammer2_chain_t *scan;
2614         hammer2_blockref_t *base;
2615         hammer2_flush_info_t info;
2616         int count;
2617         int reflush;
2618
2619         /*
2620          * Execute the recursive flush and handle deferrals.
2621          *
2622          * Chains can be ridiculously long (thousands deep), so to
2623          * avoid blowing out the kernel stack the recursive flush has a
2624          * depth limit.  Elements at the limit are placed on a list
2625          * for re-execution after the stack has been popped.
2626          */
2627         bzero(&info, sizeof(info));
2628         TAILQ_INIT(&info.flush_list);
2629
2630         if (modify_tid == 0) {
2631                 hammer2_voldata_lock(hmp);
2632                 info.modify_tid = hmp->voldata.alloc_tid++;
2633                 atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
2634                 hammer2_voldata_unlock(hmp);
2635         } else {
2636                 info.modify_tid = modify_tid;
2637         }
2638         reflush = 1;
2639
2640         while (reflush) {
2641                 /*
2642                  * Primary recursion
2643                  */
2644                 hammer2_chain_flush_pass1(hmp, chain, &info);
2645                 reflush = 0;
2646
2647                 while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
2648                         /*
2649                          * Secondary recursion.  Note that a reference is
2650                          * retained from the element's presence on the
2651                          * deferral list.
2652                          */
2653                         KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
2654                         TAILQ_REMOVE(&info.flush_list, scan, flush_node);
2655                         atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);
2656
2657                         /*
2658                          * Now that we've popped back up we can do a secondary
2659                          * recursion on the deferred elements.
2660                          */
2661                         if (hammer2_debug & 0x0040)
2662                                 kprintf("defered flush %p\n", scan);
2663                         hammer2_chain_lock(hmp, scan, HAMMER2_RESOLVE_MAYBE);
2664                         hammer2_chain_flush(hmp, scan, info.modify_tid);
2665                         hammer2_chain_unlock(hmp, scan);
2666
2667                         /*
2668                          * Only flag a reflush if SUBMODIFIED is no longer
2669                          * set.  If SUBMODIFIED is set the element will just
2670                          * wind up on our flush_list again.
2671                          */
2672                         if ((scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2673                                             HAMMER2_CHAIN_MODIFIED |
2674                                             HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2675                                 reflush = 1;
2676                         }
2677                         hammer2_chain_drop(hmp, scan);
2678                 }
2679                 if ((hammer2_debug & 0x0040) && reflush)
2680                         kprintf("reflush %p\n", chain);
2681         }
2682
2683         /*
2684          * The SUBMODIFIED bit must propagate upward if the chain could not
2685          * be completely flushed.
2686          */
2687         if (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2688                             HAMMER2_CHAIN_MODIFIED |
2689                             HAMMER2_CHAIN_MODIFIED_AUX |
2690                             HAMMER2_CHAIN_MOVED)) {
2691                 hammer2_chain_parent_setsubmod(hmp, chain);
2692         }
2693
2694         /*
2695          * If the only thing left is a simple bref update try to
2696          * pro-actively update the parent, otherwise return early.
2697          */
2698         parent = chain->parent;
2699         if (parent == NULL) {
2700                 return;
2701         }
2702         if (chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
2703             (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2704                              HAMMER2_CHAIN_MODIFIED |
2705                              HAMMER2_CHAIN_MODIFIED_AUX |
2706                              HAMMER2_CHAIN_MOVED)) != HAMMER2_CHAIN_MOVED) {
2707                 return;
2708         }
2709
2710         /*
2711          * We are locking backwards so allow the lock to fail
2712          */
2713         if (lockmgr(&parent->lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
2714                 return;
2715         }
2716
2717         /*
2718          * We are updating brefs but we have to call chain_modify()
2719          * because our caller is not being run from a recursive flush.
2720          *
2721          * This will also chain up the parent list and set the SUBMODIFIED
2722          * flag.
2723          *
2724          * We do not want to set HAMMER2_CHAIN_MODIFY_TID here because the
2725          * modification is only related to updating a bref in the parent.
2726          *
2727          * When updating the blockset embedded in the volume header we must
2728          * also update voldata.mirror_tid.
2729          */
2730         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
2731         hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2732
2733         switch(parent->bref.type) {
2734         case HAMMER2_BREF_TYPE_INODE:
2735                 base = &parent->data->ipdata.u.blockset.
2736                         blockref[0];
2737                 count = HAMMER2_SET_COUNT;
2738                 break;
2739         case HAMMER2_BREF_TYPE_INDIRECT:
2740                 base = &parent->data->npdata.blockref[0];
2741                 count = parent->bytes /
2742                         sizeof(hammer2_blockref_t);
2743                 break;
2744         case HAMMER2_BREF_TYPE_VOLUME:
2745                 base = &hmp->voldata.sroot_blockset.blockref[0];
2746                 count = HAMMER2_SET_COUNT;
2747                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2748                         if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
2749                                 hmp->voldata.mirror_tid =
2750                                         chain->bref.mirror_tid;
2751                         }
2752                 }
2753                 break;
2754         default:
2755                 base = NULL;
2756                 panic("hammer2_chain_flush: "
2757                       "unrecognized blockref type: %d",
2758                       parent->bref.type);
2759         }
2760
2761         /*
2762          * Update the blockref in the parent.  We do not have to set
2763          * MOVED in the parent because the parent has been marked modified,
2764          * so the flush sequence will pick up the bref change.
2765          *
2766          * We do have to propagate mirror_tid upward.
2767          */
2768         KKASSERT(chain->index >= 0 &&
2769                  chain->index < count);
2770         KKASSERT(chain->parent == parent);
2771         if (chain->flags & HAMMER2_CHAIN_MOVED) {
2772                 base[chain->index] = chain->bref_flush;
2773                 if (parent->bref.mirror_tid < chain->bref_flush.mirror_tid)
2774                         parent->bref.mirror_tid = chain->bref_flush.mirror_tid;
2775                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2776                 hammer2_chain_drop(hmp, chain);
2777         } else if (bcmp(&base[chain->index], &chain->bref_flush,
2778                    sizeof(chain->bref)) != 0) {
2779                 panic("hammer2: unflagged bref update(2)");
2780         }
2781
2782         lockmgr(&parent->lk, LK_RELEASE);       /* release manual lockmgr op */
2783         hammer2_chain_unlock(hmp, parent);
2784 }