Merge branches 'hammer2' and 'master' of ssh://crater.dragonflybsd.org/repository...
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem handles direct and indirect block searches, recursions,
37  * creation, and deletion.  Chains of blockrefs are tracked and modifications
38  * are flag for propagation... eventually all the way back to the volume
39  * header.
40  */
41
42 #include <sys/cdefs.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/types.h>
46 #include <sys/lock.h>
47 #include <sys/uuid.h>
48
49 #include "hammer2.h"
50
51 static int hammer2_indirect_optimize;   /* XXX SYSCTL */
52
53 static hammer2_chain_t *hammer2_chain_create_indirect(
54                         hammer2_mount_t *hmp, hammer2_chain_t *parent,
55                         hammer2_key_t key, int keybits);
56
57 /*
58  * Splay tree
59  */
60 SPLAY_GENERATE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
61
62 int
63 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
64 {
65         return(chain2->index - chain1->index);
66 }
67
68 /*
69  * Recursively mark the parent chain elements so flushes can find
70  * modified elements.  Stop when we hit a chain already flagged
71  * SUBMODIFIED, but ignore the SUBMODIFIED bit that might be set
72  * in chain itself.
73  *
74  * SUBMODIFIED is not set on the chain passed in.
75  *
76  * XXX rename of parent can create a SMP race
77  */
78 static void
79 hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
80 {
81         hammer2_chain_t *parent;
82
83         parent = chain->parent;
84         while (parent && (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
85                 atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);
86                 parent = parent->parent;
87         }
88 }
89
90 /*
91  * Allocate a new disconnected chain element representing the specified
92  * bref.  The chain element is locked exclusively and refs is set to 1.
93  *
94  * This essentially allocates a system memory structure representing one
95  * of the media structure types, including inodes.
96  */
97 hammer2_chain_t *
98 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
99 {
100         hammer2_chain_t *chain;
101         hammer2_inode_t *ip;
102         hammer2_indblock_t *np;
103         hammer2_data_t *dp;
104         u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
105
106         /*
107          * Construct the appropriate system structure.
108          */
109         switch(bref->type) {
110         case HAMMER2_BREF_TYPE_INODE:
111                 ip = kmalloc(sizeof(*ip), hmp->minode, M_WAITOK | M_ZERO);
112                 chain = &ip->chain;
113                 chain->u.ip = ip;
114                 lockinit(&chain->lk, "inode", 0, LK_CANRECURSE);
115                 ip->hmp = hmp;
116                 break;
117         case HAMMER2_BREF_TYPE_INDIRECT:
118                 np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
119                 chain = &np->chain;
120                 chain->u.np = np;
121                 lockinit(&chain->lk, "iblk", 0, LK_CANRECURSE);
122                 break;
123         case HAMMER2_BREF_TYPE_DATA:
124                 dp = kmalloc(sizeof(*dp), hmp->mchain, M_WAITOK | M_ZERO);
125                 chain = &dp->chain;
126                 chain->u.dp = dp;
127                 lockinit(&chain->lk, "dblk", 0, LK_CANRECURSE);
128                 break;
129         case HAMMER2_BREF_TYPE_VOLUME:
130                 chain = NULL;
131                 panic("hammer2_chain_alloc volume type illegal for op");
132         default:
133                 chain = NULL;
134                 panic("hammer2_chain_alloc: unrecognized blockref type: %d",
135                       bref->type);
136         }
137
138         /*
139          * Only set bref_flush if the bref has a real media offset, otherwise
140          * the caller has to wait for the chain to be modified/block-allocated
141          * before a blockref can be synchronized with its (future) parent.
142          */
143         chain->bref = *bref;
144         if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
145                 chain->bref_flush = *bref;
146         chain->index = -1;              /* not yet assigned */
147         chain->refs = 1;
148         chain->bytes = bytes;
149         lockmgr(&chain->lk, LK_EXCLUSIVE);
150
151         return (chain);
152 }
153
154 /*
155  * Free a disconnected chain element
156  */
157 void
158 hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
159 {
160         void *mem;
161
162         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
163             chain->bref.type == HAMMER2_BREF_TYPE_VOLUME) {
164                 chain->data = NULL;
165         }
166
167         KKASSERT(chain->bp == NULL);
168         KKASSERT(chain->data == NULL);
169         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
170                  chain->u.ip->vp == NULL);
171
172         if ((mem = chain->u.mem) != NULL) {
173                 chain->u.mem = NULL;
174                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
175                         kfree(mem, hmp->minode);
176                 else
177                         kfree(mem, hmp->mchain);
178         }
179 }
180
181 /*
182  * Add a reference to a chain element (for shared access).  The chain
183  * element must already have at least 1 ref controlled by the caller.
184  */
185 void
186 hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
187 {
188         KKASSERT(chain->refs > 0);
189         atomic_add_int(&chain->refs, 1);
190 }
191
192 /*
193  * Drop the callers reference to the chain element.  If the ref count
194  * reaches zero the chain element and its related structure (typically an
195  * inode or indirect block) will be freed and the parent will be
196  * recursively dropped.
197  *
198  * MOVED and MODIFIED elements hold additional references so it should not
199  * be possible for the count on a modified element to drop to 0.
200  *
201  * The chain element must NOT be locked by the caller.
202  *
203  * The parent might or might not be locked by the caller but if so it
204  * will also be referenced so we shouldn't recurse upward.
205  */
206 void
207 hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
208 {
209         hammer2_chain_t *parent;
210         u_int refs;
211
212         while (chain) {
213                 refs = chain->refs;
214                 cpu_ccfence();
215                 KKASSERT(refs > 0);
216                 if (refs == 1) {
217                         KKASSERT(chain != &hmp->vchain);
218                         parent = chain->parent;
219                         if (parent)
220                                 lockmgr(&parent->lk, LK_EXCLUSIVE);
221                         if (atomic_cmpset_int(&chain->refs, 1, 0)) {
222                                 /*
223                                  * Succeeded, recurse and drop parent.
224                                  * These chain elements should be synchronized
225                                  * so no delta data or inode count updates
226                                  * should be needed.
227                                  */
228                                 KKASSERT((chain->flags &
229                                           (HAMMER2_CHAIN_MOVED |
230                                            HAMMER2_CHAIN_MODIFIED)) == 0);
231                                 if (!(chain->flags & HAMMER2_CHAIN_DELETED)) {
232                                         SPLAY_REMOVE(hammer2_chain_splay,
233                                                      &parent->shead, chain);
234                                         atomic_set_int(&chain->flags,
235                                                        HAMMER2_CHAIN_DELETED);
236                                         /* parent refs dropped via recursion */
237                                 }
238                                 chain->parent = NULL;
239                                 if (parent)
240                                         lockmgr(&parent->lk, LK_RELEASE);
241                                 hammer2_chain_free(hmp, chain);
242                                 chain = parent;
243                                 /* recurse on parent */
244                         } else {
245                                 if (parent)
246                                         lockmgr(&parent->lk, LK_RELEASE);
247                                 /* retry the same chain */
248                         }
249                 } else {
250                         if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
251                                 /*
252                                  * Succeeded, count did not reach zero so
253                                  * cut out of the loop.
254                                  */
255                                 break;
256                         }
257                         /* retry the same chain */
258                 }
259         }
260 }
261
262 /*
263  * Ref and lock a chain element, acquiring its data with I/O if necessary,
264  * and specify how you would like the data to be resolved.
265  *
266  * Returns 0 on success or an error code if the data could not be acquired.
267  * The chain element is locked either way.
268  *
269  * The lock is allowed to recurse, multiple locking ops will aggregate
270  * the requested resolve types.  Once data is assigned it will not be
271  * removed until the last unlock.
272  *
273  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
274  *                         (typically used to avoid device/logical buffer
275  *                          aliasing for data)
276  *
277  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
278  *                         the INITIAL-create state (indirect blocks only).
279  *
280  *                         Do not resolve data elements for DATA chains.
281  *                         (typically used to avoid device/logical buffer
282  *                          aliasing for data)
283  *
284  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
285  *
286  *
287  * NOTE: Embedded elements (volume header, inodes) are always resolved
288  *       regardless.
289  *
290  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
291  *       element will instantiate and zero its buffer, and flush it on
292  *       release.
293  *
294  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
295  *       so as not to instantiate a device buffer, which could alias against
296  *       a logical file buffer.  However, if ALWAYS is specified the
297  *       device buffer will be instantiated anyway.
298  */
299 int
300 hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
301 {
302         hammer2_blockref_t *bref;
303         hammer2_off_t pbase;
304         hammer2_off_t peof;
305         size_t boff;
306         size_t bbytes;
307         int error;
308         char *bdata;
309
310         /*
311          * Lock the element.  Under certain conditions this might end up
312          * being a recursive lock.
313          */
314         KKASSERT(chain->refs > 0);
315         atomic_add_int(&chain->refs, 1);
316         lockmgr(&chain->lk, LK_EXCLUSIVE);
317
318         /*
319          * If we already have a valid data pointer no further action is
320          * necessary.
321          */
322         if (chain->data)
323                 return (0);
324
325         /*
326          * Do we have to resolve the data?
327          */
328         switch(how) {
329         case HAMMER2_RESOLVE_NEVER:
330                 return(0);
331         case HAMMER2_RESOLVE_MAYBE:
332                 if (chain->flags & HAMMER2_CHAIN_INITIAL)
333                         return(0);
334                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
335                         return(0);
336                 /* fall through */
337         case HAMMER2_RESOLVE_ALWAYS:
338                 break;
339         }
340
341         /*
342          * We must resolve to a device buffer, either by issuing I/O or
343          * by creating a zero-fill element.  We do not mark the buffer
344          * dirty when creating a zero-fill element (the hammer2_chain_modify()
345          * API must still be used to do that).
346          *
347          * The device buffer is variable-sized in powers of 2 down
348          * to HAMMER2_MINALLOCSIZE (typically 1K).  A 64K physical storage
349          * chunk always contains buffers of the same size. (XXX)
350          *
351          * The minimum physical IO size may be larger than the variable
352          * block size.
353          */
354         bref = &chain->bref;
355
356         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
357                 bbytes = HAMMER2_MINIOSIZE;
358         pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
359         peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
360         boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
361         KKASSERT(pbase != 0);
362
363         /*
364          * The getblk() optimization can only be used on newly created
365          * elements if the physical block size matches the request.
366          */
367         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
368             chain->bytes == bbytes) {
369                 chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
370                 error = 0;
371         } else if (hammer2_cluster_enable) {
372                 error = cluster_read(hmp->devvp, peof, pbase, bbytes,
373                                      HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
374                                      &chain->bp);
375         } else {
376                 error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
377         }
378
379         if (error) {
380                 kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
381                         (intmax_t)pbase, error);
382                 bqrelse(chain->bp);
383                 chain->bp = NULL;
384                 return (error);
385         }
386
387         /*
388          * Zero the data area if the chain is in the INITIAL-create state.
389          */
390         bdata = (char *)chain->bp->b_data + boff;
391         if (chain->flags & HAMMER2_CHAIN_INITIAL) {
392                 bzero(bdata, chain->bytes);
393                 chain->bp->b_flags |= B_CACHE;
394                 bdirty(chain->bp);
395         }
396
397         /*
398          * Setup the data pointer, either pointing it to an embedded data
399          * structure and copying the data from the buffer, or pointing it
400          * into the buffer.
401          *
402          * The buffer is not retained when copying to an embedded data
403          * structure in order to avoid potential deadlocks or recursions
404          * on the same physical buffer.
405          */
406         switch (bref->type) {
407         case HAMMER2_BREF_TYPE_VOLUME:
408                 /*
409                  * Copy data from bp to embedded buffer
410                  */
411                 panic("hammer2_chain_lock: called on unresolved volume header");
412 #if 0
413                 /* NOT YET */
414                 KKASSERT(pbase == 0);
415                 KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
416                 bcopy(bdata, &hmp->voldata, chain->bytes);
417                 chain->data = (void *)&hmp->voldata;
418                 bqrelse(chain->bp);
419                 chain->bp = NULL;
420 #endif
421                 break;
422         case HAMMER2_BREF_TYPE_INODE:
423                 /*
424                  * Copy data from bp to embedded buffer, do not retain the
425                  * device buffer.
426                  */
427                 bcopy(bdata, &chain->u.ip->ip_data, chain->bytes);
428                 chain->data = (void *)&chain->u.ip->ip_data;
429                 bqrelse(chain->bp);
430                 chain->bp = NULL;
431                 break;
432         case HAMMER2_BREF_TYPE_INDIRECT:
433         case HAMMER2_BREF_TYPE_DATA:
434         default:
435                 /*
436                  * Point data at the device buffer and leave bp intact.
437                  */
438                 chain->data = (void *)bdata;
439                 break;
440         }
441         return (0);
442 }
443
444 /*
445  * Unlock and deref a chain element.
446  *
447  * On the last lock release any non-embedded data (chain->bp) will be
448  * retired.
449  */
450 void
451 hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
452 {
453         long *counterp;
454
455         /*
456          * Undo a recursive lock
457          */
458         if (lockcountnb(&chain->lk) > 1) {
459                 KKASSERT(chain->refs > 1);
460                 atomic_add_int(&chain->refs, -1);
461                 lockmgr(&chain->lk, LK_RELEASE);
462                 return;
463         }
464
465         /*
466          * Shortcut the case if the data is embedded or not resolved.
467          * Do NOT null-out pointers to embedded data (e.g. inode).
468          */
469         if (chain->bp == NULL) {
470                 lockmgr(&chain->lk, LK_RELEASE);
471                 hammer2_chain_drop(hmp, chain);
472                 return;
473         }
474
475         /*
476          * Statistics
477          */
478         if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
479                 ;
480         } else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
481                 switch(chain->bref.type) {
482                 case HAMMER2_BREF_TYPE_DATA:
483                         counterp = &hammer2_ioa_file_write;
484                         break;
485                 case HAMMER2_BREF_TYPE_INODE:
486                         counterp = &hammer2_ioa_meta_write;
487                         break;
488                 case HAMMER2_BREF_TYPE_INDIRECT:
489                         counterp = &hammer2_ioa_indr_write;
490                         break;
491                 default:
492                         counterp = &hammer2_ioa_volu_write;
493                         break;
494                 }
495                 ++*counterp;
496         } else {
497                 switch(chain->bref.type) {
498                 case HAMMER2_BREF_TYPE_DATA:
499                         counterp = &hammer2_iod_file_write;
500                         break;
501                 case HAMMER2_BREF_TYPE_INODE:
502                         counterp = &hammer2_iod_meta_write;
503                         break;
504                 case HAMMER2_BREF_TYPE_INDIRECT:
505                         counterp = &hammer2_iod_indr_write;
506                         break;
507                 default:
508                         counterp = &hammer2_iod_volu_write;
509                         break;
510                 }
511                 ++*counterp;
512         }
513
514         /*
515          * Clean out the bp.
516          *
517          * If a device buffer was used for data be sure to destroy the
518          * buffer when we are done to avoid aliases (XXX what about the
519          * underlying VM pages?).
520          */
521         if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
522                 chain->bp->b_flags |= B_RELBUF;
523
524         chain->data = NULL;
525         if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
526                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
527                 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
528                         atomic_clear_int(&chain->flags,
529                                          HAMMER2_CHAIN_IOFLUSH);
530                         chain->bp->b_flags |= B_RELBUF;
531                         cluster_awrite(chain->bp);
532                 } else {
533                         chain->bp->b_flags |= B_CLUSTEROK;
534                         bdwrite(chain->bp);
535                 }
536         } else {
537                 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
538                         atomic_clear_int(&chain->flags,
539                                          HAMMER2_CHAIN_IOFLUSH);
540                         chain->bp->b_flags |= B_RELBUF;
541                         brelse(chain->bp);
542                 } else {
543                         /* bp might still be dirty */
544                         bqrelse(chain->bp);
545                 }
546         }
547         chain->bp = NULL;
548         lockmgr(&chain->lk, LK_RELEASE);
549         hammer2_chain_drop(hmp, chain);
550 }
551
552 /*
553  * Resize the chain's physical storage allocation.  Chains can be resized
554  * smaller without reallocating the storage.  Resizing larger will reallocate
555  * the storage.
556  *
557  * Must be passed a locked chain.
558  *
559  * If you want the resize code to copy the data to the new block then the
560  * caller should lock the chain RESOLVE_MAYBE or RESOLVE_ALWAYS.
561  *
562  * If the caller already holds a logical buffer containing the data and
563  * intends to bdwrite() that buffer resolve with RESOLVE_NEVER.  The resize
564  * operation will then not copy the data.
565  *
566  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
567  * to avoid instantiating a device buffer that conflicts with the vnode
568  * data buffer.
569  *
570  * XXX flags currently ignored, uses chain->bp to detect data/no-data.
571  */
572 void
573 hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
574                      int nradix, int flags)
575 {
576         hammer2_mount_t *hmp = ip->hmp;
577         struct buf *nbp;
578         hammer2_off_t pbase;
579         size_t obytes;
580         size_t nbytes;
581         size_t bbytes;
582         int boff;
583         char *bdata;
584         int error;
585
586         /*
587          * Only data and indirect blocks can be resized for now
588          */
589         KKASSERT(chain != &hmp->vchain);
590         KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
591                  chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
592
593         /*
594          * Nothing to do if the element is already the proper size
595          */
596         obytes = chain->bytes;
597         nbytes = 1U << nradix;
598         if (obytes == nbytes)
599                 return;
600
601         /*
602          * Set MODIFIED and add a chain ref to prevent destruction.  Both
603          * modified flags share the same ref.
604          *
605          * If the chain is already marked MODIFIED then we can safely
606          * return the previous allocation to the pool without having to
607          * worry about snapshots.
608          */
609         if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
610                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
611                                               HAMMER2_CHAIN_MODIFY_TID);
612                 hammer2_chain_ref(hmp, chain);
613         } else {
614                 hammer2_freemap_free(hmp, chain->bref.data_off,
615                                      chain->bref.type);
616         }
617
618         /*
619          * Relocate the block, even if making it smaller (because different
620          * block sizes may be in different regions).
621          */
622         chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bref.type,
623                                                      nbytes);
624         chain->bytes = nbytes;
625         ip->delta_dcount += (ssize_t)(nbytes - obytes); /* XXX atomic */
626
627         /*
628          * The device buffer may be larger than the allocation size.
629          */
630         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
631                 bbytes = HAMMER2_MINIOSIZE;
632         pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
633         boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
634
635         /*
636          * Only copy the data if resolved, otherwise the caller is
637          * responsible.
638          */
639         if (chain->bp) {
640                 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
641                          chain->bref.type == HAMMER2_BREF_TYPE_DATA);
642                 KKASSERT(chain != &hmp->vchain);        /* safety */
643
644                 /*
645                  * The getblk() optimization can only be used if the
646                  * physical block size matches the request.
647                  */
648                 if (nbytes == bbytes) {
649                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
650                         error = 0;
651                 } else {
652                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
653                         KKASSERT(error == 0);
654                 }
655                 bdata = (char *)nbp->b_data + boff;
656
657                 if (nbytes < obytes) {
658                         bcopy(chain->data, bdata, nbytes);
659                 } else {
660                         bcopy(chain->data, bdata, obytes);
661                         bzero(bdata + obytes, nbytes - obytes);
662                 }
663
664                 /*
665                  * NOTE: The INITIAL state of the chain is left intact.
666                  *
667                  * NOTE: Because of the reallocation we have to set DIRTYBP
668                  *       if INITIAL is not set.
669                  *
670                  * NOTE: We set B_NOCACHE to throw away the previous bp and
671                  *       any VM backing store, even if it was dirty.
672                  *       Otherwise we run the risk of a logical/device
673                  *       conflict on reallocation.
674                  */
675                 chain->bp->b_flags |= B_RELBUF | B_NOCACHE;
676                 brelse(chain->bp);
677                 chain->bp = nbp;
678                 chain->data = (void *)bdata;
679                 if ((chain->flags & HAMMER2_CHAIN_INITIAL) == 0)
680                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
681         }
682         hammer2_chain_parent_setsubmod(hmp, chain);
683 }
684
685 /*
686  * Convert a locked chain that was retrieved read-only to read-write.
687  *
688  * If not already marked modified a new physical block will be allocated
689  * and assigned to the bref.
690  *
691  * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
692  *                   level or the COW operation will not work.
693  *
694  * Data blocks     - The chain is usually locked RESOLVE_NEVER so as not to
695  *                   run the data through the device buffers.
696  */
697 void
698 hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
699 {
700         struct buf *nbp;
701         int error;
702         hammer2_off_t pbase;
703         size_t bbytes;
704         size_t boff;
705         void *bdata;
706
707         /*
708          * Tells flush that modify_tid must be updated, otherwise only
709          * mirror_tid is updated.  This is the default.
710          */
711         if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
712                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFY_TID);
713
714         /*
715          * If the chain is already marked MODIFIED we can just return.
716          */
717         if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
718                 if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
719                     chain->bp == NULL) {
720                         goto skip1;
721                 }
722                 return;
723         }
724
725         /*
726          * Set MODIFIED and add a chain ref to prevent destruction.  Both
727          * modified flags share the same ref.
728          */
729         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
730         hammer2_chain_ref(hmp, chain);
731
732         /*
733          * We must allocate the copy-on-write block.
734          *
735          * If the data is embedded no other action is required.
736          *
737          * If the data is not embedded we acquire and clear the
738          * new block.  If chain->data is not NULL we then do the
739          * copy-on-write.  chain->data will then be repointed to the new
740          * buffer and the old buffer will be released.
741          *
742          * For newly created elements with no prior allocation we go
743          * through the copy-on-write steps except without the copying part.
744          */
745         if (chain != &hmp->vchain) {
746                 if ((hammer2_debug & 0x0001) &&
747                     (chain->bref.data_off & HAMMER2_OFF_MASK)) {
748                         kprintf("Replace %d\n", chain->bytes);
749                 }
750                 chain->bref.data_off =
751                         hammer2_freemap_alloc(hmp, chain->bref.type,
752                                               chain->bytes);
753                 /* XXX failed allocation */
754         }
755
756         /*
757          * If data instantiation is optional and the chain has no current
758          * data association (typical for DATA and newly-created INDIRECT
759          * elements), don't instantiate the buffer now.
760          */
761         if ((flags & HAMMER2_MODIFY_OPTDATA) && chain->bp == NULL)
762                 goto skip2;
763
764 skip1:
765         /*
766          * Setting the DIRTYBP flag will cause the buffer to be dirtied or
767          * written-out on unlock.  This bit is independent of the MODIFIED
768          * bit because the chain may still need meta-data adjustments done
769          * by virtue of MODIFIED for its parent, and the buffer can be
770          * flushed out (possibly multiple times) by the OS before that.
771          *
772          * Clearing the INITIAL flag (for indirect blocks) indicates that
773          * a zero-fill buffer has been instantiated.
774          */
775         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
776         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
777
778         /*
779          * We currently should never instantiate a device buffer for a
780          * data chain.
781          */
782         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
783
784         /*
785          * Execute COW operation
786          */
787         switch(chain->bref.type) {
788         case HAMMER2_BREF_TYPE_VOLUME:
789         case HAMMER2_BREF_TYPE_INODE:
790                 /*
791                  * The data is embedded, no copy-on-write operation is
792                  * needed.
793                  */
794                 KKASSERT(chain->bp == NULL);
795                 break;
796         case HAMMER2_BREF_TYPE_DATA:
797         case HAMMER2_BREF_TYPE_INDIRECT:
798                 /*
799                  * Perform the copy-on-write operation
800                  */
801                 KKASSERT(chain != &hmp->vchain);        /* safety */
802                 /*
803                  * The device buffer may be larger than the allocation size.
804                  */
805                 if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
806                         bbytes = HAMMER2_MINIOSIZE;
807                 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
808                 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
809
810                 /*
811                  * The getblk() optimization can only be used if the
812                  * physical block size matches the request.
813                  */
814                 if (chain->bytes == bbytes) {
815                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
816                         error = 0;
817                 } else {
818                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
819                         KKASSERT(error == 0);
820                 }
821                 bdata = (char *)nbp->b_data + boff;
822
823                 /*
824                  * Copy or zero-fill on write depending on whether
825                  * chain->data exists or not.
826                  */
827                 if (chain->data) {
828                         bcopy(chain->data, bdata, chain->bytes);
829                         KKASSERT(chain->bp != NULL);
830                 } else {
831                         bzero(bdata, chain->bytes);
832                 }
833                 if (chain->bp) {
834                         chain->bp->b_flags |= B_RELBUF;
835                         brelse(chain->bp);
836                 }
837                 chain->bp = nbp;
838                 chain->data = bdata;
839                 break;
840         default:
841                 panic("hammer2_chain_modify: illegal non-embedded type %d",
842                       chain->bref.type);
843                 break;
844
845         }
846 skip2:
847         if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
848                 hammer2_chain_parent_setsubmod(hmp, chain);
849 }
850
851 /*
852  * Mark the volume as having been modified.  This short-cut version
853  * does not have to lock the volume's chain, which allows the ioctl
854  * code to make adjustments to connections without deadlocking.
855  */
856 void
857 hammer2_modify_volume(hammer2_mount_t *hmp)
858 {
859         hammer2_voldata_lock(hmp);
860         atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
861         hammer2_voldata_unlock(hmp);
862 }
863
864 /*
865  * Locate an in-memory chain.  The parent must be locked.  The in-memory
866  * chain is returned or NULL if no in-memory chain is present.
867  *
868  * NOTE: A chain on-media might exist for this index when NULL is returned.
869  */
870 hammer2_chain_t *
871 hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
872 {
873         hammer2_chain_t dummy;
874         hammer2_chain_t *chain;
875
876         dummy.index = index;
877         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
878         return (chain);
879 }
880
881 /*
882  * Return a locked chain structure with all associated data acquired.
883  *
884  * Caller must lock the parent on call, the returned child will be locked.
885  */
886 hammer2_chain_t *
887 hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
888                   int index, int flags)
889 {
890         hammer2_blockref_t *bref;
891         hammer2_chain_t *chain;
892         hammer2_chain_t dummy;
893         int how;
894
895         /*
896          * Figure out how to lock.  MAYBE can be used to optimized
897          * the initial-create state for indirect blocks.
898          */
899         if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
900                 how = HAMMER2_RESOLVE_NEVER;
901         else
902                 how = HAMMER2_RESOLVE_MAYBE;
903
904         /*
905          * First see if we have a (possibly modified) chain element cached
906          * for this (parent, index).  Acquire the data if necessary.
907          *
908          * If chain->data is non-NULL the chain should already be marked
909          * modified.
910          */
911         dummy.index = index;
912         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
913         if (chain) {
914                 if (flags & HAMMER2_LOOKUP_NOLOCK)
915                         hammer2_chain_ref(hmp, chain);
916                 else
917                         hammer2_chain_lock(hmp, chain, how);
918                 return(chain);
919         }
920
921         /*
922          * the get function must always succeed, panic if there's no
923          * data to index.
924          */
925         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
926                 panic("hammer2_chain_get: Missing bref(1)");
927                 /* NOT REACHED */
928         }
929
930         /*
931          * Otherwise lookup the bref and issue I/O (switch on the parent)
932          */
933         switch(parent->bref.type) {
934         case HAMMER2_BREF_TYPE_INODE:
935                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
936                 bref = &parent->data->ipdata.u.blockset.blockref[index];
937                 break;
938         case HAMMER2_BREF_TYPE_INDIRECT:
939                 KKASSERT(parent->data != NULL);
940                 KKASSERT(index >= 0 &&
941                          index < parent->bytes / sizeof(hammer2_blockref_t));
942                 bref = &parent->data->npdata.blockref[index];
943                 break;
944         case HAMMER2_BREF_TYPE_VOLUME:
945                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
946                 bref = &hmp->voldata.sroot_blockset.blockref[index];
947                 break;
948         default:
949                 bref = NULL;
950                 panic("hammer2_chain_get: unrecognized blockref type: %d",
951                       parent->bref.type);
952         }
953         if (bref->type == 0) {
954                 panic("hammer2_chain_get: Missing bref(2)");
955                 /* NOT REACHED */
956         }
957
958         /*
959          * Allocate a chain structure representing the existing media
960          * entry.
961          *
962          * The locking operation we do later will issue I/O to read it.
963          */
964         chain = hammer2_chain_alloc(hmp, bref);
965
966         /*
967          * Link the chain into its parent.  Caller is expected to hold an
968          * exclusive lock on the parent.
969          */
970         chain->parent = parent;
971         chain->index = index;
972         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
973                 panic("hammer2_chain_link: collision");
974         KKASSERT(parent->refs > 0);
975         atomic_add_int(&parent->refs, 1);       /* for splay entry */
976
977         /*
978          * Additional linkage for inodes.  Reuse the parent pointer to
979          * find the parent directory.
980          */
981         if (bref->type == HAMMER2_BREF_TYPE_INODE) {
982                 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
983                         parent = parent->parent;
984                 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
985                         chain->u.ip->pip = parent->u.ip;
986                         chain->u.ip->pmp = parent->u.ip->pmp;
987                         chain->u.ip->depth = parent->u.ip->depth + 1;
988                 }
989         }
990
991         /*
992          * Our new chain structure has already been referenced and locked
993          * but the lock code handles the I/O so call it to resolve the data.
994          * Then release one of our two exclusive locks.
995          *
996          * If NOLOCK is set the release will release the one-and-only lock.
997          */
998         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
999                 hammer2_chain_lock(hmp, chain, how);    /* recusive lock */
1000                 hammer2_chain_drop(hmp, chain);         /* excess ref */
1001         }
1002         lockmgr(&chain->lk, LK_RELEASE);                /* from alloc */
1003
1004         return (chain);
1005 }
1006
1007 /*
1008  * Locate any key between key_beg and key_end inclusive.  (*parentp)
1009  * typically points to an inode but can also point to a related indirect
1010  * block and this function will recurse upwards and find the inode again.
1011  *
1012  * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
1013  *           WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
1014  *           WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
1015  *
1016  * (*parentp) must be exclusively locked and referenced and can be an inode
1017  * or an existing indirect block within the inode.
1018  *
1019  * On return (*parentp) will be modified to point at the deepest parent chain
1020  * element encountered during the search, as a helper for an insertion or
1021  * deletion.   The new (*parentp) will be locked and referenced and the old
1022  * will be unlocked and dereferenced (no change if they are both the same).
1023  *
1024  * The matching chain will be returned exclusively locked and referenced.
1025  *
1026  * NULL is returned if no match was found, but (*parentp) will still
1027  * potentially be adjusted.
1028  *
1029  * This function will also recurse up the chain if the key is not within the
1030  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
1031  * can simply allow (*parentp) to float inside the loop.
1032  */
1033 hammer2_chain_t *
1034 hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
1035                      hammer2_key_t key_beg, hammer2_key_t key_end,
1036                      int flags)
1037 {
1038         hammer2_chain_t *parent;
1039         hammer2_chain_t *chain;
1040         hammer2_chain_t *tmp;
1041         hammer2_blockref_t *base;
1042         hammer2_blockref_t *bref;
1043         hammer2_key_t scan_beg;
1044         hammer2_key_t scan_end;
1045         int count = 0;
1046         int i;
1047
1048         /*
1049          * Recurse (*parentp) upward if necessary until the parent completely
1050          * encloses the key range or we hit the inode.
1051          */
1052         parent = *parentp;
1053         while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1054                 scan_beg = parent->bref.key;
1055                 scan_end = scan_beg +
1056                            ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1057                 if (key_beg >= scan_beg && key_end <= scan_end)
1058                         break;
1059                 hammer2_chain_ref(hmp, parent);         /* ref old parent */
1060                 hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
1061                 parent = parent->parent;
1062                                                         /* lock new parent */
1063                 hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
1064                 hammer2_chain_drop(hmp, *parentp);      /* drop old parent */
1065                 *parentp = parent;                      /* new parent */
1066         }
1067
1068 again:
1069         /*
1070          * Locate the blockref array.  Currently we do a fully associative
1071          * search through the array.
1072          */
1073         switch(parent->bref.type) {
1074         case HAMMER2_BREF_TYPE_INODE:
1075                 /*
1076                  * Special shortcut for embedded data returns the inode
1077                  * itself.  Callers must detect this condition and access
1078                  * the embedded data (the strategy code does this for us).
1079                  *
1080                  * This is only applicable to regular files and softlinks.
1081                  */
1082                 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1083                         if (flags & HAMMER2_LOOKUP_NOLOCK)
1084                                 hammer2_chain_ref(hmp, parent);
1085                         else
1086                                 hammer2_chain_lock(hmp, parent,
1087                                                    HAMMER2_RESOLVE_ALWAYS);
1088                         return (parent);
1089                 }
1090                 base = &parent->data->ipdata.u.blockset.blockref[0];
1091                 count = HAMMER2_SET_COUNT;
1092                 break;
1093         case HAMMER2_BREF_TYPE_INDIRECT:
1094                 /*
1095                  * Optimize indirect blocks in the INITIAL state to avoid
1096                  * I/O.
1097                  */
1098                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1099                         base = NULL;
1100                 } else {
1101                         if (parent->data == NULL)
1102                                 panic("parent->data is NULL");
1103                         base = &parent->data->npdata.blockref[0];
1104                 }
1105                 count = parent->bytes / sizeof(hammer2_blockref_t);
1106                 break;
1107         case HAMMER2_BREF_TYPE_VOLUME:
1108                 base = &hmp->voldata.sroot_blockset.blockref[0];
1109                 count = HAMMER2_SET_COUNT;
1110                 break;
1111         default:
1112                 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
1113                       parent->bref.type);
1114                 base = NULL;    /* safety */
1115                 count = 0;      /* safety */
1116         }
1117
1118         /*
1119          * If the element and key overlap we use the element.
1120          */
1121         bref = NULL;
1122         for (i = 0; i < count; ++i) {
1123                 tmp = hammer2_chain_find(hmp, parent, i);
1124                 if (tmp) {
1125                         bref = &tmp->bref;
1126                         KKASSERT(bref->type != 0);
1127                 } else if (base == NULL || base[i].type == 0) {
1128                         continue;
1129                 } else {
1130                         bref = &base[i];
1131                 }
1132                 scan_beg = bref->key;
1133                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1134                 if (key_beg <= scan_end && key_end >= scan_beg)
1135                         break;
1136         }
1137         if (i == count) {
1138                 if (key_beg == key_end)
1139                         return (NULL);
1140                 return (hammer2_chain_next(hmp, parentp, NULL,
1141                                            key_beg, key_end, flags));
1142         }
1143
1144         /*
1145          * Acquire the new chain element.  If the chain element is an
1146          * indirect block we must search recursively.
1147          */
1148         chain = hammer2_chain_get(hmp, parent, i, flags);
1149         if (chain == NULL)
1150                 return (NULL);
1151
1152         /*
1153          * If the chain element is an indirect block it becomes the new
1154          * parent and we loop on it.
1155          *
1156          * The parent always has to be locked with at least RESOLVE_MAYBE,
1157          * so it might need a fixup if the caller passed incompatible flags.
1158          */
1159         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1160                 hammer2_chain_unlock(hmp, parent);
1161                 *parentp = parent = chain;
1162                 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1163                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
1164                         hammer2_chain_drop(hmp, chain); /* excess ref */
1165                 } else if (flags & HAMMER2_LOOKUP_NODATA) {
1166                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
1167                         hammer2_chain_unlock(hmp, chain);
1168                 }
1169                 goto again;
1170         }
1171
1172         /*
1173          * All done, return chain
1174          */
1175         return (chain);
1176 }
1177
1178 /*
1179  * After having issued a lookup we can iterate all matching keys.
1180  *
1181  * If chain is non-NULL we continue the iteration from just after it's index.
1182  *
1183  * If chain is NULL we assume the parent was exhausted and continue the
1184  * iteration at the next parent.
1185  *
1186  * parent must be locked on entry and remains locked throughout.  chain's
1187  * lock status must match flags.
1188  */
1189 hammer2_chain_t *
1190 hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
1191                    hammer2_chain_t *chain,
1192                    hammer2_key_t key_beg, hammer2_key_t key_end,
1193                    int flags)
1194 {
1195         hammer2_chain_t *parent;
1196         hammer2_chain_t *tmp;
1197         hammer2_blockref_t *base;
1198         hammer2_blockref_t *bref;
1199         hammer2_key_t scan_beg;
1200         hammer2_key_t scan_end;
1201         int i;
1202         int count;
1203
1204         parent = *parentp;
1205
1206 again:
1207         /*
1208          * Calculate the next index and recalculate the parent if necessary.
1209          */
1210         if (chain) {
1211                 /*
1212                  * Continue iteration within current parent.  If not NULL
1213                  * the passed-in chain may or may not be locked, based on
1214                  * the LOOKUP_NOLOCK flag (passed in as returned from lookup
1215                  * or a prior next).
1216                  */
1217                 i = chain->index + 1;
1218                 if (flags & HAMMER2_LOOKUP_NOLOCK)
1219                         hammer2_chain_drop(hmp, chain);
1220                 else
1221                         hammer2_chain_unlock(hmp, chain);
1222
1223                 /*
1224                  * Any scan where the lookup returned degenerate data embedded
1225                  * in the inode has an invalid index and must terminate.
1226                  */
1227                 if (chain == parent)
1228                         return(NULL);
1229                 chain = NULL;
1230         } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT) {
1231                 /*
1232                  * We reached the end of the iteration.
1233                  */
1234                 return (NULL);
1235         } else {
1236                 /*
1237                  * Continue iteration with next parent unless the current
1238                  * parent covers the range.
1239                  */
1240                 hammer2_chain_t *nparent;
1241
1242                 scan_beg = parent->bref.key;
1243                 scan_end = scan_beg +
1244                             ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1245                 if (key_beg >= scan_beg && key_end <= scan_end)
1246                         return (NULL);
1247
1248                 i = parent->index + 1;
1249                 nparent = parent->parent;
1250                 hammer2_chain_ref(hmp, nparent);        /* ref new parent */
1251                 hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
1252                                                         /* lock new parent */
1253                 hammer2_chain_lock(hmp, nparent, HAMMER2_RESOLVE_MAYBE);
1254                 hammer2_chain_drop(hmp, nparent);       /* drop excess ref */
1255                 *parentp = parent = nparent;
1256         }
1257
1258 again2:
1259         /*
1260          * Locate the blockref array.  Currently we do a fully associative
1261          * search through the array.
1262          */
1263         switch(parent->bref.type) {
1264         case HAMMER2_BREF_TYPE_INODE:
1265                 base = &parent->data->ipdata.u.blockset.blockref[0];
1266                 count = HAMMER2_SET_COUNT;
1267                 break;
1268         case HAMMER2_BREF_TYPE_INDIRECT:
1269                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1270                         base = NULL;
1271                 } else {
1272                         KKASSERT(parent->data != NULL);
1273                         base = &parent->data->npdata.blockref[0];
1274                 }
1275                 count = parent->bytes / sizeof(hammer2_blockref_t);
1276                 break;
1277         case HAMMER2_BREF_TYPE_VOLUME:
1278                 base = &hmp->voldata.sroot_blockset.blockref[0];
1279                 count = HAMMER2_SET_COUNT;
1280                 break;
1281         default:
1282                 panic("hammer2_chain_next: unrecognized blockref type: %d",
1283                       parent->bref.type);
1284                 base = NULL;    /* safety */
1285                 count = 0;      /* safety */
1286                 break;
1287         }
1288         KKASSERT(i <= count);
1289
1290         /*
1291          * Look for the key.  If we are unable to find a match and an exact
1292          * match was requested we return NULL.  If a range was requested we
1293          * run hammer2_chain_next() to iterate.
1294          */
1295         bref = NULL;
1296         while (i < count) {
1297                 tmp = hammer2_chain_find(hmp, parent, i);
1298                 if (tmp) {
1299                         bref = &tmp->bref;
1300                 } else if (base == NULL || base[i].type == 0) {
1301                         ++i;
1302                         continue;
1303                 } else {
1304                         bref = &base[i];
1305                 }
1306                 scan_beg = bref->key;
1307                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1308                 if (key_beg <= scan_end && key_end >= scan_beg)
1309                         break;
1310                 ++i;
1311         }
1312
1313         /*
1314          * If we couldn't find a match recurse up a parent to continue the
1315          * search.
1316          */
1317         if (i == count)
1318                 goto again;
1319
1320         /*
1321          * Acquire the new chain element.  If the chain element is an
1322          * indirect block we must search recursively.
1323          */
1324         chain = hammer2_chain_get(hmp, parent, i, flags);
1325         if (chain == NULL)
1326                 return (NULL);
1327
1328         /*
1329          * If the chain element is an indirect block it becomes the new
1330          * parent and we loop on it.
1331          *
1332          * The parent always has to be locked with at least RESOLVE_MAYBE,
1333          * so it might need a fixup if the caller passed incompatible flags.
1334          */
1335         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1336                 hammer2_chain_unlock(hmp, parent);
1337                 *parentp = parent = chain;
1338                 chain = NULL;
1339                 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1340                         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
1341                         hammer2_chain_drop(hmp, parent);        /* excess ref */
1342                 } else if (flags & HAMMER2_LOOKUP_NODATA) {
1343                         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
1344                         hammer2_chain_unlock(hmp, parent);
1345                 }
1346                 i = 0;
1347                 goto again2;
1348         }
1349
1350         /*
1351          * All done, return chain
1352          */
1353         return (chain);
1354 }
1355
1356 /*
1357  * Create and return a new hammer2 system memory structure of the specified
1358  * key, type and size and insert it RELATIVE TO (PARENT).
1359  *
1360  * (parent) is typically either an inode or an indirect block, acquired
1361  * acquired as a side effect of issuing a prior failed lookup.  parent
1362  * must be locked and held.  Do not pass the inode chain to this function
1363  * unless that is the chain returned by the failed lookup.
1364  *
1365  * Non-indirect types will automatically allocate indirect blocks as required
1366  * if the new item does not fit in the current (parent).
1367  *
1368  * Indirect types will move a portion of the existing blockref array in
1369  * (parent) into the new indirect type and then use one of the free slots
1370  * to emplace the new indirect type.
1371  *
1372  * A new locked, referenced chain element is returned of the specified type.
1373  * The element may or may not have a data area associated with it:
1374  *
1375  *      VOLUME          not allowed here
1376  *      INODE           embedded data are will be set-up
1377  *      INDIRECT        not allowed here
1378  *      DATA            no data area will be set-up (caller is expected
1379  *                      to have logical buffers, we don't want to alias
1380  *                      the data onto device buffers!).
1381  */
1382 hammer2_chain_t *
1383 hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1384                      hammer2_chain_t *chain,
1385                      hammer2_key_t key, int keybits, int type, size_t bytes)
1386 {
1387         hammer2_blockref_t dummy;
1388         hammer2_blockref_t *base;
1389         hammer2_chain_t dummy_chain;
1390         int unlock_parent = 0;
1391         int allocated = 0;
1392         int count;
1393         int i;
1394
1395         if (chain == NULL) {
1396                 /*
1397                  * First allocate media space and construct the dummy bref,
1398                  * then allocate the in-memory chain structure.
1399                  */
1400                 bzero(&dummy, sizeof(dummy));
1401                 dummy.type = type;
1402                 dummy.key = key;
1403                 dummy.keybits = keybits;
1404                 dummy.data_off = hammer2_bytes_to_radix(bytes);
1405                 chain = hammer2_chain_alloc(hmp, &dummy);
1406                 allocated = 1;
1407
1408                 /*
1409                  * We do NOT set INITIAL here (yet).  INITIAL is only
1410                  * used for indirect blocks.
1411                  *
1412                  * Recalculate bytes to reflect the actual media block
1413                  * allocation.
1414                  */
1415                 bytes = (hammer2_off_t)1 <<
1416                         (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
1417                 chain->bytes = bytes;
1418
1419                 switch(type) {
1420                 case HAMMER2_BREF_TYPE_VOLUME:
1421                         panic("hammer2_chain_create: called with volume type");
1422                         break;
1423                 case HAMMER2_BREF_TYPE_INODE:
1424                         KKASSERT(bytes == HAMMER2_INODE_BYTES);
1425                         chain->data = (void *)&chain->u.ip->ip_data;
1426                         break;
1427                 case HAMMER2_BREF_TYPE_INDIRECT:
1428                         panic("hammer2_chain_create: cannot be used to"
1429                               "create indirect block");
1430                         break;
1431                 case HAMMER2_BREF_TYPE_DATA:
1432                 default:
1433                         /* leave chain->data NULL */
1434                         KKASSERT(chain->data == NULL);
1435                         break;
1436                 }
1437         } else {
1438                 /*
1439                  * Potentially update the chain's key/keybits.
1440                  */
1441                 chain->bref.key = key;
1442                 chain->bref.keybits = keybits;
1443         }
1444
1445 again:
1446         /*
1447          * Locate a free blockref in the parent's array
1448          */
1449         switch(parent->bref.type) {
1450         case HAMMER2_BREF_TYPE_INODE:
1451                 KKASSERT((parent->u.ip->ip_data.op_flags &
1452                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
1453                 KKASSERT(parent->data != NULL);
1454                 base = &parent->data->ipdata.u.blockset.blockref[0];
1455                 count = HAMMER2_SET_COUNT;
1456                 break;
1457         case HAMMER2_BREF_TYPE_INDIRECT:
1458                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1459                         base = NULL;
1460                 } else {
1461                         KKASSERT(parent->data != NULL);
1462                         base = &parent->data->npdata.blockref[0];
1463                 }
1464                 count = parent->bytes / sizeof(hammer2_blockref_t);
1465                 break;
1466         case HAMMER2_BREF_TYPE_VOLUME:
1467                 KKASSERT(parent->data != NULL);
1468                 base = &hmp->voldata.sroot_blockset.blockref[0];
1469                 count = HAMMER2_SET_COUNT;
1470                 break;
1471         default:
1472                 panic("hammer2_chain_create: unrecognized blockref type: %d",
1473                       parent->bref.type);
1474                 count = 0;
1475                 break;
1476         }
1477
1478         /*
1479          * Scan for an unallocated bref, also skipping any slots occupied
1480          * by in-memory chain elements that may not yet have been updated
1481          * in the parent's bref array.
1482          */
1483         bzero(&dummy_chain, sizeof(dummy_chain));
1484         for (i = 0; i < count; ++i) {
1485                 if (base == NULL) {
1486                         dummy_chain.index = i;
1487                         if (SPLAY_FIND(hammer2_chain_splay,
1488                                        &parent->shead, &dummy_chain) == NULL) {
1489                                 break;
1490                         }
1491                 } else if (base[i].type == 0) {
1492                         dummy_chain.index = i;
1493                         if (SPLAY_FIND(hammer2_chain_splay,
1494                                        &parent->shead, &dummy_chain) == NULL) {
1495                                 break;
1496                         }
1497                 }
1498         }
1499
1500         /*
1501          * If no free blockref could be found we must create an indirect
1502          * block and move a number of blockrefs into it.  With the parent
1503          * locked we can safely lock each child in order to move it without
1504          * causing a deadlock.
1505          *
1506          * This may return the new indirect block or the old parent depending
1507          * on where the key falls.
1508          */
1509         if (i == count) {
1510                 hammer2_chain_t *nparent;
1511
1512                 nparent = hammer2_chain_create_indirect(hmp, parent,
1513                                                         key, keybits);
1514                 if (nparent == NULL) {
1515                         if (allocated)
1516                                 hammer2_chain_free(hmp, chain);
1517                         chain = NULL;
1518                         goto done;
1519                 }
1520                 if (parent != nparent) {
1521                         if (unlock_parent)
1522                                 hammer2_chain_unlock(hmp, parent);
1523                         parent = nparent;
1524                         unlock_parent = 1;
1525                 }
1526                 goto again;
1527         }
1528
1529         /*
1530          * Link the chain into its parent.  Later on we will have to set
1531          * the MOVED bit in situations where we don't mark the new chain
1532          * as being modified.
1533          */
1534         if (chain->parent != NULL)
1535                 panic("hammer2: hammer2_chain_create: chain already connected");
1536         KKASSERT(chain->parent == NULL);
1537         chain->parent = parent;
1538         chain->index = i;
1539         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
1540                 panic("hammer2_chain_link: collision");
1541         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
1542         KKASSERT(parent->refs > 0);
1543         atomic_add_int(&parent->refs, 1);
1544
1545         /*
1546          * Additional linkage for inodes.  Reuse the parent pointer to
1547          * find the parent directory.
1548          *
1549          * Cumulative adjustments are inherited on [re]attach.
1550          */
1551         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1552                 hammer2_chain_t *scan = parent;
1553                 hammer2_inode_t *ip = chain->u.ip;
1554
1555                 while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1556                         scan = scan->parent;
1557                 if (scan->bref.type == HAMMER2_BREF_TYPE_INODE) {
1558                         ip->pip = scan->u.ip;
1559                         ip->pmp = scan->u.ip->pmp;
1560                         ip->depth = scan->u.ip->depth + 1;
1561                         ip->delta_icount += ip->ip_data.inode_count;
1562                         ip->delta_dcount += ip->ip_data.data_count;
1563                         ++ip->pip->delta_icount;
1564                 }
1565         }
1566
1567         /*
1568          * (allocated) indicates that this is a newly-created chain element
1569          * rather than a renamed chain element.  In this situation we want
1570          * to place the chain element in the MODIFIED state.
1571          *
1572          * The data area will be set up as follows:
1573          *
1574          *      VOLUME          not allowed here.
1575          *
1576          *      INODE           embedded data are will be set-up.
1577          *
1578          *      INDIRECT        not allowed here.
1579          *
1580          *      DATA            no data area will be set-up (caller is expected
1581          *                      to have logical buffers, we don't want to alias
1582          *                      the data onto device buffers!).
1583          */
1584         if (allocated) {
1585                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1586                         hammer2_chain_modify(hmp, chain,
1587                                              HAMMER2_MODIFY_OPTDATA);
1588                 } else if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1589                         /* not supported in this function */
1590                         panic("hammer2_chain_create: bad type");
1591                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1592                         hammer2_chain_modify(hmp, chain,
1593                                              HAMMER2_MODIFY_OPTDATA);
1594                 } else {
1595                         hammer2_chain_modify(hmp, chain, 0);
1596                 }
1597         } else {
1598                 /*
1599                  * When reconnecting inodes we have to call setsubmod()
1600                  * to ensure that its state propagates up the newly
1601                  * connected parent.
1602                  *
1603                  * Make sure MOVED is set but do not update bref_flush.  If
1604                  * the chain is undergoing modification bref_flush will be
1605                  * updated when it gets flushed.  If it is not then the
1606                  * bref may not have been flushed yet and we do not want to
1607                  * set MODIFIED here as this could result in unnecessary
1608                  * reallocations.
1609                  */
1610                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1611                         hammer2_chain_ref(hmp, chain);
1612                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1613                 }
1614                 hammer2_chain_parent_setsubmod(hmp, chain);
1615         }
1616
1617 done:
1618         if (unlock_parent)
1619                 hammer2_chain_unlock(hmp, parent);
1620         return (chain);
1621 }
1622
1623 /*
1624  * Create an indirect block that covers one or more of the elements in the
1625  * current parent.  Either returns the existing parent with no locking or
1626  * ref changes or returns the new indirect block locked and referenced
1627  * and leaving the original parent lock/ref intact as well.
1628  *
1629  * The returned chain depends on where the specified key falls.
1630  *
1631  * The key/keybits for the indirect mode only needs to follow three rules:
1632  *
1633  * (1) That all elements underneath it fit within its key space and
1634  *
1635  * (2) That all elements outside it are outside its key space.
1636  *
1637  * (3) When creating the new indirect block any elements in the current
1638  *     parent that fit within the new indirect block's keyspace must be
1639  *     moved into the new indirect block.
1640  *
1641  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
1642  *     keyspace the the current parent, but lookup/iteration rules will
1643  *     ensure (and must ensure) that rule (2) for all parents leading up
1644  *     to the nearest inode or the root volume header is adhered to.  This
1645  *     is accomplished by always recursing through matching keyspaces in
1646  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
1647  *
1648  * The current implementation calculates the current worst-case keyspace by
1649  * iterating the current parent and then divides it into two halves, choosing
1650  * whichever half has the most elements (not necessarily the half containing
1651  * the requested key).
1652  *
1653  * We can also opt to use the half with the least number of elements.  This
1654  * causes lower-numbered keys (aka logical file offsets) to recurse through
1655  * fewer indirect blocks and higher-numbered keys to recurse through more.
1656  * This also has the risk of not moving enough elements to the new indirect
1657  * block and being forced to create several indirect blocks before the element
1658  * can be inserted.
1659  */
1660 static
1661 hammer2_chain_t *
1662 hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1663                               hammer2_key_t create_key, int create_bits)
1664 {
1665         hammer2_blockref_t *base;
1666         hammer2_blockref_t *bref;
1667         hammer2_chain_t *chain;
1668         hammer2_chain_t *ichain;
1669         hammer2_chain_t dummy;
1670         hammer2_key_t key = create_key;
1671         int keybits = create_bits;
1672         int locount = 0;
1673         int hicount = 0;
1674         int count;
1675         int nbytes;
1676         int i;
1677
1678         /*
1679          * Calculate the base blockref pointer or NULL if the chain
1680          * is known to be empty.  We need to calculate the array count
1681          * for SPLAY lookups either way.
1682          */
1683         hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
1684         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1685                 base = NULL;
1686
1687                 switch(parent->bref.type) {
1688                 case HAMMER2_BREF_TYPE_INODE:
1689                         count = HAMMER2_SET_COUNT;
1690                         break;
1691                 case HAMMER2_BREF_TYPE_INDIRECT:
1692                         count = parent->bytes / sizeof(hammer2_blockref_t);
1693                         break;
1694                 case HAMMER2_BREF_TYPE_VOLUME:
1695                         count = HAMMER2_SET_COUNT;
1696                         break;
1697                 default:
1698                         panic("hammer2_chain_create_indirect: "
1699                               "unrecognized blockref type: %d",
1700                               parent->bref.type);
1701                         count = 0;
1702                         break;
1703                 }
1704         } else {
1705                 switch(parent->bref.type) {
1706                 case HAMMER2_BREF_TYPE_INODE:
1707                         base = &parent->data->ipdata.u.blockset.blockref[0];
1708                         count = HAMMER2_SET_COUNT;
1709                         break;
1710                 case HAMMER2_BREF_TYPE_INDIRECT:
1711                         base = &parent->data->npdata.blockref[0];
1712                         count = parent->bytes / sizeof(hammer2_blockref_t);
1713                         break;
1714                 case HAMMER2_BREF_TYPE_VOLUME:
1715                         base = &hmp->voldata.sroot_blockset.blockref[0];
1716                         count = HAMMER2_SET_COUNT;
1717                         break;
1718                 default:
1719                         panic("hammer2_chain_create_indirect: "
1720                               "unrecognized blockref type: %d",
1721                               parent->bref.type);
1722                         count = 0;
1723                         break;
1724                 }
1725         }
1726
1727         /*
1728          * Scan for an unallocated bref, also skipping any slots occupied
1729          * by in-memory chain elements which may not yet have been updated
1730          * in the parent's bref array.
1731          */
1732         bzero(&dummy, sizeof(dummy));
1733         for (i = 0; i < count; ++i) {
1734                 int nkeybits;
1735
1736                 dummy.index = i;
1737                 chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
1738                 if (chain) {
1739                         bref = &chain->bref;
1740                 } else if (base && base[i].type) {
1741                         bref = &base[i];
1742                 } else {
1743                         continue;
1744                 }
1745
1746                 /*
1747                  * Expand our calculated key range (key, keybits) to fit
1748                  * the scanned key.  nkeybits represents the full range
1749                  * that we will later cut in half (two halves @ nkeybits - 1).
1750                  */
1751                 nkeybits = keybits;
1752                 if (nkeybits < bref->keybits)
1753                         nkeybits = bref->keybits;
1754                 while (nkeybits < 64 &&
1755                        (~(((hammer2_key_t)1 << nkeybits) - 1) &
1756                         (key ^ bref->key)) != 0) {
1757                         ++nkeybits;
1758                 }
1759
1760                 /*
1761                  * If the new key range is larger we have to determine
1762                  * which side of the new key range the existing keys fall
1763                  * under by checking the high bit, then collapsing the
1764                  * locount into the hicount or vise-versa.
1765                  */
1766                 if (keybits != nkeybits) {
1767                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
1768                                 hicount += locount;
1769                                 locount = 0;
1770                         } else {
1771                                 locount += hicount;
1772                                 hicount = 0;
1773                         }
1774                         keybits = nkeybits;
1775                 }
1776
1777                 /*
1778                  * The newly scanned key will be in the lower half or the
1779                  * higher half of the (new) key range.
1780                  */
1781                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
1782                         ++hicount;
1783                 else
1784                         ++locount;
1785         }
1786
1787         /*
1788          * Adjust keybits to represent half of the full range calculated
1789          * above (radix 63 max)
1790          */
1791         --keybits;
1792
1793         /*
1794          * Select whichever half contains the most elements.  Theoretically
1795          * we can select either side as long as it contains at least one
1796          * element (in order to ensure that a free slot is present to hold
1797          * the indirect block).
1798          */
1799         key &= ~(((hammer2_key_t)1 << keybits) - 1);
1800         if (hammer2_indirect_optimize) {
1801                 /*
1802                  * Insert node for least number of keys, this will arrange
1803                  * the first few blocks of a large file or the first few
1804                  * inodes in a directory with fewer indirect blocks when
1805                  * created linearly.
1806                  */
1807                 if (hicount < locount && hicount != 0)
1808                         key |= (hammer2_key_t)1 << keybits;
1809                 else
1810                         key &= ~(hammer2_key_t)1 << keybits;
1811         } else {
1812                 /*
1813                  * Insert node for most number of keys, best for heavily
1814                  * fragmented files.
1815                  */
1816                 if (hicount > locount)
1817                         key |= (hammer2_key_t)1 << keybits;
1818                 else
1819                         key &= ~(hammer2_key_t)1 << keybits;
1820         }
1821
1822         /*
1823          * How big should our new indirect block be?  It has to be at least
1824          * as large as its parent.
1825          */
1826         if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
1827                 nbytes = HAMMER2_IND_BYTES_MIN;
1828         else
1829                 nbytes = HAMMER2_IND_BYTES_MAX;
1830         if (nbytes < count * sizeof(hammer2_blockref_t))
1831                 nbytes = count * sizeof(hammer2_blockref_t);
1832
1833         /*
1834          * Ok, create our new indirect block
1835          */
1836         dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
1837         dummy.bref.key = key;
1838         dummy.bref.keybits = keybits;
1839         dummy.bref.data_off = hammer2_bytes_to_radix(nbytes);
1840         ichain = hammer2_chain_alloc(hmp, &dummy.bref);
1841         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
1842
1843         /*
1844          * Iterate the original parent and move the matching brefs into
1845          * the new indirect block.
1846          */
1847         for (i = 0; i < count; ++i) {
1848                 /*
1849                  * For keying purposes access the bref from the media or
1850                  * from our in-memory cache.  In cases where the in-memory
1851                  * cache overrides the media the keyrefs will be the same
1852                  * anyway so we can avoid checking the cache when the media
1853                  * has a key.
1854                  */
1855                 dummy.index = i;
1856                 chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
1857                 if (chain) {
1858                         bref = &chain->bref;
1859                 } else if (base && base[i].type) {
1860                         bref = &base[i];
1861                 } else {
1862                         if (ichain->index < 0)
1863                                 ichain->index = i;
1864                         continue;
1865                 }
1866
1867                 /*
1868                  * Skip keys not in the chosen half (low or high), only bit
1869                  * (keybits - 1) needs to be compared but for safety we
1870                  * will compare all msb bits plus that bit again.
1871                  */
1872                 if ((~(((hammer2_key_t)1 << keybits) - 1) &
1873                     (key ^ bref->key)) != 0) {
1874                         continue;
1875                 }
1876
1877                 /*
1878                  * This element is being moved from the parent, its slot
1879                  * is available for our new indirect block.
1880                  */
1881                 if (ichain->index < 0)
1882                         ichain->index = i;
1883
1884                 /*
1885                  * Load the new indirect block by acquiring or allocating
1886                  * the related chain entries, then simply move them to the
1887                  * new parent (ichain).
1888                  *
1889                  * When adjusting the parent/child relationship we must
1890                  * set the MOVED bit but we do NOT update bref_flush
1891                  * because otherwise we might synchronize a bref that has
1892                  * not yet been flushed.  We depend on chain's bref_flush
1893                  * either being correct or the chain being in a MODIFIED
1894                  * state.
1895                  *
1896                  * We do not want to set MODIFIED here as this would result
1897                  * in unnecessary reallocations.
1898                  *
1899                  * We must still set SUBMODIFIED in the parent but we do
1900                  * that after the loop.
1901                  *
1902                  * XXX we really need a lock here but we don't need the
1903                  *     data.  NODATA feature needed.
1904                  */
1905                 chain = hammer2_chain_get(hmp, parent, i,
1906                                           HAMMER2_LOOKUP_NODATA);
1907                 SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
1908                 if (SPLAY_INSERT(hammer2_chain_splay, &ichain->shead, chain))
1909                         panic("hammer2_chain_create_indirect: collision");
1910                 chain->parent = ichain;
1911                 if (base)
1912                         bzero(&base[i], sizeof(base[i]));
1913                 atomic_add_int(&parent->refs, -1);
1914                 atomic_add_int(&ichain->refs, 1);
1915                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1916                         hammer2_chain_ref(hmp, chain);
1917                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1918                 }
1919                 hammer2_chain_unlock(hmp, chain);
1920                 KKASSERT(parent->refs > 0);
1921                 chain = NULL;
1922         }
1923
1924         /*
1925          * Insert the new indirect block into the parent now that we've
1926          * cleared out some entries in the parent.  We calculated a good
1927          * insertion index in the loop above (ichain->index).
1928          *
1929          * We don't have to set MOVED here because we mark ichain modified
1930          * down below (so the normal modified -> flush -> set-moved sequence
1931          * applies).
1932          */
1933         KKASSERT(ichain->index >= 0);
1934         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, ichain))
1935                 panic("hammer2_chain_create_indirect: ichain insertion");
1936         ichain->parent = parent;
1937         atomic_add_int(&parent->refs, 1);
1938
1939         /*
1940          * Mark the new indirect block modified after insertion, which
1941          * will propagate up through parent all the way to the root and
1942          * also allocate the physical block in ichain for our caller,
1943          * and assign ichain->data to a pre-zero'd space (because there
1944          * is not prior data to copy into it).
1945          *
1946          * We have to set SUBMODIFIED in ichain's flags manually so the
1947          * flusher knows it has to recurse through it to get to all of
1948          * our moved blocks, then call setsubmod() to set the bit
1949          * recursively.
1950          */
1951         hammer2_chain_modify(hmp, ichain, HAMMER2_MODIFY_OPTDATA);
1952         hammer2_chain_parent_setsubmod(hmp, ichain);
1953         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
1954
1955         /*
1956          * Figure out what to return.
1957          */
1958         if (create_bits > keybits) {
1959                 /*
1960                  * Key being created is way outside the key range,
1961                  * return the original parent.
1962                  */
1963                 hammer2_chain_unlock(hmp, ichain);
1964         } else if (~(((hammer2_key_t)1 << keybits) - 1) &
1965                    (create_key ^ key)) {
1966                 /*
1967                  * Key being created is outside the key range,
1968                  * return the original parent.
1969                  */
1970                 hammer2_chain_unlock(hmp, ichain);
1971         } else {
1972                 /*
1973                  * Otherwise its in the range, return the new parent.
1974                  * (leave both the new and old parent locked).
1975                  */
1976                 parent = ichain;
1977         }
1978
1979         return(parent);
1980 }
1981
1982 /*
1983  * Physically delete the specified chain element.  Note that inodes with
1984  * open descriptors should not be deleted (as with other filesystems) until
1985  * the last open descriptor is closed.
1986  *
1987  * This routine will remove the chain element from its parent and potentially
1988  * also recurse upward and delete indirect blocks which become empty as a
1989  * side effect.
1990  *
1991  * The caller must pass a pointer to the chain's parent, also locked and
1992  * referenced.  (*parentp) will be modified in a manner similar to a lookup
1993  * or iteration when indirect blocks are also deleted as a side effect.
1994  *
1995  * XXX This currently does not adhere to the MOVED flag protocol in that
1996  *     the removal is immediately indicated in the parent's blockref[]
1997  *     array.
1998  */
1999 void
2000 hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
2001                      hammer2_chain_t *chain, int retain)
2002 {
2003         hammer2_blockref_t *base;
2004         hammer2_inode_t *ip;
2005         int count;
2006
2007         if (chain->parent != parent)
2008                 panic("hammer2_chain_delete: parent mismatch");
2009
2010         /*
2011          * Mark the parent modified so our base[] pointer remains valid
2012          * while we move entries.  For the optimized indirect block
2013          * case mark the parent moved instead.
2014          *
2015          * Calculate the blockref reference in the parent
2016          */
2017         switch(parent->bref.type) {
2018         case HAMMER2_BREF_TYPE_INODE:
2019                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2020                 base = &parent->data->ipdata.u.blockset.blockref[0];
2021                 count = HAMMER2_SET_COUNT;
2022                 break;
2023         case HAMMER2_BREF_TYPE_INDIRECT:
2024                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA |
2025                                                   HAMMER2_MODIFY_NO_MODIFY_TID);
2026                 if (parent->flags & HAMMER2_CHAIN_INITIAL)
2027                         base = NULL;
2028                 else
2029                         base = &parent->data->npdata.blockref[0];
2030                 count = parent->bytes / sizeof(hammer2_blockref_t);
2031                 break;
2032         case HAMMER2_BREF_TYPE_VOLUME:
2033                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2034                 base = &hmp->voldata.sroot_blockset.blockref[0];
2035                 count = HAMMER2_SET_COUNT;
2036                 break;
2037         default:
2038                 panic("hammer2_chain_delete: unrecognized blockref type: %d",
2039                       parent->bref.type);
2040                 count = 0;
2041                 break;
2042         }
2043
2044         /*
2045          * Disconnect the bref in the parent, remove the chain, and
2046          * disconnect in-memory fields from the parent.
2047          */
2048         KKASSERT(chain->index >= 0 && chain->index < count);
2049         if (base)
2050                 bzero(&base[chain->index], sizeof(*base));
2051
2052         SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
2053         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
2054         atomic_add_int(&parent->refs, -1);      /* for splay entry */
2055         chain->index = -1;
2056         chain->parent = NULL;
2057
2058         /*
2059          * Cumulative adjustments must be propagated to the parent inode
2060          * when deleting and synchronized to ip.  A future reattachment
2061          * (e.g. during a rename) expects only to use ip_data.*_count.
2062          *
2063          * Clear the pointer to the parent inode.
2064          */
2065         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2066                 ip = chain->u.ip;
2067                 if (ip->pip) {
2068                         ip->pip->delta_icount += ip->delta_icount;
2069                         ip->pip->delta_dcount += ip->delta_dcount;
2070                         ip->ip_data.inode_count += ip->delta_icount;
2071                         ip->ip_data.data_count += ip->delta_dcount;
2072                         ip->delta_icount = 0;
2073                         ip->delta_dcount = 0;
2074                         --ip->pip->delta_icount;
2075                         ip->pip = NULL;
2076                 }
2077                 chain->u.ip->depth = 0;
2078         }
2079
2080         /*
2081          * If retain is 0 the deletion is permanent.  Because the chain is
2082          * no longer connected to the topology a flush will have no
2083          * visibility into it.  We must dispose of the references related
2084          * to the MODIFIED and MOVED flags, otherwise the ref count will
2085          * never transition to 0.
2086          *
2087          * If retain is non-zero the deleted element is likely an inode
2088          * which the vnops frontend will mark DESTROYED and flush.  In that
2089          * situation we must retain the flags for any open file descriptors
2090          * on the (removed) inode.  The final close will destroy the
2091          * disconnected chain.
2092          */
2093         if (retain == 0) {
2094                 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
2095                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
2096                         hammer2_chain_drop(hmp, chain);
2097                 }
2098                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2099                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2100                         hammer2_chain_drop(hmp, chain);
2101                 }
2102         }
2103
2104         /*
2105          * The chain is still likely referenced, possibly even by a vnode
2106          * (if an inode), so defer further action until the chain gets
2107          * dropped.
2108          */
2109 }
2110
2111 /*
2112  * Recursively flush the specified chain.  The chain is locked and
2113  * referenced by the caller and will remain so on return.  The chain
2114  * will remain referenced throughout but can temporarily lose its
2115  * lock during the recursion to avoid unnecessarily stalling user
2116  * processes.
2117  *
2118  *
2119  */
2120 TAILQ_HEAD(flush_deferral_list, hammer2_chain);
2121
2122 struct hammer2_flush_info {
2123         struct flush_deferral_list flush_list;
2124         int             depth;
2125         hammer2_tid_t   modify_tid;
2126 };
2127
2128 typedef struct hammer2_flush_info hammer2_flush_info_t;
2129
2130 static void
2131 hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
2132                           hammer2_flush_info_t *info)
2133 {
2134         hammer2_blockref_t *bref;
2135         hammer2_off_t pbase;
2136         size_t bbytes;
2137         size_t boff;
2138         char *bdata;
2139         struct buf *bp;
2140         int error;
2141         int wasmodified;
2142
2143         /*
2144          * If we hit the stack recursion depth limit defer the operation.
2145          * The controller of the info structure will execute the deferral
2146          * list and then retry.
2147          *
2148          * This is only applicable if SUBMODIFIED is set.  After a reflush
2149          * SUBMODIFIED will probably be cleared and we want to drop through
2150          * to finish processing the current element so our direct parent
2151          * can process the results.
2152          */
2153         if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT &&
2154             (chain->flags & HAMMER2_CHAIN_SUBMODIFIED)) {
2155                 if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
2156                         hammer2_chain_ref(hmp, chain);
2157                         TAILQ_INSERT_TAIL(&info->flush_list,
2158                                           chain, flush_node);
2159                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
2160                 }
2161                 return;
2162         }
2163
2164         if (hammer2_debug & 0x0008)
2165                 kprintf("%*.*sCHAIN type=%d@%08jx %p/%d %04x {\n",
2166                         info->depth, info->depth, "",
2167                         chain->bref.type, chain->bref.data_off,
2168                         chain, chain->refs, chain->flags);
2169
2170         /*
2171          * If SUBMODIFIED is set we recurse the flush and adjust the
2172          * blockrefs accordingly.
2173          *
2174          * NOTE: Looping on SUBMODIFIED can prevent a flush from ever
2175          *       finishing in the face of filesystem activity.
2176          */
2177         if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
2178                 hammer2_chain_t *child;
2179                 hammer2_chain_t *next;
2180                 hammer2_blockref_t *base;
2181                 int count;
2182
2183                 /*
2184                  * Clear SUBMODIFIED to catch races.  Note that if any
2185                  * child has to be flushed SUBMODIFIED will wind up being
2186                  * set again (for next time), but this does not stop us from
2187                  * synchronizing block updates which occurred.
2188                  *
2189                  * We don't want to set our chain to MODIFIED gratuitously.
2190                  */
2191                 /* XXX SUBMODIFIED not interlocked, can race */
2192                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2193
2194                 /*
2195                  * Flush the children and update the blockrefs in the chain.
2196                  * Be careful of ripouts during the loop.
2197                  */
2198                 next = SPLAY_MIN(hammer2_chain_splay, &chain->shead);
2199                 if (next)
2200                         hammer2_chain_ref(hmp, next);
2201                 while ((child = next) != NULL) {
2202                         next = SPLAY_NEXT(hammer2_chain_splay,
2203                                           &chain->shead, child);
2204                         if (next)
2205                                 hammer2_chain_ref(hmp, next);
2206                         /*
2207                          * We only recurse if SUBMODIFIED (internal node)
2208                          * or MODIFIED (internal node or leaf) is set.
2209                          * However, we must still track whether any MOVED
2210                          * entries are present to determine if the chain's
2211                          * blockref's need updating or not.
2212                          */
2213                         if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2214                                              HAMMER2_CHAIN_MODIFIED |
2215                                             HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2216                                 hammer2_chain_drop(hmp, child);
2217                                 continue;
2218                         }
2219                         hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_MAYBE);
2220                         hammer2_chain_drop(hmp, child);
2221                         if (child->parent != chain ||
2222                             (child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2223                                              HAMMER2_CHAIN_MODIFIED |
2224                                             HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2225                                 hammer2_chain_unlock(hmp, child);
2226                                 continue;
2227                         }
2228
2229                         /*
2230                          * Propagate the DESTROYED flag if found set, then
2231                          * recurse the flush.
2232                          */
2233                         if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
2234                             (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
2235                                 atomic_set_int(&child->flags,
2236                                                HAMMER2_CHAIN_DESTROYED |
2237                                                HAMMER2_CHAIN_SUBMODIFIED);
2238                         }
2239                         ++info->depth;
2240                         hammer2_chain_flush_pass1(hmp, child, info);
2241                         --info->depth;
2242                         hammer2_chain_unlock(hmp, child);
2243                 }
2244
2245                 /*
2246                  * Now synchronize any block updates.
2247                  */
2248                 next = SPLAY_MIN(hammer2_chain_splay, &chain->shead);
2249                 if (next)
2250                         hammer2_chain_ref(hmp, next);
2251                 while ((child = next) != NULL) {
2252                         next = SPLAY_NEXT(hammer2_chain_splay,
2253                                           &chain->shead, child);
2254                         if (next)
2255                                 hammer2_chain_ref(hmp, next);
2256                         if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
2257                                 hammer2_chain_drop(hmp, child);
2258                                 continue;
2259                         }
2260                         hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_NEVER);
2261                         hammer2_chain_drop(hmp, child);
2262                         if (child->parent != chain ||
2263                             (child->flags & HAMMER2_CHAIN_MOVED) == 0) {
2264                                 hammer2_chain_unlock(hmp, child);
2265                                 continue;
2266                         }
2267
2268                         hammer2_chain_modify(hmp, chain,
2269                                              HAMMER2_MODIFY_NO_MODIFY_TID);
2270
2271                         switch(chain->bref.type) {
2272                         case HAMMER2_BREF_TYPE_INODE:
2273                                 KKASSERT((chain->data->ipdata.op_flags &
2274                                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
2275                                 base = &chain->data->ipdata.u.blockset.
2276                                         blockref[0];
2277                                 count = HAMMER2_SET_COUNT;
2278                                 break;
2279                         case HAMMER2_BREF_TYPE_INDIRECT:
2280                                 base = &chain->data->npdata.blockref[0];
2281                                 count = chain->bytes /
2282                                         sizeof(hammer2_blockref_t);
2283                                 break;
2284                         case HAMMER2_BREF_TYPE_VOLUME:
2285                                 base = &hmp->voldata.sroot_blockset.blockref[0];
2286                                 count = HAMMER2_SET_COUNT;
2287                                 break;
2288                         default:
2289                                 base = NULL;
2290                                 panic("hammer2_chain_get: "
2291                                       "unrecognized blockref type: %d",
2292                                       chain->bref.type);
2293                         }
2294
2295                         KKASSERT(child->index >= 0);
2296                         base[child->index] = child->bref_flush;
2297
2298                         if (chain->bref.mirror_tid <
2299                             child->bref_flush.mirror_tid) {
2300                                 chain->bref.mirror_tid =
2301                                         child->bref_flush.mirror_tid;
2302                         }
2303
2304                         if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME &&
2305                             hmp->voldata.mirror_tid <
2306                             child->bref_flush.mirror_tid) {
2307                                 hmp->voldata.mirror_tid =
2308                                         child->bref_flush.mirror_tid;
2309                         }
2310                         atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
2311                         hammer2_chain_drop(hmp, child); /* MOVED flag */
2312                         hammer2_chain_unlock(hmp, child);
2313                 }
2314         }
2315
2316         /*
2317          * If destroying the object we unconditonally clear the MODIFIED
2318          * and MOVED bits, and we destroy the buffer without writing it
2319          * out.
2320          *
2321          * We don't bother updating the hash/crc or the chain bref.
2322          *
2323          * NOTE: The destroy'd object's bref has already been updated.
2324          *       so we can clear MOVED without propagating mirror_tid
2325          *       or modify_tid upward.
2326          *
2327          * XXX allocations for unflushed data can be returned to the
2328          *     free pool.
2329          */
2330         if (chain->flags & HAMMER2_CHAIN_DESTROYED) {
2331                 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
2332                         if (chain->bp) {
2333                                 chain->bp->b_flags |= B_INVAL|B_RELBUF;
2334                         }
2335                         atomic_clear_int(&chain->flags,
2336                                          HAMMER2_CHAIN_MODIFIED |
2337                                          HAMMER2_CHAIN_MODIFY_TID);
2338                         hammer2_chain_drop(hmp, chain);
2339                 }
2340                 if (chain->flags & HAMMER2_CHAIN_MODIFIED_AUX) {
2341                         atomic_clear_int(&chain->flags,
2342                                          HAMMER2_CHAIN_MODIFIED_AUX);
2343                 }
2344                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2345                         atomic_clear_int(&chain->flags,
2346                                          HAMMER2_CHAIN_MOVED);
2347                         hammer2_chain_drop(hmp, chain);
2348                 }
2349                 return;
2350         }
2351
2352         /*
2353          * Flush this chain entry only if it is marked modified.
2354          */
2355         if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
2356                              HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2357                 goto done;
2358         }
2359
2360         /*
2361          * Synchronize cumulative data and inode count adjustments to
2362          * the inode and propagate the deltas upward to the parent.
2363          */
2364         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2365                 hammer2_inode_t *ip;
2366
2367                 ip = chain->u.ip;
2368                 ip->ip_data.inode_count += ip->delta_icount;
2369                 ip->ip_data.data_count += ip->delta_dcount;
2370                 if (ip->pip) {
2371                         ip->pip->delta_icount += ip->delta_icount;
2372                         ip->pip->delta_dcount += ip->delta_dcount;
2373                 }
2374                 ip->delta_icount = 0;
2375                 ip->delta_dcount = 0;
2376         }
2377
2378         /*
2379          * Flush if MODIFIED or MODIFIED_AUX is set.  MODIFIED_AUX is only
2380          * used by the volume header (&hmp->vchain).
2381          */
2382         if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
2383                              HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2384                 goto done;
2385         }
2386         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED_AUX);
2387
2388         /*
2389          * Clear MODIFIED and set HAMMER2_CHAIN_MOVED.  The caller
2390          * will re-test the MOVED bit.  We must also update the mirror_tid
2391          * and modify_tid fields as appropriate.
2392          *
2393          * bits own a single chain ref and the MOVED bit owns its own
2394          * chain ref.
2395          */
2396         chain->bref.mirror_tid = info->modify_tid;
2397         if (chain->flags & HAMMER2_CHAIN_MODIFY_TID)
2398                 chain->bref.modify_tid = info->modify_tid;
2399         wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0;
2400         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
2401                                         HAMMER2_CHAIN_MODIFY_TID);
2402
2403         if (chain->flags & HAMMER2_CHAIN_MOVED) {
2404                 /*
2405                  * Drop the ref from the MODIFIED bit we cleared.
2406                  */
2407                 if (wasmodified)
2408                         hammer2_chain_drop(hmp, chain);
2409         } else {
2410                 /*
2411                  * If we were MODIFIED we inherit the ref from clearing
2412                  * that bit, otherwise we need another ref.
2413                  */
2414                 if (wasmodified == 0)
2415                         hammer2_chain_ref(hmp, chain);
2416                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2417         }
2418         chain->bref_flush = chain->bref;
2419
2420         /*
2421          * If this is part of a recursive flush we can go ahead and write
2422          * out the buffer cache buffer and pass a new bref back up the chain.
2423          *
2424          * This will never be a volume header.
2425          */
2426         switch(chain->bref.type) {
2427         case HAMMER2_BREF_TYPE_VOLUME:
2428                 /*
2429                  * The volume header is flushed manually by the syncer, not
2430                  * here.
2431                  */
2432                 break;
2433         case HAMMER2_BREF_TYPE_DATA:
2434                 /*
2435                  * Data elements have already been flushed via the logical
2436                  * file buffer cache.  Their hash was set in the bref by
2437                  * the vop_write code.
2438                  *
2439                  * Make sure the buffer(s) have been flushed out here.
2440                  */
2441                 bbytes = chain->bytes;
2442                 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
2443                 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
2444
2445                 bp = getblk(hmp->devvp, pbase, bbytes, GETBLK_NOWAIT, 0);
2446                 if (bp) {
2447                         if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
2448                             (B_CACHE | B_DIRTY)) {
2449                                 kprintf("x");
2450                                 cluster_awrite(bp);
2451                         } else {
2452                                 bp->b_flags |= B_RELBUF;
2453                                 brelse(bp);
2454                         }
2455                 }
2456                 break;
2457         case HAMMER2_BREF_TYPE_INDIRECT:
2458                 /*
2459                  * Indirect blocks may be in an INITIAL state.  Use the
2460                  * chain_lock() call to ensure that the buffer has been
2461                  * instantiated (even though it is already locked the buffer
2462                  * might not have been instantiated).
2463                  *
2464                  * Only write the buffer out if it is dirty, it is possible
2465                  * the operating system had already written out the buffer.
2466                  */
2467                 hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_ALWAYS);
2468                 KKASSERT(chain->bp != NULL);
2469
2470                 bp = chain->bp;
2471                 if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) ||
2472                     (bp->b_flags & B_DIRTY)) {
2473                         bawrite(chain->bp);
2474                 } else {
2475                         brelse(chain->bp);
2476                 }
2477                 chain->bp = NULL;
2478                 chain->data = NULL;
2479                 hammer2_chain_unlock(hmp, chain);
2480                 break;
2481         default:
2482                 /*
2483                  * Embedded elements have to be flushed out.
2484                  */
2485                 KKASSERT(chain->data != NULL);
2486                 KKASSERT(chain->bp == NULL);
2487                 bref = &chain->bref;
2488
2489                 KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
2490
2491                 if (chain->bp == NULL) {
2492                         /*
2493                          * The data is embedded, we have to acquire the
2494                          * buffer cache buffer and copy the data into it.
2495                          */
2496                         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
2497                                 bbytes = HAMMER2_MINIOSIZE;
2498                         pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
2499                         boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
2500
2501                         /*
2502                          * The getblk() optimization can only be used if the
2503                          * physical block size matches the request.
2504                          */
2505                         if (chain->bytes == bbytes) {
2506                                 bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
2507                                 error = 0;
2508                         } else {
2509                                 error = bread(hmp->devvp, pbase, bbytes, &bp);
2510                                 KKASSERT(error == 0);
2511                         }
2512                         bdata = (char *)bp->b_data + boff;
2513
2514                         /*
2515                          * Copy the data to the buffer, mark the buffer
2516                          * dirty, and convert the chain to unmodified.
2517                          *
2518                          * We expect we might have to make adjustments to
2519                          * non-data delayed-write buffers when doing an
2520                          * actual flush so use bawrite() instead of
2521                          * cluster_awrite() here.
2522                          */
2523                         bcopy(chain->data, bdata, chain->bytes);
2524                         bp->b_flags |= B_CLUSTEROK;
2525                         bawrite(bp);
2526                         bp = NULL;
2527                         chain->bref.check.iscsi32.value =
2528                                 hammer2_icrc32(chain->data, chain->bytes);
2529                         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
2530                                 ++hammer2_iod_meta_write;
2531                         else
2532                                 ++hammer2_iod_indr_write;
2533                 } else {
2534                         chain->bref.check.iscsi32.value =
2535                                 hammer2_icrc32(chain->data, chain->bytes);
2536                 }
2537         }
2538
2539         /*
2540          * Adjustments to the bref.  The caller will use this to adjust
2541          * our chain's pointer to this chain element.
2542          */
2543         bref = &chain->bref;
2544
2545         switch(bref->type) {
2546         case HAMMER2_BREF_TYPE_VOLUME:
2547                 KKASSERT(chain->data != NULL);
2548                 KKASSERT(chain->bp == NULL);
2549
2550                 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
2551                         hammer2_icrc32(
2552                                 (char *)&hmp->voldata +
2553                                  HAMMER2_VOLUME_ICRC1_OFF,
2554                                 HAMMER2_VOLUME_ICRC1_SIZE);
2555                 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
2556                         hammer2_icrc32(
2557                                 (char *)&hmp->voldata +
2558                                  HAMMER2_VOLUME_ICRC0_OFF,
2559                                 HAMMER2_VOLUME_ICRC0_SIZE);
2560                 hmp->voldata.icrc_volheader =
2561                         hammer2_icrc32(
2562                                 (char *)&hmp->voldata +
2563                                  HAMMER2_VOLUME_ICRCVH_OFF,
2564                                 HAMMER2_VOLUME_ICRCVH_SIZE);
2565                 break;
2566         default:
2567                 break;
2568
2569         }
2570 done:
2571         if (hammer2_debug & 0x0008) {
2572                 kprintf("%*.*s} %p/%d %04x ",
2573                         info->depth, info->depth, "",
2574                         chain, chain->refs, chain->flags);
2575         }
2576 }
2577
2578 #if 0
2579 /*
2580  * PASS2 - not yet implemented (should be called only with the root chain?)
2581  */
2582 static void
2583 hammer2_chain_flush_pass2(hammer2_mount_t *hmp, hammer2_chain_t *chain)
2584 {
2585 }
2586 #endif
2587
2588 /*
2589  * Stand-alone flush.  If the chain is unable to completely flush we have
2590  * to be sure that SUBMODIFIED propagates up the parent chain.  We must not
2591  * clear the MOVED bit after flushing in this situation or our desynchronized
2592  * bref will not properly update in the parent.
2593  *
2594  * This routine can be called from several places but the most important
2595  * is from the hammer2_vop_reclaim() function.  We want to try to completely
2596  * clean out the inode structure to prevent disconnected inodes from
2597  * building up and blowing out the kmalloc pool.
2598  *
2599  * If modify_tid is 0 (usual case), a new modify_tid is allocated and
2600  * applied to the flush.  The depth-limit handling code is the only
2601  * code which passes a non-zero modify_tid to hammer2_chain_flush().
2602  */
2603 void
2604 hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
2605                     hammer2_tid_t modify_tid)
2606 {
2607         hammer2_chain_t *parent;
2608         hammer2_chain_t *scan;
2609         hammer2_blockref_t *base;
2610         hammer2_flush_info_t info;
2611         int count;
2612         int reflush;
2613
2614         /*
2615          * Execute the recursive flush and handle deferrals.
2616          *
2617          * Chains can be ridiculously long (thousands deep), so to
2618          * avoid blowing out the kernel stack the recursive flush has a
2619          * depth limit.  Elements at the limit are placed on a list
2620          * for re-execution after the stack has been popped.
2621          */
2622         bzero(&info, sizeof(info));
2623         TAILQ_INIT(&info.flush_list);
2624
2625         if (modify_tid == 0) {
2626                 hammer2_voldata_lock(hmp);
2627                 info.modify_tid = hmp->voldata.alloc_tid++;
2628                 atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
2629                 hammer2_voldata_unlock(hmp);
2630         } else {
2631                 info.modify_tid = modify_tid;
2632         }
2633         reflush = 1;
2634
2635         while (reflush) {
2636                 /*
2637                  * Primary recursion
2638                  */
2639                 hammer2_chain_flush_pass1(hmp, chain, &info);
2640                 reflush = 0;
2641
2642                 while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
2643                         /*
2644                          * Secondary recursion.  Note that a reference is
2645                          * retained from the element's presence on the
2646                          * deferral list.
2647                          */
2648                         KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
2649                         TAILQ_REMOVE(&info.flush_list, scan, flush_node);
2650                         atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);
2651
2652                         /*
2653                          * Now that we've popped back up we can do a secondary
2654                          * recursion on the deferred elements.
2655                          */
2656                         if (hammer2_debug & 0x0040)
2657                                 kprintf("defered flush %p\n", scan);
2658                         hammer2_chain_lock(hmp, scan, HAMMER2_RESOLVE_MAYBE);
2659                         hammer2_chain_flush(hmp, scan, info.modify_tid);
2660                         hammer2_chain_unlock(hmp, scan);
2661
2662                         /*
2663                          * Only flag a reflush if SUBMODIFIED is no longer
2664                          * set.  If SUBMODIFIED is set the element will just
2665                          * wind up on our flush_list again.
2666                          */
2667                         if ((scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2668                                             HAMMER2_CHAIN_MODIFIED |
2669                                             HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2670                                 reflush = 1;
2671                         }
2672                         hammer2_chain_drop(hmp, scan);
2673                 }
2674                 if ((hammer2_debug & 0x0040) && reflush)
2675                         kprintf("reflush %p\n", chain);
2676         }
2677
2678         /*
2679          * The SUBMODIFIED bit must propagate upward if the chain could not
2680          * be completely flushed.
2681          */
2682         if (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2683                             HAMMER2_CHAIN_MODIFIED |
2684                             HAMMER2_CHAIN_MODIFIED_AUX |
2685                             HAMMER2_CHAIN_MOVED)) {
2686                 hammer2_chain_parent_setsubmod(hmp, chain);
2687         }
2688
2689         /*
2690          * If the only thing left is a simple bref update try to
2691          * pro-actively update the parent, otherwise return early.
2692          */
2693         parent = chain->parent;
2694         if (parent == NULL) {
2695                 return;
2696         }
2697         if (chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
2698             (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2699                              HAMMER2_CHAIN_MODIFIED |
2700                              HAMMER2_CHAIN_MODIFIED_AUX |
2701                              HAMMER2_CHAIN_MOVED)) != HAMMER2_CHAIN_MOVED) {
2702                 return;
2703         }
2704
2705         /*
2706          * We are locking backwards so allow the lock to fail
2707          */
2708         if (lockmgr(&parent->lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
2709                 return;
2710         }
2711
2712         /*
2713          * We are updating brefs but we have to call chain_modify()
2714          * because our caller is not being run from a recursive flush.
2715          *
2716          * This will also chain up the parent list and set the SUBMODIFIED
2717          * flag.
2718          *
2719          * We do not want to set HAMMER2_CHAIN_MODIFY_TID here because the
2720          * modification is only related to updating a bref in the parent.
2721          *
2722          * When updating the blockset embedded in the volume header we must
2723          * also update voldata.mirror_tid.
2724          */
2725         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
2726         hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2727
2728         switch(parent->bref.type) {
2729         case HAMMER2_BREF_TYPE_INODE:
2730                 base = &parent->data->ipdata.u.blockset.
2731                         blockref[0];
2732                 count = HAMMER2_SET_COUNT;
2733                 break;
2734         case HAMMER2_BREF_TYPE_INDIRECT:
2735                 base = &parent->data->npdata.blockref[0];
2736                 count = parent->bytes /
2737                         sizeof(hammer2_blockref_t);
2738                 break;
2739         case HAMMER2_BREF_TYPE_VOLUME:
2740                 base = &hmp->voldata.sroot_blockset.blockref[0];
2741                 count = HAMMER2_SET_COUNT;
2742                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2743                         if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
2744                                 hmp->voldata.mirror_tid =
2745                                         chain->bref.mirror_tid;
2746                         }
2747                 }
2748                 break;
2749         default:
2750                 base = NULL;
2751                 panic("hammer2_chain_flush: "
2752                       "unrecognized blockref type: %d",
2753                       parent->bref.type);
2754         }
2755
2756         /*
2757          * Update the blockref in the parent.  We do not have to set
2758          * MOVED in the parent because the parent has been marked modified,
2759          * so the flush sequence will pick up the bref change.
2760          *
2761          * We do have to propagate mirror_tid upward.
2762          */
2763         KKASSERT(chain->index >= 0 &&
2764                  chain->index < count);
2765         KKASSERT(chain->parent == parent);
2766         if (chain->flags & HAMMER2_CHAIN_MOVED) {
2767                 base[chain->index] = chain->bref_flush;
2768                 if (parent->bref.mirror_tid < chain->bref_flush.mirror_tid)
2769                         parent->bref.mirror_tid = chain->bref_flush.mirror_tid;
2770                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2771                 hammer2_chain_drop(hmp, chain);
2772         } else if (bcmp(&base[chain->index], &chain->bref_flush,
2773                    sizeof(chain->bref)) != 0) {
2774                 panic("hammer2: unflagged bref update(2)");
2775         }
2776
2777         lockmgr(&parent->lk, LK_RELEASE);       /* release manual lockmgr op */
2778         hammer2_chain_unlock(hmp, parent);
2779 }