91972e701f28de68f7b2607a0fd3ae612bb38e61
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem handles direct and indirect block searches, recursions,
37  * creation, and deletion.  Chains of blockrefs are tracked and modifications
38  * are flag for propagation... eventually all the way back to the volume
39  * header.
40  */
41
42 #include <sys/cdefs.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/types.h>
46 #include <sys/lock.h>
47 #include <sys/uuid.h>
48
49 #include "hammer2.h"
50
51 static int hammer2_indirect_optimize;   /* XXX SYSCTL */
52
53 static hammer2_chain_t *hammer2_chain_create_indirect(
54                         hammer2_mount_t *hmp, hammer2_chain_t *parent,
55                         hammer2_key_t key, int keybits);
56
57 /*
58  * Splay tree
59  */
60 SPLAY_GENERATE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
61
62 int
63 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
64 {
65         return(chain2->index - chain1->index);
66 }
67
68 /*
69  * Recursively mark the parent chain elements so flushes can find
70  * modified elements.  Stop when we hit a chain already flagged
71  * SUBMODIFIED, but ignore the SUBMODIFIED bit that might be set
72  * in chain itself.
73  *
74  * SUBMODIFIED is not set on the chain passed in.
75  *
76  * XXX rename of parent can create a SMP race
77  */
78 static void
79 hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
80 {
81         hammer2_chain_t *parent;
82
83         parent = chain->parent;
84         while (parent && (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
85                 atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);
86                 parent = parent->parent;
87         }
88 }
89
90 /*
91  * Allocate a new disconnected chain element representing the specified
92  * bref.  The chain element is locked exclusively and refs is set to 1.
93  *
94  * This essentially allocates a system memory structure representing one
95  * of the media structure types, including inodes.
96  */
97 hammer2_chain_t *
98 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
99 {
100         hammer2_chain_t *chain;
101         hammer2_inode_t *ip;
102         hammer2_indblock_t *np;
103         hammer2_data_t *dp;
104         u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
105
106         /*
107          * Construct the appropriate system structure.
108          */
109         switch(bref->type) {
110         case HAMMER2_BREF_TYPE_INODE:
111                 ip = kmalloc(sizeof(*ip), hmp->minode, M_WAITOK | M_ZERO);
112                 chain = &ip->chain;
113                 chain->u.ip = ip;
114                 lockinit(&chain->lk, "inode", 0, LK_CANRECURSE);
115                 ip->hmp = hmp;
116                 break;
117         case HAMMER2_BREF_TYPE_INDIRECT:
118                 np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
119                 chain = &np->chain;
120                 chain->u.np = np;
121                 lockinit(&chain->lk, "iblk", 0, LK_CANRECURSE);
122                 break;
123         case HAMMER2_BREF_TYPE_DATA:
124                 dp = kmalloc(sizeof(*dp), hmp->mchain, M_WAITOK | M_ZERO);
125                 chain = &dp->chain;
126                 chain->u.dp = dp;
127                 lockinit(&chain->lk, "dblk", 0, LK_CANRECURSE);
128                 break;
129         case HAMMER2_BREF_TYPE_VOLUME:
130                 chain = NULL;
131                 panic("hammer2_chain_alloc volume type illegal for op");
132         default:
133                 chain = NULL;
134                 panic("hammer2_chain_alloc: unrecognized blockref type: %d",
135                       bref->type);
136         }
137
138         /*
139          * Only set bref_flush if the bref has a real media offset, otherwise
140          * the caller has to wait for the chain to be modified/block-allocated
141          * before a blockref can be synchronized with its (future) parent.
142          */
143         chain->bref = *bref;
144         if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
145                 chain->bref_flush = *bref;
146         chain->index = -1;              /* not yet assigned */
147         chain->refs = 1;
148         chain->bytes = bytes;
149         lockmgr(&chain->lk, LK_EXCLUSIVE);
150
151         return (chain);
152 }
153
154 /*
155  * Free a disconnected chain element
156  */
157 void
158 hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
159 {
160         void *mem;
161
162         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
163             chain->bref.type == HAMMER2_BREF_TYPE_VOLUME) {
164                 chain->data = NULL;
165         }
166
167         KKASSERT(chain->bp == NULL);
168         KKASSERT(chain->data == NULL);
169         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
170                  chain->u.ip->vp == NULL);
171
172         if ((mem = chain->u.mem) != NULL) {
173                 chain->u.mem = NULL;
174                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
175                         kfree(mem, hmp->minode);
176                 else
177                         kfree(mem, hmp->mchain);
178         }
179 }
180
181 /*
182  * Add a reference to a chain element (for shared access).  The chain
183  * element must already have at least 1 ref controlled by the caller.
184  */
185 void
186 hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
187 {
188         KKASSERT(chain->refs > 0);
189         atomic_add_int(&chain->refs, 1);
190 }
191
192 /*
193  * Drop the callers reference to the chain element.  If the ref count
194  * reaches zero the chain element and its related structure (typically an
195  * inode or indirect block) will be freed and the parent will be
196  * recursively dropped.
197  *
198  * MOVED and MODIFIED elements hold additional references so it should not
199  * be possible for the count on a modified element to drop to 0.
200  *
201  * The chain element must NOT be locked by the caller.
202  *
203  * The parent might or might not be locked by the caller but if so it
204  * will also be referenced so we shouldn't recurse upward.
205  */
206 void
207 hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
208 {
209         hammer2_chain_t *parent;
210         u_int refs;
211
212         while (chain) {
213                 refs = chain->refs;
214                 cpu_ccfence();
215                 KKASSERT(refs > 0);
216                 if (refs == 1) {
217                         KKASSERT(chain != &hmp->vchain);
218                         parent = chain->parent;
219                         if (parent)
220                                 lockmgr(&parent->lk, LK_EXCLUSIVE);
221                         if (atomic_cmpset_int(&chain->refs, 1, 0)) {
222                                 /*
223                                  * Succeeded, recurse and drop parent.
224                                  * These chain elements should be synchronized
225                                  * so no delta data or inode count updates
226                                  * should be needed.
227                                  */
228                                 KKASSERT((chain->flags &
229                                           (HAMMER2_CHAIN_MOVED |
230                                            HAMMER2_CHAIN_MODIFIED)) == 0);
231                                 if (!(chain->flags & HAMMER2_CHAIN_DELETED)) {
232                                         SPLAY_REMOVE(hammer2_chain_splay,
233                                                      &parent->shead, chain);
234                                         atomic_set_int(&chain->flags,
235                                                        HAMMER2_CHAIN_DELETED);
236                                         /* parent refs dropped via recursion */
237                                 }
238                                 chain->parent = NULL;
239                                 if (parent)
240                                         lockmgr(&parent->lk, LK_RELEASE);
241                                 hammer2_chain_free(hmp, chain);
242                                 chain = parent;
243                                 /* recurse on parent */
244                         } else {
245                                 if (parent)
246                                         lockmgr(&parent->lk, LK_RELEASE);
247                                 /* retry the same chain */
248                         }
249                 } else {
250                         if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
251                                 /*
252                                  * Succeeded, count did not reach zero so
253                                  * cut out of the loop.
254                                  */
255                                 break;
256                         }
257                         /* retry the same chain */
258                 }
259         }
260 }
261
262 /*
263  * Ref and lock a chain element, acquiring its data with I/O if necessary,
264  * and specify how you would like the data to be resolved.
265  *
266  * Returns 0 on success or an error code if the data could not be acquired.
267  * The chain element is locked either way.
268  *
269  * The lock is allowed to recurse, multiple locking ops will aggregate
270  * the requested resolve types.  Once data is assigned it will not be
271  * removed until the last unlock.
272  *
273  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
274  *                         (typically used to avoid device/logical buffer
275  *                          aliasing for data)
276  *
277  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
278  *                         the INITIAL-create state (indirect blocks only).
279  *
280  *                         Do not resolve data elements for DATA chains.
281  *                         (typically used to avoid device/logical buffer
282  *                          aliasing for data)
283  *
284  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
285  *
286  *
287  * NOTE: Embedded elements (volume header, inodes) are always resolved
288  *       regardless.
289  *
290  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
291  *       element will instantiate and zero its buffer, and flush it on
292  *       release.
293  *
294  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
295  *       so as not to instantiate a device buffer, which could alias against
296  *       a logical file buffer.  However, if ALWAYS is specified the
297  *       device buffer will be instantiated anyway.
298  */
299 int
300 hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
301 {
302         hammer2_blockref_t *bref;
303         hammer2_off_t pbase;
304         hammer2_off_t peof;
305         size_t boff;
306         size_t bbytes;
307         int error;
308         char *bdata;
309
310         /*
311          * Lock the element.  Under certain conditions this might end up
312          * being a recursive lock.
313          */
314         KKASSERT(chain->refs > 0);
315         atomic_add_int(&chain->refs, 1);
316         lockmgr(&chain->lk, LK_EXCLUSIVE);
317
318         /*
319          * If we already have a valid data pointer no further action is
320          * necessary.
321          */
322         if (chain->data)
323                 return (0);
324
325         /*
326          * Do we have to resolve the data?
327          */
328         switch(how) {
329         case HAMMER2_RESOLVE_NEVER:
330                 return(0);
331         case HAMMER2_RESOLVE_MAYBE:
332                 if (chain->flags & HAMMER2_CHAIN_INITIAL)
333                         return(0);
334                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
335                         return(0);
336                 /* fall through */
337         case HAMMER2_RESOLVE_ALWAYS:
338                 break;
339         }
340
341         /*
342          * We must resolve to a device buffer, either by issuing I/O or
343          * by creating a zero-fill element.  We do not mark the buffer
344          * dirty when creating a zero-fill element (the hammer2_chain_modify()
345          * API must still be used to do that).
346          *
347          * The device buffer is variable-sized in powers of 2 down
348          * to HAMMER2_MINALLOCSIZE (typically 1K).  A 64K physical storage
349          * chunk always contains buffers of the same size. (XXX)
350          *
351          * The minimum physical IO size may be larger than the variable
352          * block size.
353          */
354         bref = &chain->bref;
355
356         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
357                 bbytes = HAMMER2_MINIOSIZE;
358         pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
359         peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
360         boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
361         KKASSERT(pbase != 0);
362
363         /*
364          * The getblk() optimization can only be used on newly created
365          * elements if the physical block size matches the request.
366          */
367         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
368             chain->bytes == bbytes) {
369                 chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
370                 error = 0;
371         } else if (hammer2_cluster_enable) {
372                 error = cluster_read(hmp->devvp, peof, pbase, bbytes,
373                                      HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
374                                      &chain->bp);
375         } else {
376                 error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
377         }
378
379         if (error) {
380                 kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
381                         (intmax_t)pbase, error);
382                 bqrelse(chain->bp);
383                 chain->bp = NULL;
384                 return (error);
385         }
386
387         /*
388          * Zero the data area if the chain is in the INITIAL-create state
389          */
390         bdata = (char *)chain->bp->b_data + boff;
391         if (chain->flags & HAMMER2_CHAIN_INITIAL)
392                 bzero(bdata, chain->bytes);
393
394         /*
395          * Setup the data pointer, either pointing it to an embedded data
396          * structure and copying the data from the buffer, or pointing it
397          * into the buffer.
398          *
399          * The buffer is not retained when copying to an embedded data
400          * structure in order to avoid potential deadlocks or recursions
401          * on the same physical buffer.
402          */
403         switch (bref->type) {
404         case HAMMER2_BREF_TYPE_VOLUME:
405                 /*
406                  * Copy data from bp to embedded buffer
407                  */
408                 panic("hammer2_chain_lock: called on unresolved volume header");
409 #if 0
410                 /* NOT YET */
411                 KKASSERT(pbase == 0);
412                 KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
413                 bcopy(bdata, &hmp->voldata, chain->bytes);
414                 chain->data = (void *)&hmp->voldata;
415                 bqrelse(chain->bp);
416                 chain->bp = NULL;
417 #endif
418                 break;
419         case HAMMER2_BREF_TYPE_INODE:
420                 /*
421                  * Copy data from bp to embedded buffer, do not retain the
422                  * device buffer.
423                  */
424                 bcopy(bdata, &chain->u.ip->ip_data, chain->bytes);
425                 chain->data = (void *)&chain->u.ip->ip_data;
426                 bqrelse(chain->bp);
427                 chain->bp = NULL;
428                 break;
429         case HAMMER2_BREF_TYPE_INDIRECT:
430         case HAMMER2_BREF_TYPE_DATA:
431         default:
432                 /*
433                  * Point data at the device buffer and leave bp intact.
434                  */
435                 chain->data = (void *)bdata;
436                 break;
437         }
438         return (0);
439 }
440
441 /*
442  * Unlock and deref a chain element.
443  *
444  * On the last lock release any non-embedded data (chain->bp) will be
445  * retired.
446  */
447 void
448 hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
449 {
450         long *counterp;
451
452         /*
453          * Undo a recursive lock
454          */
455         if (lockcountnb(&chain->lk) > 1) {
456                 KKASSERT(chain->refs > 1);
457                 atomic_add_int(&chain->refs, -1);
458                 lockmgr(&chain->lk, LK_RELEASE);
459                 return;
460         }
461
462         /*
463          * Shortcut the case if the data is embedded or not resolved.
464          * Do NOT null-out pointers to embedded data (e.g. inode).
465          */
466         if (chain->bp == NULL) {
467                 lockmgr(&chain->lk, LK_RELEASE);
468                 hammer2_chain_drop(hmp, chain);
469                 return;
470         }
471
472         /*
473          * Statistics
474          */
475         if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
476                 ;
477         } else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
478                 switch(chain->bref.type) {
479                 case HAMMER2_BREF_TYPE_DATA:
480                         counterp = &hammer2_ioa_file_write;
481                         break;
482                 case HAMMER2_BREF_TYPE_INODE:
483                         counterp = &hammer2_ioa_meta_write;
484                         break;
485                 case HAMMER2_BREF_TYPE_INDIRECT:
486                         counterp = &hammer2_ioa_indr_write;
487                         break;
488                 default:
489                         counterp = &hammer2_ioa_volu_write;
490                         break;
491                 }
492                 ++*counterp;
493         } else {
494                 switch(chain->bref.type) {
495                 case HAMMER2_BREF_TYPE_DATA:
496                         counterp = &hammer2_iod_file_write;
497                         break;
498                 case HAMMER2_BREF_TYPE_INODE:
499                         counterp = &hammer2_iod_meta_write;
500                         break;
501                 case HAMMER2_BREF_TYPE_INDIRECT:
502                         counterp = &hammer2_iod_indr_write;
503                         break;
504                 default:
505                         counterp = &hammer2_iod_volu_write;
506                         break;
507                 }
508                 ++*counterp;
509         }
510
511         /*
512          * Clean out the bp.
513          *
514          * If a device buffer was used for data be sure to destroy the
515          * buffer when we are done to avoid aliases (XXX what about the
516          * underlying VM pages?).
517          */
518         if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
519                 chain->bp->b_flags |= B_RELBUF;
520
521         chain->data = NULL;
522         if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
523                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
524                 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
525                         atomic_clear_int(&chain->flags,
526                                          HAMMER2_CHAIN_IOFLUSH);
527                         chain->bp->b_flags |= B_RELBUF;
528                         cluster_awrite(chain->bp);
529                 } else {
530                         chain->bp->b_flags |= B_CLUSTEROK;
531                         bdwrite(chain->bp);
532                 }
533         } else {
534                 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
535                         atomic_clear_int(&chain->flags,
536                                          HAMMER2_CHAIN_IOFLUSH);
537                         chain->bp->b_flags |= B_RELBUF;
538                         brelse(chain->bp);
539                 } else {
540                         /* bp might still be dirty */
541                         bqrelse(chain->bp);
542                 }
543         }
544         chain->bp = NULL;
545         lockmgr(&chain->lk, LK_RELEASE);
546         hammer2_chain_drop(hmp, chain);
547 }
548
549 /*
550  * Resize the chain's physical storage allocation.  Chains can be resized
551  * smaller without reallocating the storage.  Resizing larger will reallocate
552  * the storage.
553  *
554  * Must be passed a locked chain.
555  *
556  * If you want the resize code to copy the data to the new block then the
557  * caller should lock the chain RESOLVE_MAYBE or RESOLVE_ALWAYS.
558  *
559  * If the caller already holds a logical buffer containing the data and
560  * intends to bdwrite() that buffer resolve with RESOLVE_NEVER.  The resize
561  * operation will then not copy the data.
562  *
563  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
564  * to avoid instantiating a device buffer that conflicts with the vnode
565  * data buffer.
566  *
567  * XXX flags currently ignored, uses chain->bp to detect data/no-data.
568  */
569 void
570 hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
571                      int nradix, int flags)
572 {
573         hammer2_mount_t *hmp = ip->hmp;
574         struct buf *nbp;
575         hammer2_off_t pbase;
576         size_t obytes;
577         size_t nbytes;
578         size_t bbytes;
579         int boff;
580         char *bdata;
581         int error;
582
583         /*
584          * Only data and indirect blocks can be resized for now
585          */
586         KKASSERT(chain != &hmp->vchain);
587         KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
588                  chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
589
590         /*
591          * Nothing to do if the element is already the proper size
592          */
593         obytes = chain->bytes;
594         nbytes = 1U << nradix;
595         if (obytes == nbytes)
596                 return;
597
598         /*
599          * Set MODIFIED and add a chain ref to prevent destruction.  Both
600          * modified flags share the same ref.
601          *
602          * If the chain is already marked MODIFIED then we can safely
603          * return the previous allocation to the pool without having to
604          * worry about snapshots.
605          */
606         if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
607                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
608                                               HAMMER2_CHAIN_MODIFY_TID);
609                 hammer2_chain_ref(hmp, chain);
610         } else {
611                 hammer2_freemap_free(hmp, chain->bref.data_off,
612                                      chain->bref.type);
613         }
614
615         /*
616          * Relocate the block, even if making it smaller (because different
617          * block sizes may be in different regions).
618          */
619         chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bref.type,
620                                                      nbytes);
621         chain->bytes = nbytes;
622         ip->delta_dcount += (ssize_t)(nbytes - obytes); /* XXX atomic */
623
624         /*
625          * The device buffer may be larger than the allocation size.
626          */
627         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
628                 bbytes = HAMMER2_MINIOSIZE;
629         pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
630         boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
631
632         /*
633          * Only copy the data if resolved, otherwise the caller is
634          * responsible.
635          */
636         if (chain->bp) {
637                 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
638                          chain->bref.type == HAMMER2_BREF_TYPE_DATA);
639                 KKASSERT(chain != &hmp->vchain);        /* safety */
640
641                 /*
642                  * The getblk() optimization can only be used if the
643                  * physical block size matches the request.
644                  */
645                 if (nbytes == bbytes) {
646                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
647                         error = 0;
648                 } else {
649                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
650                         KKASSERT(error == 0);
651                 }
652                 bdata = (char *)nbp->b_data + boff;
653
654                 if (nbytes < obytes) {
655                         bcopy(chain->data, bdata, nbytes);
656                 } else {
657                         bcopy(chain->data, bdata, obytes);
658                         bzero(bdata + obytes, nbytes - obytes);
659                 }
660
661                 /*
662                  * NOTE: The INITIAL state of the chain is left intact.
663                  *
664                  * NOTE: Because of the reallocation we have to set DIRTYBP
665                  *       if INITIAL is not set.
666                  *
667                  * NOTE: We set B_NOCACHE to throw away the previous bp and
668                  *       any VM backing store, even if it was dirty.
669                  *       Otherwise we run the risk of a logical/device
670                  *       conflict on reallocation.
671                  */
672                 chain->bp->b_flags |= B_RELBUF | B_NOCACHE;
673                 brelse(chain->bp);
674                 chain->bp = nbp;
675                 chain->data = (void *)bdata;
676                 if ((chain->flags & HAMMER2_CHAIN_INITIAL) == 0)
677                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
678         }
679         hammer2_chain_parent_setsubmod(hmp, chain);
680 }
681
682 /*
683  * Convert a locked chain that was retrieved read-only to read-write.
684  *
685  * If not already marked modified a new physical block will be allocated
686  * and assigned to the bref.
687  *
688  * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
689  *                   level or the COW operation will not work.
690  *
691  * Data blocks     - The chain is usually locked RESOLVE_NEVER so as not to
692  *                   run the data through the device buffers.
693  */
694 void
695 hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
696 {
697         struct buf *nbp;
698         int error;
699         hammer2_off_t pbase;
700         size_t bbytes;
701         size_t boff;
702         void *bdata;
703
704         /*
705          * Tells flush that modify_tid must be updated, otherwise only
706          * mirror_tid is updated.  This is the default.
707          */
708         if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
709                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFY_TID);
710
711         /*
712          * If the chain is already marked MODIFIED we can just return.
713          */
714         if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
715                 if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
716                     chain->bp == NULL) {
717                         goto skip1;
718                 }
719                 return;
720         }
721
722         /*
723          * Set MODIFIED and add a chain ref to prevent destruction.  Both
724          * modified flags share the same ref.
725          */
726         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
727         hammer2_chain_ref(hmp, chain);
728
729         /*
730          * We must allocate the copy-on-write block.
731          *
732          * If the data is embedded no other action is required.
733          *
734          * If the data is not embedded we acquire and clear the
735          * new block.  If chain->data is not NULL we then do the
736          * copy-on-write.  chain->data will then be repointed to the new
737          * buffer and the old buffer will be released.
738          *
739          * For newly created elements with no prior allocation we go
740          * through the copy-on-write steps except without the copying part.
741          */
742         if (chain != &hmp->vchain) {
743                 if ((hammer2_debug & 0x0001) &&
744                     (chain->bref.data_off & HAMMER2_OFF_MASK)) {
745                         kprintf("Replace %d\n", chain->bytes);
746                 }
747                 chain->bref.data_off =
748                         hammer2_freemap_alloc(hmp, chain->bref.type,
749                                               chain->bytes);
750                 /* XXX failed allocation */
751         }
752
753         /*
754          * If data instantiation is optional and the chain has no current
755          * data association (typical for DATA and newly-created INDIRECT
756          * elements), don't instantiate the buffer now.
757          */
758         if ((flags & HAMMER2_MODIFY_OPTDATA) && chain->bp == NULL)
759                 goto skip2;
760
761 skip1:
762         /*
763          * Setting the DIRTYBP flag will cause the buffer to be dirtied or
764          * written-out on unlock.  This bit is independent of the MODIFIED
765          * bit because the chain may still need meta-data adjustments done
766          * by virtue of MODIFIED for its parent, and the buffer can be
767          * flushed out (possibly multiple times) by the OS before that.
768          *
769          * Clearing the INITIAL flag (for indirect blocks) indicates that
770          * a zero-fill buffer has been instantiated.
771          */
772         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
773         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
774
775         /*
776          * We currently should never instantiate a device buffer for a
777          * data chain.
778          */
779         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
780
781         /*
782          * Execute COW operation
783          */
784         switch(chain->bref.type) {
785         case HAMMER2_BREF_TYPE_VOLUME:
786         case HAMMER2_BREF_TYPE_INODE:
787                 /*
788                  * The data is embedded, no copy-on-write operation is
789                  * needed.
790                  */
791                 KKASSERT(chain->bp == NULL);
792                 break;
793         case HAMMER2_BREF_TYPE_DATA:
794         case HAMMER2_BREF_TYPE_INDIRECT:
795                 /*
796                  * Perform the copy-on-write operation
797                  */
798                 KKASSERT(chain != &hmp->vchain);        /* safety */
799                 /*
800                  * The device buffer may be larger than the allocation size.
801                  */
802                 if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
803                         bbytes = HAMMER2_MINIOSIZE;
804                 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
805                 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
806
807                 /*
808                  * The getblk() optimization can only be used if the
809                  * physical block size matches the request.
810                  */
811                 if (chain->bytes == bbytes) {
812                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
813                         error = 0;
814                 } else {
815                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
816                         KKASSERT(error == 0);
817                 }
818                 bdata = (char *)nbp->b_data + boff;
819
820                 /*
821                  * Copy or zero-fill on write depending on whether
822                  * chain->data exists or not.
823                  */
824                 if (chain->data) {
825                         bcopy(chain->data, bdata, chain->bytes);
826                         KKASSERT(chain->bp != NULL);
827                 } else {
828                         bzero(bdata, chain->bytes);
829                 }
830                 if (chain->bp) {
831                         chain->bp->b_flags |= B_RELBUF;
832                         brelse(chain->bp);
833                 }
834                 chain->bp = nbp;
835                 chain->data = bdata;
836                 break;
837         default:
838                 panic("hammer2_chain_modify: illegal non-embedded type %d",
839                       chain->bref.type);
840                 break;
841
842         }
843 skip2:
844         if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
845                 hammer2_chain_parent_setsubmod(hmp, chain);
846 }
847
848 /*
849  * Mark the volume as having been modified.  This short-cut version
850  * does not have to lock the volume's chain, which allows the ioctl
851  * code to make adjustments to connections without deadlocking.
852  */
853 void
854 hammer2_modify_volume(hammer2_mount_t *hmp)
855 {
856         hammer2_voldata_lock(hmp);
857         atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
858         hammer2_voldata_unlock(hmp);
859 }
860
861 /*
862  * Locate an in-memory chain.  The parent must be locked.  The in-memory
863  * chain is returned or NULL if no in-memory chain is present.
864  *
865  * NOTE: A chain on-media might exist for this index when NULL is returned.
866  */
867 hammer2_chain_t *
868 hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
869 {
870         hammer2_chain_t dummy;
871         hammer2_chain_t *chain;
872
873         dummy.index = index;
874         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
875         return (chain);
876 }
877
878 /*
879  * Return a locked chain structure with all associated data acquired.
880  *
881  * Caller must lock the parent on call, the returned child will be locked.
882  */
883 hammer2_chain_t *
884 hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
885                   int index, int flags)
886 {
887         hammer2_blockref_t *bref;
888         hammer2_chain_t *chain;
889         hammer2_chain_t dummy;
890         int how;
891
892         /*
893          * Figure out how to lock.  MAYBE can be used to optimized
894          * the initial-create state for indirect blocks.
895          */
896         if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
897                 how = HAMMER2_RESOLVE_NEVER;
898         else
899                 how = HAMMER2_RESOLVE_MAYBE;
900
901         /*
902          * First see if we have a (possibly modified) chain element cached
903          * for this (parent, index).  Acquire the data if necessary.
904          *
905          * If chain->data is non-NULL the chain should already be marked
906          * modified.
907          */
908         dummy.index = index;
909         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
910         if (chain) {
911                 if (flags & HAMMER2_LOOKUP_NOLOCK)
912                         hammer2_chain_ref(hmp, chain);
913                 else
914                         hammer2_chain_lock(hmp, chain, how);
915                 return(chain);
916         }
917
918         /*
919          * the get function must always succeed, panic if there's no
920          * data to index.
921          */
922         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
923                 panic("hammer2_chain_get: Missing bref(1)");
924                 /* NOT REACHED */
925         }
926
927         /*
928          * Otherwise lookup the bref and issue I/O (switch on the parent)
929          */
930         switch(parent->bref.type) {
931         case HAMMER2_BREF_TYPE_INODE:
932                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
933                 bref = &parent->data->ipdata.u.blockset.blockref[index];
934                 break;
935         case HAMMER2_BREF_TYPE_INDIRECT:
936                 KKASSERT(parent->data != NULL);
937                 KKASSERT(index >= 0 &&
938                          index < parent->bytes / sizeof(hammer2_blockref_t));
939                 bref = &parent->data->npdata.blockref[index];
940                 break;
941         case HAMMER2_BREF_TYPE_VOLUME:
942                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
943                 bref = &hmp->voldata.sroot_blockset.blockref[index];
944                 break;
945         default:
946                 bref = NULL;
947                 panic("hammer2_chain_get: unrecognized blockref type: %d",
948                       parent->bref.type);
949         }
950         if (bref->type == 0) {
951                 panic("hammer2_chain_get: Missing bref(2)");
952                 /* NOT REACHED */
953         }
954
955         /*
956          * Allocate a chain structure representing the existing media
957          * entry.
958          *
959          * The locking operation we do later will issue I/O to read it.
960          */
961         chain = hammer2_chain_alloc(hmp, bref);
962
963         /*
964          * Link the chain into its parent.  Caller is expected to hold an
965          * exclusive lock on the parent.
966          */
967         chain->parent = parent;
968         chain->index = index;
969         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
970                 panic("hammer2_chain_link: collision");
971         KKASSERT(parent->refs > 0);
972         atomic_add_int(&parent->refs, 1);       /* for splay entry */
973
974         /*
975          * Additional linkage for inodes.  Reuse the parent pointer to
976          * find the parent directory.
977          */
978         if (bref->type == HAMMER2_BREF_TYPE_INODE) {
979                 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
980                         parent = parent->parent;
981                 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
982                         chain->u.ip->pip = parent->u.ip;
983                         chain->u.ip->pmp = parent->u.ip->pmp;
984                         chain->u.ip->depth = parent->u.ip->depth + 1;
985                 }
986         }
987
988         /*
989          * Our new chain structure has already been referenced and locked
990          * but the lock code handles the I/O so call it to resolve the data.
991          * Then release one of our two exclusive locks.
992          *
993          * If NOLOCK is set the release will release the one-and-only lock.
994          */
995         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
996                 hammer2_chain_lock(hmp, chain, how);    /* recusive lock */
997                 hammer2_chain_drop(hmp, chain);         /* excess ref */
998         }
999         lockmgr(&chain->lk, LK_RELEASE);                /* from alloc */
1000
1001         return (chain);
1002 }
1003
1004 /*
1005  * Locate any key between key_beg and key_end inclusive.  (*parentp)
1006  * typically points to an inode but can also point to a related indirect
1007  * block and this function will recurse upwards and find the inode again.
1008  *
1009  * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
1010  *           WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
1011  *           WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
1012  *
1013  * (*parentp) must be exclusively locked and referenced and can be an inode
1014  * or an existing indirect block within the inode.
1015  *
1016  * On return (*parentp) will be modified to point at the deepest parent chain
1017  * element encountered during the search, as a helper for an insertion or
1018  * deletion.   The new (*parentp) will be locked and referenced and the old
1019  * will be unlocked and dereferenced (no change if they are both the same).
1020  *
1021  * The matching chain will be returned exclusively locked and referenced.
1022  *
1023  * NULL is returned if no match was found, but (*parentp) will still
1024  * potentially be adjusted.
1025  *
1026  * This function will also recurse up the chain if the key is not within the
1027  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
1028  * can simply allow (*parentp) to float inside the loop.
1029  */
1030 hammer2_chain_t *
1031 hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
1032                      hammer2_key_t key_beg, hammer2_key_t key_end,
1033                      int flags)
1034 {
1035         hammer2_chain_t *parent;
1036         hammer2_chain_t *chain;
1037         hammer2_chain_t *tmp;
1038         hammer2_blockref_t *base;
1039         hammer2_blockref_t *bref;
1040         hammer2_key_t scan_beg;
1041         hammer2_key_t scan_end;
1042         int count = 0;
1043         int i;
1044
1045         /*
1046          * Recurse (*parentp) upward if necessary until the parent completely
1047          * encloses the key range or we hit the inode.
1048          */
1049         parent = *parentp;
1050         while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1051                 scan_beg = parent->bref.key;
1052                 scan_end = scan_beg +
1053                            ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1054                 if (key_beg >= scan_beg && key_end <= scan_end)
1055                         break;
1056                 hammer2_chain_ref(hmp, parent);         /* ref old parent */
1057                 hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
1058                 parent = parent->parent;
1059                                                         /* lock new parent */
1060                 hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
1061                 hammer2_chain_drop(hmp, *parentp);      /* drop old parent */
1062                 *parentp = parent;                      /* new parent */
1063         }
1064
1065 again:
1066         /*
1067          * Locate the blockref array.  Currently we do a fully associative
1068          * search through the array.
1069          */
1070         switch(parent->bref.type) {
1071         case HAMMER2_BREF_TYPE_INODE:
1072                 /*
1073                  * Special shortcut for embedded data returns the inode
1074                  * itself.  Callers must detect this condition and access
1075                  * the embedded data (the strategy code does this for us).
1076                  *
1077                  * This is only applicable to regular files and softlinks.
1078                  */
1079                 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1080                         if (flags & HAMMER2_LOOKUP_NOLOCK)
1081                                 hammer2_chain_ref(hmp, parent);
1082                         else
1083                                 hammer2_chain_lock(hmp, parent,
1084                                                    HAMMER2_RESOLVE_ALWAYS);
1085                         return (parent);
1086                 }
1087                 base = &parent->data->ipdata.u.blockset.blockref[0];
1088                 count = HAMMER2_SET_COUNT;
1089                 break;
1090         case HAMMER2_BREF_TYPE_INDIRECT:
1091                 /*
1092                  * Optimize indirect blocks in the INITIAL state to avoid
1093                  * I/O.
1094                  */
1095                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1096                         base = NULL;
1097                 } else {
1098                         if (parent->data == NULL)
1099                                 panic("parent->data is NULL");
1100                         base = &parent->data->npdata.blockref[0];
1101                 }
1102                 count = parent->bytes / sizeof(hammer2_blockref_t);
1103                 break;
1104         case HAMMER2_BREF_TYPE_VOLUME:
1105                 base = &hmp->voldata.sroot_blockset.blockref[0];
1106                 count = HAMMER2_SET_COUNT;
1107                 break;
1108         default:
1109                 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
1110                       parent->bref.type);
1111                 base = NULL;    /* safety */
1112                 count = 0;      /* safety */
1113         }
1114
1115         /*
1116          * If the element and key overlap we use the element.
1117          */
1118         bref = NULL;
1119         for (i = 0; i < count; ++i) {
1120                 tmp = hammer2_chain_find(hmp, parent, i);
1121                 if (tmp) {
1122                         bref = &tmp->bref;
1123                         KKASSERT(bref->type != 0);
1124                 } else if (base == NULL || base[i].type == 0) {
1125                         continue;
1126                 } else {
1127                         bref = &base[i];
1128                 }
1129                 scan_beg = bref->key;
1130                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1131                 if (key_beg <= scan_end && key_end >= scan_beg)
1132                         break;
1133         }
1134         if (i == count) {
1135                 if (key_beg == key_end)
1136                         return (NULL);
1137                 return (hammer2_chain_next(hmp, parentp, NULL,
1138                                            key_beg, key_end, flags));
1139         }
1140
1141         /*
1142          * Acquire the new chain element.  If the chain element is an
1143          * indirect block we must search recursively.
1144          */
1145         chain = hammer2_chain_get(hmp, parent, i, flags);
1146         if (chain == NULL)
1147                 return (NULL);
1148
1149         /*
1150          * If the chain element is an indirect block it becomes the new
1151          * parent and we loop on it.
1152          *
1153          * The parent always has to be locked with at least RESOLVE_MAYBE,
1154          * so it might need a fixup if the caller passed incompatible flags.
1155          */
1156         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1157                 hammer2_chain_unlock(hmp, parent);
1158                 *parentp = parent = chain;
1159                 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1160                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
1161                         hammer2_chain_drop(hmp, chain); /* excess ref */
1162                 } else if (flags & HAMMER2_LOOKUP_NODATA) {
1163                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
1164                         hammer2_chain_unlock(hmp, chain);
1165                 }
1166                 goto again;
1167         }
1168
1169         /*
1170          * All done, return chain
1171          */
1172         return (chain);
1173 }
1174
1175 /*
1176  * After having issued a lookup we can iterate all matching keys.
1177  *
1178  * If chain is non-NULL we continue the iteration from just after it's index.
1179  *
1180  * If chain is NULL we assume the parent was exhausted and continue the
1181  * iteration at the next parent.
1182  *
1183  * parent must be locked on entry and remains locked throughout.  chain's
1184  * lock status must match flags.
1185  */
1186 hammer2_chain_t *
1187 hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
1188                    hammer2_chain_t *chain,
1189                    hammer2_key_t key_beg, hammer2_key_t key_end,
1190                    int flags)
1191 {
1192         hammer2_chain_t *parent;
1193         hammer2_chain_t *tmp;
1194         hammer2_blockref_t *base;
1195         hammer2_blockref_t *bref;
1196         hammer2_key_t scan_beg;
1197         hammer2_key_t scan_end;
1198         int i;
1199         int count;
1200
1201         parent = *parentp;
1202
1203 again:
1204         /*
1205          * Calculate the next index and recalculate the parent if necessary.
1206          */
1207         if (chain) {
1208                 /*
1209                  * Continue iteration within current parent.  If not NULL
1210                  * the passed-in chain may or may not be locked, based on
1211                  * the LOOKUP_NOLOCK flag (passed in as returned from lookup
1212                  * or a prior next).
1213                  */
1214                 i = chain->index + 1;
1215                 if (flags & HAMMER2_LOOKUP_NOLOCK)
1216                         hammer2_chain_drop(hmp, chain);
1217                 else
1218                         hammer2_chain_unlock(hmp, chain);
1219
1220                 /*
1221                  * Any scan where the lookup returned degenerate data embedded
1222                  * in the inode has an invalid index and must terminate.
1223                  */
1224                 if (chain == parent)
1225                         return(NULL);
1226                 chain = NULL;
1227         } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT) {
1228                 /*
1229                  * We reached the end of the iteration.
1230                  */
1231                 return (NULL);
1232         } else {
1233                 /*
1234                  * Continue iteration with next parent unless the current
1235                  * parent covers the range.
1236                  */
1237                 hammer2_chain_t *nparent;
1238
1239                 scan_beg = parent->bref.key;
1240                 scan_end = scan_beg +
1241                             ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1242                 if (key_beg >= scan_beg && key_end <= scan_end)
1243                         return (NULL);
1244
1245                 i = parent->index + 1;
1246                 nparent = parent->parent;
1247                 hammer2_chain_ref(hmp, nparent);        /* ref new parent */
1248                 hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
1249                                                         /* lock new parent */
1250                 hammer2_chain_lock(hmp, nparent, HAMMER2_RESOLVE_MAYBE);
1251                 hammer2_chain_drop(hmp, nparent);       /* drop excess ref */
1252                 *parentp = parent = nparent;
1253         }
1254
1255 again2:
1256         /*
1257          * Locate the blockref array.  Currently we do a fully associative
1258          * search through the array.
1259          */
1260         switch(parent->bref.type) {
1261         case HAMMER2_BREF_TYPE_INODE:
1262                 base = &parent->data->ipdata.u.blockset.blockref[0];
1263                 count = HAMMER2_SET_COUNT;
1264                 break;
1265         case HAMMER2_BREF_TYPE_INDIRECT:
1266                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1267                         base = NULL;
1268                 } else {
1269                         KKASSERT(parent->data != NULL);
1270                         base = &parent->data->npdata.blockref[0];
1271                 }
1272                 count = parent->bytes / sizeof(hammer2_blockref_t);
1273                 break;
1274         case HAMMER2_BREF_TYPE_VOLUME:
1275                 base = &hmp->voldata.sroot_blockset.blockref[0];
1276                 count = HAMMER2_SET_COUNT;
1277                 break;
1278         default:
1279                 panic("hammer2_chain_next: unrecognized blockref type: %d",
1280                       parent->bref.type);
1281                 base = NULL;    /* safety */
1282                 count = 0;      /* safety */
1283                 break;
1284         }
1285         KKASSERT(i <= count);
1286
1287         /*
1288          * Look for the key.  If we are unable to find a match and an exact
1289          * match was requested we return NULL.  If a range was requested we
1290          * run hammer2_chain_next() to iterate.
1291          */
1292         bref = NULL;
1293         while (i < count) {
1294                 tmp = hammer2_chain_find(hmp, parent, i);
1295                 if (tmp) {
1296                         bref = &tmp->bref;
1297                 } else if (base == NULL || base[i].type == 0) {
1298                         ++i;
1299                         continue;
1300                 } else {
1301                         bref = &base[i];
1302                 }
1303                 scan_beg = bref->key;
1304                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1305                 if (key_beg <= scan_end && key_end >= scan_beg)
1306                         break;
1307                 ++i;
1308         }
1309
1310         /*
1311          * If we couldn't find a match recurse up a parent to continue the
1312          * search.
1313          */
1314         if (i == count)
1315                 goto again;
1316
1317         /*
1318          * Acquire the new chain element.  If the chain element is an
1319          * indirect block we must search recursively.
1320          */
1321         chain = hammer2_chain_get(hmp, parent, i, flags);
1322         if (chain == NULL)
1323                 return (NULL);
1324
1325         /*
1326          * If the chain element is an indirect block it becomes the new
1327          * parent and we loop on it.
1328          *
1329          * The parent always has to be locked with at least RESOLVE_MAYBE,
1330          * so it might need a fixup if the caller passed incompatible flags.
1331          */
1332         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1333                 hammer2_chain_unlock(hmp, parent);
1334                 *parentp = parent = chain;
1335                 chain = NULL;
1336                 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1337                         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
1338                         hammer2_chain_drop(hmp, parent);        /* excess ref */
1339                 } else if (flags & HAMMER2_LOOKUP_NODATA) {
1340                         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
1341                         hammer2_chain_unlock(hmp, parent);
1342                 }
1343                 i = 0;
1344                 goto again2;
1345         }
1346
1347         /*
1348          * All done, return chain
1349          */
1350         return (chain);
1351 }
1352
1353 /*
1354  * Create and return a new hammer2 system memory structure of the specified
1355  * key, type and size and insert it RELATIVE TO (PARENT).
1356  *
1357  * (parent) is typically either an inode or an indirect block, acquired
1358  * acquired as a side effect of issuing a prior failed lookup.  parent
1359  * must be locked and held.  Do not pass the inode chain to this function
1360  * unless that is the chain returned by the failed lookup.
1361  *
1362  * Non-indirect types will automatically allocate indirect blocks as required
1363  * if the new item does not fit in the current (parent).
1364  *
1365  * Indirect types will move a portion of the existing blockref array in
1366  * (parent) into the new indirect type and then use one of the free slots
1367  * to emplace the new indirect type.
1368  *
1369  * A new locked, referenced chain element is returned of the specified type.
1370  * The element may or may not have a data area associated with it:
1371  *
1372  *      VOLUME          not allowed here
1373  *      INODE           embedded data are will be set-up
1374  *      INDIRECT        not allowed here
1375  *      DATA            no data area will be set-up (caller is expected
1376  *                      to have logical buffers, we don't want to alias
1377  *                      the data onto device buffers!).
1378  */
1379 hammer2_chain_t *
1380 hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1381                      hammer2_chain_t *chain,
1382                      hammer2_key_t key, int keybits, int type, size_t bytes)
1383 {
1384         hammer2_blockref_t dummy;
1385         hammer2_blockref_t *base;
1386         hammer2_chain_t dummy_chain;
1387         int unlock_parent = 0;
1388         int allocated = 0;
1389         int count;
1390         int i;
1391
1392         if (chain == NULL) {
1393                 /*
1394                  * First allocate media space and construct the dummy bref,
1395                  * then allocate the in-memory chain structure.
1396                  */
1397                 bzero(&dummy, sizeof(dummy));
1398                 dummy.type = type;
1399                 dummy.key = key;
1400                 dummy.keybits = keybits;
1401                 dummy.data_off = hammer2_bytes_to_radix(bytes);
1402                 chain = hammer2_chain_alloc(hmp, &dummy);
1403                 allocated = 1;
1404
1405                 /*
1406                  * We do NOT set INITIAL here (yet).  INITIAL is only
1407                  * used for indirect blocks.
1408                  *
1409                  * Recalculate bytes to reflect the actual media block
1410                  * allocation.
1411                  */
1412                 bytes = (hammer2_off_t)1 <<
1413                         (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
1414                 chain->bytes = bytes;
1415
1416                 switch(type) {
1417                 case HAMMER2_BREF_TYPE_VOLUME:
1418                         panic("hammer2_chain_create: called with volume type");
1419                         break;
1420                 case HAMMER2_BREF_TYPE_INODE:
1421                         KKASSERT(bytes == HAMMER2_INODE_BYTES);
1422                         chain->data = (void *)&chain->u.ip->ip_data;
1423                         break;
1424                 case HAMMER2_BREF_TYPE_INDIRECT:
1425                         panic("hammer2_chain_create: cannot be used to"
1426                               "create indirect block");
1427                         break;
1428                 case HAMMER2_BREF_TYPE_DATA:
1429                 default:
1430                         /* leave chain->data NULL */
1431                         KKASSERT(chain->data == NULL);
1432                         break;
1433                 }
1434         } else {
1435                 /*
1436                  * Potentially update the chain's key/keybits.
1437                  */
1438                 chain->bref.key = key;
1439                 chain->bref.keybits = keybits;
1440         }
1441
1442 again:
1443         /*
1444          * Locate a free blockref in the parent's array
1445          */
1446         switch(parent->bref.type) {
1447         case HAMMER2_BREF_TYPE_INODE:
1448                 KKASSERT((parent->u.ip->ip_data.op_flags &
1449                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
1450                 KKASSERT(parent->data != NULL);
1451                 base = &parent->data->ipdata.u.blockset.blockref[0];
1452                 count = HAMMER2_SET_COUNT;
1453                 break;
1454         case HAMMER2_BREF_TYPE_INDIRECT:
1455                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1456                         base = NULL;
1457                 } else {
1458                         KKASSERT(parent->data != NULL);
1459                         base = &parent->data->npdata.blockref[0];
1460                 }
1461                 count = parent->bytes / sizeof(hammer2_blockref_t);
1462                 break;
1463         case HAMMER2_BREF_TYPE_VOLUME:
1464                 KKASSERT(parent->data != NULL);
1465                 base = &hmp->voldata.sroot_blockset.blockref[0];
1466                 count = HAMMER2_SET_COUNT;
1467                 break;
1468         default:
1469                 panic("hammer2_chain_create: unrecognized blockref type: %d",
1470                       parent->bref.type);
1471                 count = 0;
1472                 break;
1473         }
1474
1475         /*
1476          * Scan for an unallocated bref, also skipping any slots occupied
1477          * by in-memory chain elements that may not yet have been updated
1478          * in the parent's bref array.
1479          */
1480         bzero(&dummy_chain, sizeof(dummy_chain));
1481         for (i = 0; i < count; ++i) {
1482                 if (base == NULL) {
1483                         dummy_chain.index = i;
1484                         if (SPLAY_FIND(hammer2_chain_splay,
1485                                        &parent->shead, &dummy_chain) == NULL) {
1486                                 break;
1487                         }
1488                 } else if (base[i].type == 0) {
1489                         dummy_chain.index = i;
1490                         if (SPLAY_FIND(hammer2_chain_splay,
1491                                        &parent->shead, &dummy_chain) == NULL) {
1492                                 break;
1493                         }
1494                 }
1495         }
1496
1497         /*
1498          * If no free blockref could be found we must create an indirect
1499          * block and move a number of blockrefs into it.  With the parent
1500          * locked we can safely lock each child in order to move it without
1501          * causing a deadlock.
1502          *
1503          * This may return the new indirect block or the old parent depending
1504          * on where the key falls.
1505          */
1506         if (i == count) {
1507                 hammer2_chain_t *nparent;
1508
1509                 nparent = hammer2_chain_create_indirect(hmp, parent,
1510                                                         key, keybits);
1511                 if (nparent == NULL) {
1512                         if (allocated)
1513                                 hammer2_chain_free(hmp, chain);
1514                         chain = NULL;
1515                         goto done;
1516                 }
1517                 if (parent != nparent) {
1518                         if (unlock_parent)
1519                                 hammer2_chain_unlock(hmp, parent);
1520                         parent = nparent;
1521                         unlock_parent = 1;
1522                 }
1523                 goto again;
1524         }
1525
1526         /*
1527          * Link the chain into its parent.  Later on we will have to set
1528          * the MOVED bit in situations where we don't mark the new chain
1529          * as being modified.
1530          */
1531         if (chain->parent != NULL)
1532                 panic("hammer2: hammer2_chain_create: chain already connected");
1533         KKASSERT(chain->parent == NULL);
1534         chain->parent = parent;
1535         chain->index = i;
1536         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
1537                 panic("hammer2_chain_link: collision");
1538         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
1539         KKASSERT(parent->refs > 0);
1540         atomic_add_int(&parent->refs, 1);
1541
1542         /*
1543          * Additional linkage for inodes.  Reuse the parent pointer to
1544          * find the parent directory.
1545          *
1546          * Cumulative adjustments are inherited on [re]attach.
1547          */
1548         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1549                 hammer2_chain_t *scan = parent;
1550                 hammer2_inode_t *ip = chain->u.ip;
1551
1552                 while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1553                         scan = scan->parent;
1554                 if (scan->bref.type == HAMMER2_BREF_TYPE_INODE) {
1555                         ip->pip = scan->u.ip;
1556                         ip->pmp = scan->u.ip->pmp;
1557                         ip->depth = scan->u.ip->depth + 1;
1558                         ip->delta_icount += ip->ip_data.inode_count;
1559                         ip->delta_dcount += ip->ip_data.data_count;
1560                         ++ip->pip->delta_icount;
1561                 }
1562         }
1563
1564         /*
1565          * (allocated) indicates that this is a newly-created chain element
1566          * rather than a renamed chain element.  In this situation we want
1567          * to place the chain element in the MODIFIED state.
1568          *
1569          * The data area will be set up as follows:
1570          *
1571          *      VOLUME          not allowed here.
1572          *
1573          *      INODE           embedded data are will be set-up.
1574          *
1575          *      INDIRECT        not allowed here.
1576          *
1577          *      DATA            no data area will be set-up (caller is expected
1578          *                      to have logical buffers, we don't want to alias
1579          *                      the data onto device buffers!).
1580          */
1581         if (allocated) {
1582                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1583                         hammer2_chain_modify(hmp, chain,
1584                                              HAMMER2_MODIFY_OPTDATA);
1585                 } else if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1586                         /* not supported in this function */
1587                         panic("hammer2_chain_create: bad type");
1588                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1589                         hammer2_chain_modify(hmp, chain,
1590                                              HAMMER2_MODIFY_OPTDATA);
1591                 } else {
1592                         hammer2_chain_modify(hmp, chain, 0);
1593                 }
1594         } else {
1595                 /*
1596                  * When reconnecting inodes we have to call setsubmod()
1597                  * to ensure that its state propagates up the newly
1598                  * connected parent.
1599                  *
1600                  * Make sure MOVED is set but do not update bref_flush.  If
1601                  * the chain is undergoing modification bref_flush will be
1602                  * updated when it gets flushed.  If it is not then the
1603                  * bref may not have been flushed yet and we do not want to
1604                  * set MODIFIED here as this could result in unnecessary
1605                  * reallocations.
1606                  */
1607                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1608                         hammer2_chain_ref(hmp, chain);
1609                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1610                 }
1611                 hammer2_chain_parent_setsubmod(hmp, chain);
1612         }
1613
1614 done:
1615         if (unlock_parent)
1616                 hammer2_chain_unlock(hmp, parent);
1617         return (chain);
1618 }
1619
1620 /*
1621  * Create an indirect block that covers one or more of the elements in the
1622  * current parent.  Either returns the existing parent with no locking or
1623  * ref changes or returns the new indirect block locked and referenced
1624  * and leaving the original parent lock/ref intact as well.
1625  *
1626  * The returned chain depends on where the specified key falls.
1627  *
1628  * The key/keybits for the indirect mode only needs to follow three rules:
1629  *
1630  * (1) That all elements underneath it fit within its key space and
1631  *
1632  * (2) That all elements outside it are outside its key space.
1633  *
1634  * (3) When creating the new indirect block any elements in the current
1635  *     parent that fit within the new indirect block's keyspace must be
1636  *     moved into the new indirect block.
1637  *
1638  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
1639  *     keyspace the the current parent, but lookup/iteration rules will
1640  *     ensure (and must ensure) that rule (2) for all parents leading up
1641  *     to the nearest inode or the root volume header is adhered to.  This
1642  *     is accomplished by always recursing through matching keyspaces in
1643  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
1644  *
1645  * The current implementation calculates the current worst-case keyspace by
1646  * iterating the current parent and then divides it into two halves, choosing
1647  * whichever half has the most elements (not necessarily the half containing
1648  * the requested key).
1649  *
1650  * We can also opt to use the half with the least number of elements.  This
1651  * causes lower-numbered keys (aka logical file offsets) to recurse through
1652  * fewer indirect blocks and higher-numbered keys to recurse through more.
1653  * This also has the risk of not moving enough elements to the new indirect
1654  * block and being forced to create several indirect blocks before the element
1655  * can be inserted.
1656  */
1657 static
1658 hammer2_chain_t *
1659 hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1660                               hammer2_key_t create_key, int create_bits)
1661 {
1662         hammer2_blockref_t *base;
1663         hammer2_blockref_t *bref;
1664         hammer2_chain_t *chain;
1665         hammer2_chain_t *ichain;
1666         hammer2_chain_t dummy;
1667         hammer2_key_t key = create_key;
1668         int keybits = create_bits;
1669         int locount = 0;
1670         int hicount = 0;
1671         int count;
1672         int nbytes;
1673         int i;
1674
1675         /*
1676          * Calculate the base blockref pointer or NULL if the chain
1677          * is known to be empty.  We need to calculate the array count
1678          * for SPLAY lookups either way.
1679          */
1680         hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
1681         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1682                 base = NULL;
1683
1684                 switch(parent->bref.type) {
1685                 case HAMMER2_BREF_TYPE_INODE:
1686                         count = HAMMER2_SET_COUNT;
1687                         break;
1688                 case HAMMER2_BREF_TYPE_INDIRECT:
1689                         count = parent->bytes / sizeof(hammer2_blockref_t);
1690                         break;
1691                 case HAMMER2_BREF_TYPE_VOLUME:
1692                         count = HAMMER2_SET_COUNT;
1693                         break;
1694                 default:
1695                         panic("hammer2_chain_create_indirect: "
1696                               "unrecognized blockref type: %d",
1697                               parent->bref.type);
1698                         count = 0;
1699                         break;
1700                 }
1701         } else {
1702                 switch(parent->bref.type) {
1703                 case HAMMER2_BREF_TYPE_INODE:
1704                         base = &parent->data->ipdata.u.blockset.blockref[0];
1705                         count = HAMMER2_SET_COUNT;
1706                         break;
1707                 case HAMMER2_BREF_TYPE_INDIRECT:
1708                         base = &parent->data->npdata.blockref[0];
1709                         count = parent->bytes / sizeof(hammer2_blockref_t);
1710                         break;
1711                 case HAMMER2_BREF_TYPE_VOLUME:
1712                         base = &hmp->voldata.sroot_blockset.blockref[0];
1713                         count = HAMMER2_SET_COUNT;
1714                         break;
1715                 default:
1716                         panic("hammer2_chain_create_indirect: "
1717                               "unrecognized blockref type: %d",
1718                               parent->bref.type);
1719                         count = 0;
1720                         break;
1721                 }
1722         }
1723
1724         /*
1725          * Scan for an unallocated bref, also skipping any slots occupied
1726          * by in-memory chain elements which may not yet have been updated
1727          * in the parent's bref array.
1728          */
1729         bzero(&dummy, sizeof(dummy));
1730         for (i = 0; i < count; ++i) {
1731                 int nkeybits;
1732
1733                 dummy.index = i;
1734                 chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
1735                 if (chain) {
1736                         bref = &chain->bref;
1737                 } else if (base && base[i].type) {
1738                         bref = &base[i];
1739                 } else {
1740                         continue;
1741                 }
1742
1743                 /*
1744                  * Expand our calculated key range (key, keybits) to fit
1745                  * the scanned key.  nkeybits represents the full range
1746                  * that we will later cut in half (two halves @ nkeybits - 1).
1747                  */
1748                 nkeybits = keybits;
1749                 if (nkeybits < bref->keybits)
1750                         nkeybits = bref->keybits;
1751                 while (nkeybits < 64 &&
1752                        (~(((hammer2_key_t)1 << nkeybits) - 1) &
1753                         (key ^ bref->key)) != 0) {
1754                         ++nkeybits;
1755                 }
1756
1757                 /*
1758                  * If the new key range is larger we have to determine
1759                  * which side of the new key range the existing keys fall
1760                  * under by checking the high bit, then collapsing the
1761                  * locount into the hicount or vise-versa.
1762                  */
1763                 if (keybits != nkeybits) {
1764                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
1765                                 hicount += locount;
1766                                 locount = 0;
1767                         } else {
1768                                 locount += hicount;
1769                                 hicount = 0;
1770                         }
1771                         keybits = nkeybits;
1772                 }
1773
1774                 /*
1775                  * The newly scanned key will be in the lower half or the
1776                  * higher half of the (new) key range.
1777                  */
1778                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
1779                         ++hicount;
1780                 else
1781                         ++locount;
1782         }
1783
1784         /*
1785          * Adjust keybits to represent half of the full range calculated
1786          * above (radix 63 max)
1787          */
1788         --keybits;
1789
1790         /*
1791          * Select whichever half contains the most elements.  Theoretically
1792          * we can select either side as long as it contains at least one
1793          * element (in order to ensure that a free slot is present to hold
1794          * the indirect block).
1795          */
1796         key &= ~(((hammer2_key_t)1 << keybits) - 1);
1797         if (hammer2_indirect_optimize) {
1798                 /*
1799                  * Insert node for least number of keys, this will arrange
1800                  * the first few blocks of a large file or the first few
1801                  * inodes in a directory with fewer indirect blocks when
1802                  * created linearly.
1803                  */
1804                 if (hicount < locount && hicount != 0)
1805                         key |= (hammer2_key_t)1 << keybits;
1806                 else
1807                         key &= ~(hammer2_key_t)1 << keybits;
1808         } else {
1809                 /*
1810                  * Insert node for most number of keys, best for heavily
1811                  * fragmented files.
1812                  */
1813                 if (hicount > locount)
1814                         key |= (hammer2_key_t)1 << keybits;
1815                 else
1816                         key &= ~(hammer2_key_t)1 << keybits;
1817         }
1818
1819         /*
1820          * How big should our new indirect block be?  It has to be at least
1821          * as large as its parent.
1822          */
1823         if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
1824                 nbytes = HAMMER2_IND_BYTES_MIN;
1825         else
1826                 nbytes = HAMMER2_IND_BYTES_MAX;
1827         if (nbytes < count * sizeof(hammer2_blockref_t))
1828                 nbytes = count * sizeof(hammer2_blockref_t);
1829
1830         /*
1831          * Ok, create our new indirect block
1832          */
1833         dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
1834         dummy.bref.key = key;
1835         dummy.bref.keybits = keybits;
1836         dummy.bref.data_off = hammer2_bytes_to_radix(nbytes);
1837         ichain = hammer2_chain_alloc(hmp, &dummy.bref);
1838         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
1839
1840         /*
1841          * Iterate the original parent and move the matching brefs into
1842          * the new indirect block.
1843          */
1844         for (i = 0; i < count; ++i) {
1845                 /*
1846                  * For keying purposes access the bref from the media or
1847                  * from our in-memory cache.  In cases where the in-memory
1848                  * cache overrides the media the keyrefs will be the same
1849                  * anyway so we can avoid checking the cache when the media
1850                  * has a key.
1851                  */
1852                 dummy.index = i;
1853                 chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
1854                 if (chain) {
1855                         bref = &chain->bref;
1856                 } else if (base && base[i].type) {
1857                         bref = &base[i];
1858                 } else {
1859                         if (ichain->index < 0)
1860                                 ichain->index = i;
1861                         continue;
1862                 }
1863
1864                 /*
1865                  * Skip keys not in the chosen half (low or high), only bit
1866                  * (keybits - 1) needs to be compared but for safety we
1867                  * will compare all msb bits plus that bit again.
1868                  */
1869                 if ((~(((hammer2_key_t)1 << keybits) - 1) &
1870                     (key ^ bref->key)) != 0) {
1871                         continue;
1872                 }
1873
1874                 /*
1875                  * This element is being moved from the parent, its slot
1876                  * is available for our new indirect block.
1877                  */
1878                 if (ichain->index < 0)
1879                         ichain->index = i;
1880
1881                 /*
1882                  * Load the new indirect block by acquiring or allocating
1883                  * the related chain entries, then simply move them to the
1884                  * new parent (ichain).
1885                  *
1886                  * When adjusting the parent/child relationship we must
1887                  * set the MOVED bit but we do NOT update bref_flush
1888                  * because otherwise we might synchronize a bref that has
1889                  * not yet been flushed.  We depend on chain's bref_flush
1890                  * either being correct or the chain being in a MODIFIED
1891                  * state.
1892                  *
1893                  * We do not want to set MODIFIED here as this would result
1894                  * in unnecessary reallocations.
1895                  *
1896                  * We must still set SUBMODIFIED in the parent but we do
1897                  * that after the loop.
1898                  *
1899                  * XXX we really need a lock here but we don't need the
1900                  *     data.  NODATA feature needed.
1901                  */
1902                 chain = hammer2_chain_get(hmp, parent, i,
1903                                           HAMMER2_LOOKUP_NODATA);
1904                 SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
1905                 if (SPLAY_INSERT(hammer2_chain_splay, &ichain->shead, chain))
1906                         panic("hammer2_chain_create_indirect: collision");
1907                 chain->parent = ichain;
1908                 if (base)
1909                         bzero(&base[i], sizeof(base[i]));
1910                 atomic_add_int(&parent->refs, -1);
1911                 atomic_add_int(&ichain->refs, 1);
1912                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1913                         hammer2_chain_ref(hmp, chain);
1914                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1915                 }
1916                 hammer2_chain_unlock(hmp, chain);
1917                 KKASSERT(parent->refs > 0);
1918                 chain = NULL;
1919         }
1920
1921         /*
1922          * Insert the new indirect block into the parent now that we've
1923          * cleared out some entries in the parent.  We calculated a good
1924          * insertion index in the loop above (ichain->index).
1925          *
1926          * We don't have to set MOVED here because we mark ichain modified
1927          * down below (so the normal modified -> flush -> set-moved sequence
1928          * applies).
1929          */
1930         KKASSERT(ichain->index >= 0);
1931         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, ichain))
1932                 panic("hammer2_chain_create_indirect: ichain insertion");
1933         ichain->parent = parent;
1934         atomic_add_int(&parent->refs, 1);
1935
1936         /*
1937          * Mark the new indirect block modified after insertion, which
1938          * will propagate up through parent all the way to the root and
1939          * also allocate the physical block in ichain for our caller,
1940          * and assign ichain->data to a pre-zero'd space (because there
1941          * is not prior data to copy into it).
1942          *
1943          * We have to set SUBMODIFIED in ichain's flags manually so the
1944          * flusher knows it has to recurse through it to get to all of
1945          * our moved blocks, then call setsubmod() to set the bit
1946          * recursively.
1947          */
1948         hammer2_chain_modify(hmp, ichain, HAMMER2_MODIFY_OPTDATA);
1949         hammer2_chain_parent_setsubmod(hmp, ichain);
1950         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
1951
1952         /*
1953          * Figure out what to return.
1954          */
1955         if (create_bits > keybits) {
1956                 /*
1957                  * Key being created is way outside the key range,
1958                  * return the original parent.
1959                  */
1960                 hammer2_chain_unlock(hmp, ichain);
1961         } else if (~(((hammer2_key_t)1 << keybits) - 1) &
1962                    (create_key ^ key)) {
1963                 /*
1964                  * Key being created is outside the key range,
1965                  * return the original parent.
1966                  */
1967                 hammer2_chain_unlock(hmp, ichain);
1968         } else {
1969                 /*
1970                  * Otherwise its in the range, return the new parent.
1971                  * (leave both the new and old parent locked).
1972                  */
1973                 parent = ichain;
1974         }
1975
1976         return(parent);
1977 }
1978
1979 /*
1980  * Physically delete the specified chain element.  Note that inodes with
1981  * open descriptors should not be deleted (as with other filesystems) until
1982  * the last open descriptor is closed.
1983  *
1984  * This routine will remove the chain element from its parent and potentially
1985  * also recurse upward and delete indirect blocks which become empty as a
1986  * side effect.
1987  *
1988  * The caller must pass a pointer to the chain's parent, also locked and
1989  * referenced.  (*parentp) will be modified in a manner similar to a lookup
1990  * or iteration when indirect blocks are also deleted as a side effect.
1991  *
1992  * XXX This currently does not adhere to the MOVED flag protocol in that
1993  *     the removal is immediately indicated in the parent's blockref[]
1994  *     array.
1995  */
1996 void
1997 hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1998                      hammer2_chain_t *chain, int retain)
1999 {
2000         hammer2_blockref_t *base;
2001         hammer2_inode_t *ip;
2002         int count;
2003
2004         if (chain->parent != parent)
2005                 panic("hammer2_chain_delete: parent mismatch");
2006
2007         /*
2008          * Mark the parent modified so our base[] pointer remains valid
2009          * while we move entries.  For the optimized indirect block
2010          * case mark the parent moved instead.
2011          *
2012          * Calculate the blockref reference in the parent
2013          */
2014         switch(parent->bref.type) {
2015         case HAMMER2_BREF_TYPE_INODE:
2016                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2017                 base = &parent->data->ipdata.u.blockset.blockref[0];
2018                 count = HAMMER2_SET_COUNT;
2019                 break;
2020         case HAMMER2_BREF_TYPE_INDIRECT:
2021                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA |
2022                                                   HAMMER2_MODIFY_NO_MODIFY_TID);
2023                 if (parent->flags & HAMMER2_CHAIN_INITIAL)
2024                         base = NULL;
2025                 else
2026                         base = &parent->data->npdata.blockref[0];
2027                 count = parent->bytes / sizeof(hammer2_blockref_t);
2028                 break;
2029         case HAMMER2_BREF_TYPE_VOLUME:
2030                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2031                 base = &hmp->voldata.sroot_blockset.blockref[0];
2032                 count = HAMMER2_SET_COUNT;
2033                 break;
2034         default:
2035                 panic("hammer2_chain_delete: unrecognized blockref type: %d",
2036                       parent->bref.type);
2037                 count = 0;
2038                 break;
2039         }
2040
2041         /*
2042          * Disconnect the bref in the parent, remove the chain, and
2043          * disconnect in-memory fields from the parent.
2044          */
2045         KKASSERT(chain->index >= 0 && chain->index < count);
2046         if (base)
2047                 bzero(&base[chain->index], sizeof(*base));
2048
2049         SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
2050         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
2051         atomic_add_int(&parent->refs, -1);      /* for splay entry */
2052         chain->index = -1;
2053         chain->parent = NULL;
2054
2055         /*
2056          * Cumulative adjustments must be propagated to the parent inode
2057          * when deleting and synchronized to ip.  A future reattachment
2058          * (e.g. during a rename) expects only to use ip_data.*_count.
2059          *
2060          * Clear the pointer to the parent inode.
2061          */
2062         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2063                 ip = chain->u.ip;
2064                 if (ip->pip) {
2065                         ip->pip->delta_icount += ip->delta_icount;
2066                         ip->pip->delta_dcount += ip->delta_dcount;
2067                         ip->ip_data.inode_count += ip->delta_icount;
2068                         ip->ip_data.data_count += ip->delta_dcount;
2069                         ip->delta_icount = 0;
2070                         ip->delta_dcount = 0;
2071                         --ip->pip->delta_icount;
2072                         ip->pip = NULL;
2073                 }
2074                 chain->u.ip->depth = 0;
2075         }
2076
2077         /*
2078          * If retain is 0 the deletion is permanent.  Because the chain is
2079          * no longer connected to the topology a flush will have no
2080          * visibility into it.  We must dispose of the references related
2081          * to the MODIFIED and MOVED flags, otherwise the ref count will
2082          * never transition to 0.
2083          *
2084          * If retain is non-zero the deleted element is likely an inode
2085          * which the vnops frontend will mark DESTROYED and flush.  In that
2086          * situation we must retain the flags for any open file descriptors
2087          * on the (removed) inode.  The final close will destroy the
2088          * disconnected chain.
2089          */
2090         if (retain == 0) {
2091                 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
2092                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
2093                         hammer2_chain_drop(hmp, chain);
2094                 }
2095                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2096                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2097                         hammer2_chain_drop(hmp, chain);
2098                 }
2099         }
2100
2101         /*
2102          * The chain is still likely referenced, possibly even by a vnode
2103          * (if an inode), so defer further action until the chain gets
2104          * dropped.
2105          */
2106 }
2107
2108 /*
2109  * Recursively flush the specified chain.  The chain is locked and
2110  * referenced by the caller and will remain so on return.  The chain
2111  * will remain referenced throughout but can temporarily lose its
2112  * lock during the recursion to avoid unnecessarily stalling user
2113  * processes.
2114  *
2115  *
2116  */
2117 TAILQ_HEAD(flush_deferral_list, hammer2_chain);
2118
2119 struct hammer2_flush_info {
2120         struct flush_deferral_list flush_list;
2121         int             depth;
2122         hammer2_tid_t   modify_tid;
2123 };
2124
2125 typedef struct hammer2_flush_info hammer2_flush_info_t;
2126
2127 static void
2128 hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
2129                           hammer2_flush_info_t *info)
2130 {
2131         hammer2_blockref_t *bref;
2132         hammer2_off_t pbase;
2133         size_t bbytes;
2134         size_t boff;
2135         char *bdata;
2136         struct buf *bp;
2137         int error;
2138         int wasmodified;
2139
2140         /*
2141          * If we hit the stack recursion depth limit defer the operation.
2142          * The controller of the info structure will execute the deferral
2143          * list and then retry.
2144          *
2145          * This is only applicable if SUBMODIFIED is set.  After a reflush
2146          * SUBMODIFIED will probably be cleared and we want to drop through
2147          * to finish processing the current element so our direct parent
2148          * can process the results.
2149          */
2150         if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT &&
2151             (chain->flags & HAMMER2_CHAIN_SUBMODIFIED)) {
2152                 if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
2153                         hammer2_chain_ref(hmp, chain);
2154                         TAILQ_INSERT_TAIL(&info->flush_list,
2155                                           chain, flush_node);
2156                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
2157                 }
2158                 return;
2159         }
2160
2161         if (hammer2_debug & 0x0008)
2162                 kprintf("%*.*sCHAIN type=%d@%08jx %p/%d %04x {\n",
2163                         info->depth, info->depth, "",
2164                         chain->bref.type, chain->bref.data_off,
2165                         chain, chain->refs, chain->flags);
2166
2167         /*
2168          * If SUBMODIFIED is set we recurse the flush and adjust the
2169          * blockrefs accordingly.
2170          *
2171          * NOTE: Looping on SUBMODIFIED can prevent a flush from ever
2172          *       finishing in the face of filesystem activity.
2173          */
2174         if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
2175                 hammer2_chain_t *child;
2176                 hammer2_chain_t *next;
2177                 hammer2_blockref_t *base;
2178                 int count;
2179
2180                 /*
2181                  * Clear SUBMODIFIED to catch races.  Note that if any
2182                  * child has to be flushed SUBMODIFIED will wind up being
2183                  * set again (for next time), but this does not stop us from
2184                  * synchronizing block updates which occurred.
2185                  *
2186                  * We don't want to set our chain to MODIFIED gratuitously.
2187                  */
2188                 /* XXX SUBMODIFIED not interlocked, can race */
2189                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2190
2191                 /*
2192                  * Flush the children and update the blockrefs in the chain.
2193                  * Be careful of ripouts during the loop.
2194                  */
2195                 next = SPLAY_MIN(hammer2_chain_splay, &chain->shead);
2196                 if (next)
2197                         hammer2_chain_ref(hmp, next);
2198                 while ((child = next) != NULL) {
2199                         next = SPLAY_NEXT(hammer2_chain_splay,
2200                                           &chain->shead, child);
2201                         if (next)
2202                                 hammer2_chain_ref(hmp, next);
2203                         /*
2204                          * We only recurse if SUBMODIFIED (internal node)
2205                          * or MODIFIED (internal node or leaf) is set.
2206                          * However, we must still track whether any MOVED
2207                          * entries are present to determine if the chain's
2208                          * blockref's need updating or not.
2209                          */
2210                         if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2211                                              HAMMER2_CHAIN_MODIFIED |
2212                                             HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2213                                 hammer2_chain_drop(hmp, child);
2214                                 continue;
2215                         }
2216                         hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_MAYBE);
2217                         hammer2_chain_drop(hmp, child);
2218                         if (child->parent != chain ||
2219                             (child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2220                                              HAMMER2_CHAIN_MODIFIED |
2221                                             HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2222                                 hammer2_chain_unlock(hmp, child);
2223                                 continue;
2224                         }
2225
2226                         /*
2227                          * Propagate the DESTROYED flag if found set, then
2228                          * recurse the flush.
2229                          */
2230                         if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
2231                             (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
2232                                 atomic_set_int(&child->flags,
2233                                                HAMMER2_CHAIN_DESTROYED |
2234                                                HAMMER2_CHAIN_SUBMODIFIED);
2235                         }
2236                         ++info->depth;
2237                         hammer2_chain_flush_pass1(hmp, child, info);
2238                         --info->depth;
2239                         hammer2_chain_unlock(hmp, child);
2240                 }
2241
2242                 /*
2243                  * Now synchronize any block updates.
2244                  */
2245                 next = SPLAY_MIN(hammer2_chain_splay, &chain->shead);
2246                 if (next)
2247                         hammer2_chain_ref(hmp, next);
2248                 while ((child = next) != NULL) {
2249                         next = SPLAY_NEXT(hammer2_chain_splay,
2250                                           &chain->shead, child);
2251                         if (next)
2252                                 hammer2_chain_ref(hmp, next);
2253                         if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
2254                                 hammer2_chain_drop(hmp, child);
2255                                 continue;
2256                         }
2257                         hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_NEVER);
2258                         hammer2_chain_drop(hmp, child);
2259                         if (child->parent != chain ||
2260                             (child->flags & HAMMER2_CHAIN_MOVED) == 0) {
2261                                 hammer2_chain_unlock(hmp, child);
2262                                 continue;
2263                         }
2264
2265                         hammer2_chain_modify(hmp, chain,
2266                                              HAMMER2_MODIFY_NO_MODIFY_TID);
2267
2268                         switch(chain->bref.type) {
2269                         case HAMMER2_BREF_TYPE_INODE:
2270                                 KKASSERT((chain->data->ipdata.op_flags &
2271                                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
2272                                 base = &chain->data->ipdata.u.blockset.
2273                                         blockref[0];
2274                                 count = HAMMER2_SET_COUNT;
2275                                 break;
2276                         case HAMMER2_BREF_TYPE_INDIRECT:
2277                                 base = &chain->data->npdata.blockref[0];
2278                                 count = chain->bytes /
2279                                         sizeof(hammer2_blockref_t);
2280                                 break;
2281                         case HAMMER2_BREF_TYPE_VOLUME:
2282                                 base = &hmp->voldata.sroot_blockset.blockref[0];
2283                                 count = HAMMER2_SET_COUNT;
2284                                 break;
2285                         default:
2286                                 base = NULL;
2287                                 panic("hammer2_chain_get: "
2288                                       "unrecognized blockref type: %d",
2289                                       chain->bref.type);
2290                         }
2291
2292                         KKASSERT(child->index >= 0);
2293                         base[child->index] = child->bref_flush;
2294
2295                         if (chain->bref.mirror_tid <
2296                             child->bref_flush.mirror_tid) {
2297                                 chain->bref.mirror_tid =
2298                                         child->bref_flush.mirror_tid;
2299                         }
2300
2301                         if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME &&
2302                             hmp->voldata.mirror_tid <
2303                             child->bref_flush.mirror_tid) {
2304                                 hmp->voldata.mirror_tid =
2305                                         child->bref_flush.mirror_tid;
2306                         }
2307                         atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
2308                         hammer2_chain_drop(hmp, child); /* MOVED flag */
2309                         hammer2_chain_unlock(hmp, child);
2310                 }
2311         }
2312
2313         /*
2314          * If destroying the object we unconditonally clear the MODIFIED
2315          * and MOVED bits, and we destroy the buffer without writing it
2316          * out.
2317          *
2318          * We don't bother updating the hash/crc or the chain bref.
2319          *
2320          * NOTE: The destroy'd object's bref has already been updated.
2321          *       so we can clear MOVED without propagating mirror_tid
2322          *       or modify_tid upward.
2323          *
2324          * XXX allocations for unflushed data can be returned to the
2325          *     free pool.
2326          */
2327         if (chain->flags & HAMMER2_CHAIN_DESTROYED) {
2328                 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
2329                         if (chain->bp) {
2330                                 chain->bp->b_flags |= B_INVAL|B_RELBUF;
2331                         }
2332                         atomic_clear_int(&chain->flags,
2333                                          HAMMER2_CHAIN_MODIFIED |
2334                                          HAMMER2_CHAIN_MODIFY_TID);
2335                         hammer2_chain_drop(hmp, chain);
2336                 }
2337                 if (chain->flags & HAMMER2_CHAIN_MODIFIED_AUX) {
2338                         atomic_clear_int(&chain->flags,
2339                                          HAMMER2_CHAIN_MODIFIED_AUX);
2340                 }
2341                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2342                         atomic_clear_int(&chain->flags,
2343                                          HAMMER2_CHAIN_MOVED);
2344                         hammer2_chain_drop(hmp, chain);
2345                 }
2346                 return;
2347         }
2348
2349         /*
2350          * Flush this chain entry only if it is marked modified.
2351          */
2352         if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
2353                              HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2354                 goto done;
2355         }
2356
2357         /*
2358          * Synchronize cumulative data and inode count adjustments to
2359          * the inode and propagate the deltas upward to the parent.
2360          */
2361         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2362                 hammer2_inode_t *ip;
2363
2364                 ip = chain->u.ip;
2365                 ip->ip_data.inode_count += ip->delta_icount;
2366                 ip->ip_data.data_count += ip->delta_dcount;
2367                 if (ip->pip) {
2368                         ip->pip->delta_icount += ip->delta_icount;
2369                         ip->pip->delta_dcount += ip->delta_dcount;
2370                 }
2371                 ip->delta_icount = 0;
2372                 ip->delta_dcount = 0;
2373         }
2374
2375         /*
2376          * Flush if MODIFIED or MODIFIED_AUX is set.  MODIFIED_AUX is only
2377          * used by the volume header (&hmp->vchain).
2378          */
2379         if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
2380                              HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2381                 goto done;
2382         }
2383         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED_AUX);
2384
2385         /*
2386          * Clear MODIFIED and set HAMMER2_CHAIN_MOVED.  The caller
2387          * will re-test the MOVED bit.  We must also update the mirror_tid
2388          * and modify_tid fields as appropriate.
2389          *
2390          * bits own a single chain ref and the MOVED bit owns its own
2391          * chain ref.
2392          */
2393         chain->bref.mirror_tid = info->modify_tid;
2394         if (chain->flags & HAMMER2_CHAIN_MODIFY_TID)
2395                 chain->bref.modify_tid = info->modify_tid;
2396         wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0;
2397         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
2398                                         HAMMER2_CHAIN_MODIFY_TID);
2399
2400         if (chain->flags & HAMMER2_CHAIN_MOVED) {
2401                 /*
2402                  * Drop the ref from the MODIFIED bit we cleared.
2403                  */
2404                 if (wasmodified)
2405                         hammer2_chain_drop(hmp, chain);
2406         } else {
2407                 /*
2408                  * If we were MODIFIED we inherit the ref from clearing
2409                  * that bit, otherwise we need another ref.
2410                  */
2411                 if (wasmodified == 0)
2412                         hammer2_chain_ref(hmp, chain);
2413                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2414         }
2415         chain->bref_flush = chain->bref;
2416
2417         /*
2418          * If this is part of a recursive flush we can go ahead and write
2419          * out the buffer cache buffer and pass a new bref back up the chain.
2420          *
2421          * This will never be a volume header.
2422          */
2423         switch(chain->bref.type) {
2424         case HAMMER2_BREF_TYPE_VOLUME:
2425                 /*
2426                  * The volume header is flushed manually by the syncer, not
2427                  * here.
2428                  */
2429                 break;
2430         case HAMMER2_BREF_TYPE_DATA:
2431                 /*
2432                  * Data elements have already been flushed via the logical
2433                  * file buffer cache.  Their hash was set in the bref by
2434                  * the vop_write code.
2435                  *
2436                  * Make sure the buffer(s) have been flushed out here.
2437                  */
2438 #if 1
2439                 bbytes = chain->bytes;
2440                 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
2441                 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
2442
2443                 bp = getblk(hmp->devvp, pbase, bbytes, GETBLK_NOWAIT, 0);
2444                 if (bp) {
2445                         if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
2446                             (B_CACHE | B_DIRTY)) {
2447                                 kprintf("x");
2448                                 cluster_awrite(bp);
2449                         } else {
2450                                 bp->b_flags |= B_RELBUF;
2451                                 brelse(bp);
2452                         }
2453                 }
2454 #endif
2455                 break;
2456         case HAMMER2_BREF_TYPE_INDIRECT:
2457                 /*
2458                  * Indirect blocks may be in an INITIAL state.
2459                  */
2460                 break;
2461         default:
2462                 /*
2463                  * Embedded elements have to be flushed out.
2464                  */
2465                 KKASSERT(chain->data != NULL);
2466                 bref = &chain->bref;
2467
2468                 KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
2469
2470                 if (chain->bp == NULL) {
2471                         /*
2472                          * The data is embedded, we have to acquire the
2473                          * buffer cache buffer and copy the data into it.
2474                          */
2475                         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
2476                                 bbytes = HAMMER2_MINIOSIZE;
2477                         pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
2478                         boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
2479
2480                         /*
2481                          * The getblk() optimization can only be used if the
2482                          * physical block size matches the request.
2483                          */
2484                         if (chain->bytes == bbytes) {
2485                                 bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
2486                                 error = 0;
2487                         } else {
2488                                 error = bread(hmp->devvp, pbase, bbytes, &bp);
2489                                 KKASSERT(error == 0);
2490                         }
2491                         bdata = (char *)bp->b_data + boff;
2492
2493                         /*
2494                          * Copy the data to the buffer, mark the buffer
2495                          * dirty, and convert the chain to unmodified.
2496                          */
2497                         bcopy(chain->data, bdata, chain->bytes);
2498                         bp->b_flags |= B_CLUSTEROK;
2499                         bdwrite(bp);
2500                         bp = NULL;
2501                         chain->bref.check.iscsi32.value =
2502                                 hammer2_icrc32(chain->data, chain->bytes);
2503                         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
2504                                 ++hammer2_iod_meta_write;
2505                         else
2506                                 ++hammer2_iod_indr_write;
2507                 } else {
2508                         chain->bref.check.iscsi32.value =
2509                                 hammer2_icrc32(chain->data, chain->bytes);
2510                 }
2511         }
2512
2513         /*
2514          * Adjustments to the bref.  The caller will use this to adjust
2515          * our chain's pointer to this chain element.
2516          */
2517         bref = &chain->bref;
2518
2519         switch(bref->type) {
2520         case HAMMER2_BREF_TYPE_VOLUME:
2521                 KKASSERT(chain->data != NULL);
2522                 KKASSERT(chain->bp == NULL);
2523
2524                 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
2525                         hammer2_icrc32(
2526                                 (char *)&hmp->voldata +
2527                                  HAMMER2_VOLUME_ICRC1_OFF,
2528                                 HAMMER2_VOLUME_ICRC1_SIZE);
2529                 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
2530                         hammer2_icrc32(
2531                                 (char *)&hmp->voldata +
2532                                  HAMMER2_VOLUME_ICRC0_OFF,
2533                                 HAMMER2_VOLUME_ICRC0_SIZE);
2534                 hmp->voldata.icrc_volheader =
2535                         hammer2_icrc32(
2536                                 (char *)&hmp->voldata +
2537                                  HAMMER2_VOLUME_ICRCVH_OFF,
2538                                 HAMMER2_VOLUME_ICRCVH_SIZE);
2539                 break;
2540         default:
2541                 break;
2542
2543         }
2544 done:
2545         if (hammer2_debug & 0x0008) {
2546                 kprintf("%*.*s} %p/%d %04x ",
2547                         info->depth, info->depth, "",
2548                         chain, chain->refs, chain->flags);
2549         }
2550 }
2551
2552 #if 0
2553 /*
2554  * PASS2 - not yet implemented (should be called only with the root chain?)
2555  */
2556 static void
2557 hammer2_chain_flush_pass2(hammer2_mount_t *hmp, hammer2_chain_t *chain)
2558 {
2559 }
2560 #endif
2561
2562 /*
2563  * Stand-alone flush.  If the chain is unable to completely flush we have
2564  * to be sure that SUBMODIFIED propagates up the parent chain.  We must not
2565  * clear the MOVED bit after flushing in this situation or our desynchronized
2566  * bref will not properly update in the parent.
2567  *
2568  * This routine can be called from several places but the most important
2569  * is from the hammer2_vop_reclaim() function.  We want to try to completely
2570  * clean out the inode structure to prevent disconnected inodes from
2571  * building up and blowing out the kmalloc pool.
2572  *
2573  * If modify_tid is 0 (usual case), a new modify_tid is allocated and
2574  * applied to the flush.  The depth-limit handling code is the only
2575  * code which passes a non-zero modify_tid to hammer2_chain_flush().
2576  */
2577 void
2578 hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
2579                     hammer2_tid_t modify_tid)
2580 {
2581         hammer2_chain_t *parent;
2582         hammer2_chain_t *scan;
2583         hammer2_blockref_t *base;
2584         hammer2_flush_info_t info;
2585         int count;
2586         int reflush;
2587
2588         /*
2589          * Execute the recursive flush and handle deferrals.
2590          *
2591          * Chains can be ridiculously long (thousands deep), so to
2592          * avoid blowing out the kernel stack the recursive flush has a
2593          * depth limit.  Elements at the limit are placed on a list
2594          * for re-execution after the stack has been popped.
2595          */
2596         bzero(&info, sizeof(info));
2597         TAILQ_INIT(&info.flush_list);
2598
2599         if (modify_tid == 0) {
2600                 hammer2_voldata_lock(hmp);
2601                 info.modify_tid = hmp->voldata.alloc_tid++;
2602                 atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
2603                 hammer2_voldata_unlock(hmp);
2604         } else {
2605                 info.modify_tid = modify_tid;
2606         }
2607         reflush = 1;
2608
2609         while (reflush) {
2610                 /*
2611                  * Primary recursion
2612                  */
2613                 hammer2_chain_flush_pass1(hmp, chain, &info);
2614                 reflush = 0;
2615
2616                 while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
2617                         /*
2618                          * Secondary recursion.  Note that a reference is
2619                          * retained from the element's presence on the
2620                          * deferral list.
2621                          */
2622                         KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
2623                         TAILQ_REMOVE(&info.flush_list, scan, flush_node);
2624                         atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);
2625
2626                         /*
2627                          * Now that we've popped back up we can do a secondary
2628                          * recursion on the deferred elements.
2629                          */
2630                         if (hammer2_debug & 0x0040)
2631                                 kprintf("defered flush %p\n", scan);
2632                         hammer2_chain_lock(hmp, scan, HAMMER2_RESOLVE_MAYBE);
2633                         hammer2_chain_flush(hmp, scan, info.modify_tid);
2634                         hammer2_chain_unlock(hmp, scan);
2635
2636                         /*
2637                          * Only flag a reflush if SUBMODIFIED is no longer
2638                          * set.  If SUBMODIFIED is set the element will just
2639                          * wind up on our flush_list again.
2640                          */
2641                         if ((scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2642                                             HAMMER2_CHAIN_MODIFIED |
2643                                             HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2644                                 reflush = 1;
2645                         }
2646                         hammer2_chain_drop(hmp, scan);
2647                 }
2648                 if ((hammer2_debug & 0x0040) && reflush)
2649                         kprintf("reflush %p\n", chain);
2650         }
2651
2652         /*
2653          * The SUBMODIFIED bit must propagate upward if the chain could not
2654          * be completely flushed.
2655          */
2656         if (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2657                             HAMMER2_CHAIN_MODIFIED |
2658                             HAMMER2_CHAIN_MODIFIED_AUX |
2659                             HAMMER2_CHAIN_MOVED)) {
2660                 hammer2_chain_parent_setsubmod(hmp, chain);
2661         }
2662
2663         /*
2664          * If the only thing left is a simple bref update try to
2665          * pro-actively update the parent, otherwise return early.
2666          */
2667         parent = chain->parent;
2668         if (parent == NULL) {
2669                 return;
2670         }
2671         if (chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
2672             (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2673                              HAMMER2_CHAIN_MODIFIED |
2674                              HAMMER2_CHAIN_MODIFIED_AUX |
2675                              HAMMER2_CHAIN_MOVED)) != HAMMER2_CHAIN_MOVED) {
2676                 return;
2677         }
2678
2679         /*
2680          * We are locking backwards so allow the lock to fail
2681          */
2682         if (lockmgr(&parent->lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
2683                 return;
2684         }
2685
2686         /*
2687          * We are updating brefs but we have to call chain_modify()
2688          * because our caller is not being run from a recursive flush.
2689          *
2690          * This will also chain up the parent list and set the SUBMODIFIED
2691          * flag.
2692          *
2693          * We do not want to set HAMMER2_CHAIN_MODIFY_TID here because the
2694          * modification is only related to updating a bref in the parent.
2695          *
2696          * When updating the blockset embedded in the volume header we must
2697          * also update voldata.mirror_tid.
2698          */
2699         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
2700         hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2701
2702         switch(parent->bref.type) {
2703         case HAMMER2_BREF_TYPE_INODE:
2704                 base = &parent->data->ipdata.u.blockset.
2705                         blockref[0];
2706                 count = HAMMER2_SET_COUNT;
2707                 break;
2708         case HAMMER2_BREF_TYPE_INDIRECT:
2709                 base = &parent->data->npdata.blockref[0];
2710                 count = parent->bytes /
2711                         sizeof(hammer2_blockref_t);
2712                 break;
2713         case HAMMER2_BREF_TYPE_VOLUME:
2714                 base = &hmp->voldata.sroot_blockset.blockref[0];
2715                 count = HAMMER2_SET_COUNT;
2716                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2717                         if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
2718                                 hmp->voldata.mirror_tid =
2719                                         chain->bref.mirror_tid;
2720                         }
2721                 }
2722                 break;
2723         default:
2724                 base = NULL;
2725                 panic("hammer2_chain_flush: "
2726                       "unrecognized blockref type: %d",
2727                       parent->bref.type);
2728         }
2729
2730         /*
2731          * Update the blockref in the parent.  We do not have to set
2732          * MOVED in the parent because the parent has been marked modified,
2733          * so the flush sequence will pick up the bref change.
2734          *
2735          * We do have to propagate mirror_tid upward.
2736          */
2737         KKASSERT(chain->index >= 0 &&
2738                  chain->index < count);
2739         KKASSERT(chain->parent == parent);
2740         if (chain->flags & HAMMER2_CHAIN_MOVED) {
2741                 base[chain->index] = chain->bref_flush;
2742                 if (parent->bref.mirror_tid < chain->bref_flush.mirror_tid)
2743                         parent->bref.mirror_tid = chain->bref_flush.mirror_tid;
2744                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2745                 hammer2_chain_drop(hmp, chain);
2746         } else if (bcmp(&base[chain->index], &chain->bref_flush,
2747                    sizeof(chain->bref)) != 0) {
2748                 panic("hammer2: unflagged bref update(2)");
2749         }
2750
2751         lockmgr(&parent->lk, LK_RELEASE);       /* release manual lockmgr op */
2752         hammer2_chain_unlock(hmp, parent);
2753 }